#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

    "amdgpu-global-isel-new-legality",
    cl::desc("Use GlobalISel desired legality, rather than try to use "
             "rules compatible with selection patterns"),
unsigned Bits = Ty.getSizeInBits();

const LLT Ty = Query.Types[TypeIdx];
return Ty.getNumElements() % 2 != 0 &&
       EltSize > 1 && EltSize < 32 &&
       Ty.getSizeInBits() % 32 != 0;

const LLT Ty = Query.Types[TypeIdx];

const LLT Ty = Query.Types[TypeIdx];
return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;

const LLT Ty = Query.Types[TypeIdx];
return std::pair(TypeIdx,

const LLT Ty = Query.Types[TypeIdx];
unsigned Size = Ty.getSizeInBits();
unsigned Pieces = (Size + 63) / 64;
unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;

const LLT Ty = Query.Types[TypeIdx];
const int Size = Ty.getSizeInBits();
const int NextMul32 = (Size + 31) / 32;
const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;

unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
return std::make_pair(TypeIdx, LLT::scalar(MemSize));

const LLT Ty = Query.Types[TypeIdx];
const unsigned EltSize = Ty.getElementType().getSizeInBits();
assert(EltSize == 32 || EltSize == 64);
for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
return std::pair(TypeIdx,

const unsigned NumElems = Ty.getElementCount().getFixedValue();
const unsigned Size = Ty.getSizeInBits();

const LLT Ty = Query.Types[TypeIdx];

const LLT Ty = Query.Types[TypeIdx];
unsigned Size = Ty.getSizeInBits();

const LLT QueryTy = Query.Types[TypeIdx];
const LLT QueryTy = Query.Types[TypeIdx];
const LLT QueryTy = Query.Types[TypeIdx];

return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
return EltSize == 16 || EltSize % 32 == 0;

const int EltSize = Ty.getElementType().getSizeInBits();
return EltSize == 32 || EltSize == 64 ||
       (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
       EltSize == 128 || EltSize == 256;

LLT Ty = Query.Types[TypeIdx];
const LLT QueryTy = Query.Types[TypeIdx];

if (Ty.isPointerOrPointerVector())
  Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
(ST.useRealTrue16Insts() && Ty == S16) ||

const LLT Ty = Query.Types[TypeIdx];
return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
       Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();

unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

bool IsLoad, bool IsAtomic) {
return ST.hasFlatScratchEnabled() ? 128 : 32;
return ST.useDS128() ? 128 : 64;
return IsLoad ? 512 : 128;
return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
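// The checks that follow compare the register type against the memory type and
// the per-address-space width limit computed above to decide whether a
// load/store is usable as-is: extending vector accesses, over-wide accesses,
// dwordx3 accesses on subtargets without them, and under-aligned accesses all
// need further legalization.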
const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
unsigned RegSize = Ty.getSizeInBits();
unsigned AS = Query.Types[1].getAddressSpace();
if (Ty.isVector() && MemSize != RegSize)
if (IsLoad && MemSize < Size)
  MemSize = std::max(MemSize, Align);
if (!ST.hasDwordx3LoadStores())
if (AlignBits < MemSize) {
    Align(AlignBits / 8)))

const unsigned Size = Ty.getSizeInBits();
if (Ty.isPointerVector())
unsigned EltSize = Ty.getScalarSizeInBits();
return EltSize != 32 && EltSize != 64;

const unsigned Size = Ty.getSizeInBits();
if (Size != MemSizeInBits)
  return Size <= 32 && Ty.isVector();
return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&

uint64_t AlignInBits, unsigned AddrSpace,
if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
if (AlignInBits < RoundedSize)
    RoundedSize, AddrSpace, Align(AlignInBits / 8),

    Query.Types[1].getAddressSpace(), Opcode);

const unsigned NumParts = PointerTy.getSizeInBits() / 32;
std::array<Register, 4> VectorElems;
B.setInsertPt(B.getMBB(), ++B.getInsertPt());
for (unsigned I = 0; I < NumParts; ++I)
      B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
B.buildMergeValues(MO, VectorElems);

B.setInsertPt(B.getMBB(), ++B.getInsertPt());
auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
B.buildIntToPtr(MO, Scalar);

const unsigned NumParts = PointerTy.getSizeInBits() / 32;
auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
for (unsigned I = 0; I < NumParts; ++I)
return B.buildBuildVector(VectorTy, PointerParts).getReg(0);

Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
return B.buildBitcast(VectorTy, Scalar).getReg(0);
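// In the constructor that follows, pointer LLTs for each address space are
// materialized first; the getActionDefinitionsBuilder calls then describe, per
// opcode and per subtarget feature, which types are legal, widened,
// scalarized, or custom-lowered.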
auto GetAddrSpacePtr = [&TM](unsigned AS) {

const LLT BufferStridedPtr =
const LLT CodePtr = FlatPtr;

const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr

const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr

const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};

const std::initializer_list<LLT> FPTypesBase = {
const std::initializer_list<LLT> FPTypes16 = {
const std::initializer_list<LLT> FPTypesPK16 = {

const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) {
  if (ST.hasScalarAddSub64()) {
      .clampMaxNumElementsStrict(0, S16, 2)
      .clampMaxNumElementsStrict(0, S16, 2)
  if (ST.hasScalarSMulU64()) {
      .clampMaxNumElementsStrict(0, S16, 2)
      .clampMaxNumElementsStrict(0, S16, 2)
      .minScalarOrElt(0, S16)
} else if (ST.has16BitInsts()) {

    .widenScalarToNextMultipleOf(0, 32)
if (ST.hasMad64_32())
if (ST.hasIntClamp()) {

    {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
if (ST.hasVOP3PInsts()) {
    .clampMaxNumElements(0, S8, 2)

    {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .clampScalar(0, S16, S64);

    {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
     G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
if (ST.has16BitInsts()) {
  if (ST.hasVOP3PInsts())
    FPOpActions.legalFor({S16});
  TrigActions.customFor({S16});
  FDIVActions.customFor({S16});
if (ST.hasPackedFP32Ops()) {
  FPOpActions.legalFor({V2S32});
  FPOpActions.clampMaxNumElementsStrict(0, S32, 2);

auto &MinNumMaxNumIeee =
if (ST.hasVOP3PInsts()) {
  MinNumMaxNumIeee.legalFor(FPTypesPK16)
      .clampMaxNumElements(0, S16, 2)
} else if (ST.has16BitInsts()) {
  MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0);
  MinNumMaxNumIeee.legalFor(FPTypesBase)

    {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
if (ST.hasVOP3PInsts()) {
  MinNumMaxNum.customFor(FPTypesPK16)
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
} else if (ST.has16BitInsts()) {
  MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
  MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)

if (ST.hasVOP3PInsts())
    .legalFor(ST.hasPackedFP32Ops(), {V2S32})
if (ST.hasPackedFP32Ops())
if (ST.has16BitInsts()) {
if (ST.hasFractBug()) {
if (ST.hasCvtPkF16F32Inst()) {
    .clampMaxNumElements(0, S16, 2);
FPTruncActions.scalarize(0).lower();
if (ST.has16BitInsts()) {
if (ST.hasPackedFP32Ops())
if (ST.hasMadF16() && ST.hasMadMacF32Insts())
  FMad.customFor({S32, S16});
else if (ST.hasMadMacF32Insts())
  FMad.customFor({S32});
else if (ST.hasMadF16())
  FMad.customFor({S16});
if (ST.has16BitInsts()) {
  FRem.minScalar(0, S32)
    .clampMaxNumElements(0, S16, 2)
if (ST.has16BitInsts())
if (ST.has16BitInsts())
if (ST.has16BitInsts())
    .clampScalar(0, S16, S64)
    .clampScalar(0, S16, S64)
if (ST.has16BitInsts()) {
    {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
    .clampScalar(0, S16, S64)
    {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
    .clampScalar(0, S32, S64)
    {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
    .clampScalar(0, S32, S64)
    .scalarSameSizeAs(1, 0)

    {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
        {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
if (ST.has16BitInsts()) {
  CmpBuilder.legalFor({{S1, S16}});

    {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
if (ST.hasSALUFloatInsts())
if (ST.has16BitInsts())
  ExpOps.customFor({{S32}, {S16}});
  ExpOps.customFor({S32});
ExpOps.clampScalar(0, MinScalarFPTy, S32)

    .legalFor(ST.has16BitInsts(), {S16})
    .legalFor(ST.has16BitInsts(), {S16})
    .clampScalar(0, S32, S32)
if (ST.has16BitInsts())
    .widenScalarToNextPow2(1)
    .lowerFor({S1, S16})
    .widenScalarToNextPow2(1)
    .clampScalar(0, S32, S32)
    .clampScalar(0, S32, S64)
if (ST.has16BitInsts()) {
    .clampMaxNumElementsStrict(0, S16, 2)
if (ST.hasVOP3PInsts()) {
    .clampMaxNumElements(0, S16, 2)
if (ST.hasIntMinMax64()) {
    .clampMaxNumElements(0, S16, 2)
    .clampMaxNumElements(0, S16, 2)
    .widenScalarToNextPow2(0)

    .legalForCartesianProduct(AddrSpaces32, {S32})
    .legalForCartesianProduct(AddrSpaces32, {S32})

const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                  bool IsLoad) -> bool {
  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  unsigned NumRegs = (MemSize + 31) / 32;
  if (!ST.hasDwordx3LoadStores())

unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

for (unsigned Op : {G_LOAD, G_STORE}) {
  const bool IsStore = Op == G_STORE;
  Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                    {S64, GlobalPtr, S64, GlobalAlign32},
                                    {S32, GlobalPtr, S8, GlobalAlign8},
                                    {S32, GlobalPtr, S16, GlobalAlign16},
                                    {S32, LocalPtr, S32, 32},
                                    {S64, LocalPtr, S64, 32},
                                    {S32, LocalPtr, S8, 8},
                                    {S32, LocalPtr, S16, 16},
                                    {S32, PrivatePtr, S32, 32},
                                    {S32, PrivatePtr, S8, 8},
                                    {S32, PrivatePtr, S16, 16},
                                    {S32, ConstantPtr, S32, GlobalAlign32},
                                    {S64, ConstantPtr, S64, GlobalAlign32},
                                    {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
  Actions.unsupportedIf(
      typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));

  Actions.customIf(typeIs(1, Constant32Ptr));

        return !Query.Types[0].isVector() &&
               needToSplitMemOp(Query, Op == G_LOAD);
      [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
        unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
        if (DstSize > MemSize)
        if (MemSize > MaxSize)

        return Query.Types[0].isVector() &&
               needToSplitMemOp(Query, Op == G_LOAD);
      [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
        unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
        if (MemSize > MaxSize) {
          if (MaxSize % EltSize == 0) {
          unsigned NumPieces = MemSize / MaxSize;
          if (NumPieces == 1 || NumPieces >= NumElts ||
              NumElts % NumPieces != 0)
            return std::pair(0, EltTy);
          return std::pair(0, EltTy);
        return std::pair(0, EltTy);
      .widenScalarToNextPow2(0)

    .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                               {S32, GlobalPtr, S16, 2 * 8},
                               {S32, LocalPtr, S8, 8},
                               {S32, LocalPtr, S16, 16},
                               {S32, PrivatePtr, S8, 8},
                               {S32, PrivatePtr, S16, 16},
                               {S32, ConstantPtr, S8, 8},
                               {S32, ConstantPtr, S16, 2 * 8}})
if (ST.hasFlatAddressSpace()) {
  ExtLoads.legalForTypesWithMemDesc(
      {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});

    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
if (ST.hasFlatAddressSpace()) {
  Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});

    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
if (ST.hasFlatAddressSpace()) {
  Atomics32.legalFor({{S32, FlatPtr}});
if (ST.hasLDSFPAtomicAddF32()) {
  Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
  if (ST.hasLdsAtomicAddF64())
    Atomic.legalFor({{S64, LocalPtr}});
  if (ST.hasAtomicDsPkAdd16Insts())
    Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
if (ST.hasAtomicFaddInsts())
  Atomic.legalFor({{S32, GlobalPtr}});
if (ST.hasFlatAtomicFaddF32Inst())
  Atomic.legalFor({{S32, FlatPtr}});
if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {
if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
    ST.hasAtomicBufferGlobalPkAddF16Insts())
  Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
if (ST.hasAtomicGlobalPkAddBF16Inst())
  Atomic.legalFor({{V2BF16, GlobalPtr}});
if (ST.hasAtomicFlatPkAdd16Insts())
  Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});

auto &AtomicFMinFMax =
    .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
if (ST.hasAtomicFMinFMaxF32GlobalInsts())
if (ST.hasAtomicFMinFMaxF64GlobalInsts())
  AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
if (ST.hasAtomicFMinFMaxF32FlatInsts())
if (ST.hasAtomicFMinFMaxF64FlatInsts())

    {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});

    LocalPtr, FlatPtr, PrivatePtr,
    .clampScalar(0, S16, S64)

if (ST.has16BitInsts()) {
  if (ST.hasVOP3PInsts()) {
      .clampMaxNumElements(0, S16, 2);
    Shifts.legalFor({{S16, S16}});

  Shifts.widenScalarIf(
        const LLT AmountTy = Query.Types[1];
  Shifts.clampScalar(1, S32, S32);
  Shifts.widenScalarToNextPow2(0, 16);
  Shifts.clampScalar(0, S16, S64);
  Shifts.clampScalar(1, S32, S32);
  Shifts.widenScalarToNextPow2(0, 32);
  Shifts.clampScalar(0, S32, S64);
for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
  unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
  unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
  unsigned IdxTypeIdx = 2;

        const LLT EltTy = Query.Types[EltTypeIdx];
        const LLT VecTy = Query.Types[VecTypeIdx];
        const LLT IdxTy = Query.Types[IdxTypeIdx];
        const bool isLegalVecType =
        return (EltSize == 32 || EltSize == 64) &&

        const LLT EltTy = Query.Types[EltTypeIdx];
        const LLT VecTy = Query.Types[VecTypeIdx];
        const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
        return std::pair(VecTypeIdx,
      .clampScalar(EltTypeIdx, S32, S64)

        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;

for (unsigned Op : {G_EXTRACT, G_INSERT}) {
  unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
  unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

        const LLT BigTy = Query.Types[BigTyIdx];
        const LLT LitTy = Query.Types[LitTyIdx];

        const LLT BigTy = Query.Types[BigTyIdx];
        const LLT LitTy = Query.Types[LitTyIdx];

if (ST.hasScalarPackInsts()) {
      .minScalarOrElt(0, S16)
  BuildVector.customFor({V2S16, S16});
  BuildVector.minScalarOrElt(0, S32);

for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
  unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
  unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

  auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
    const LLT Ty = Query.Types[TypeIdx];
    if (Ty.isVector()) {

        const LLT BigTy = Query.Types[BigTyIdx];
        return notValidElt(Query, LitTyIdx);
        return notValidElt(Query, BigTyIdx);

  if (Op == G_MERGE_VALUES) {
    Builder.widenScalarIf(
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;

        const LLT Ty = Query.Types[BigTyIdx];
        return Ty.getSizeInBits() % 16 != 0;

        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));

    .clampScalar(0, S32, S64);
if (ST.hasVOP3PInsts()) {
  SextInReg.lowerFor({{V2S16}})
      .clampMaxNumElementsStrict(0, S16, 2);
} else if (ST.has16BitInsts()) {
  SextInReg.lowerFor({{S32}, {S64}, {S16}});
  SextInReg.lowerFor({{S32}, {S64}});

FSHRActionDefs.legalFor({{S32, S32}})
    .clampMaxNumElementsStrict(0, S16, 2);
if (ST.hasVOP3PInsts())
  FSHRActionDefs.scalarize(0).lower();

if (ST.hasVOP3PInsts()) {
    .clampMaxNumElementsStrict(0, S16, 2)
    .clampScalar(1, S32, S32)

    G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
    G_READ_REGISTER, G_WRITE_REGISTER,

if (ST.hasIEEEMinimumMaximumInsts()) {
    .legalFor(FPTypesPK16)
} else if (ST.hasVOP3PInsts()) {
    .clampMaxNumElementsStrict(0, S16, 2)

    G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
    G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})

    {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
     G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
     G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
     G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})

verify(*ST.getInstrInfo());
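// The custom-lowering dispatcher below switches on the generic opcode and
// forwards to the dedicated lowering helpers implemented later in this file.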
switch (MI.getOpcode()) {
case TargetOpcode::G_ADDRSPACE_CAST:
case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
case TargetOpcode::G_FCEIL:
case TargetOpcode::G_FREM:
case TargetOpcode::G_INTRINSIC_TRUNC:
case TargetOpcode::G_SITOFP:
case TargetOpcode::G_UITOFP:
case TargetOpcode::G_FPTOSI:
case TargetOpcode::G_FPTOUI:
case TargetOpcode::G_FMINNUM:
case TargetOpcode::G_FMAXNUM:
case TargetOpcode::G_FMINIMUMNUM:
case TargetOpcode::G_FMAXIMUMNUM:
case TargetOpcode::G_EXTRACT:
case TargetOpcode::G_INSERT:
case TargetOpcode::G_EXTRACT_VECTOR_ELT:
case TargetOpcode::G_INSERT_VECTOR_ELT:
case TargetOpcode::G_FSIN:
case TargetOpcode::G_FCOS:
case TargetOpcode::G_GLOBAL_VALUE:
case TargetOpcode::G_LOAD:
case TargetOpcode::G_SEXTLOAD:
case TargetOpcode::G_ZEXTLOAD:
case TargetOpcode::G_STORE:
case TargetOpcode::G_FMAD:
case TargetOpcode::G_FDIV:
case TargetOpcode::G_FFREXP:
case TargetOpcode::G_FSQRT:
case TargetOpcode::G_UDIV:
case TargetOpcode::G_UREM:
case TargetOpcode::G_UDIVREM:
case TargetOpcode::G_SDIV:
case TargetOpcode::G_SREM:
case TargetOpcode::G_SDIVREM:
case TargetOpcode::G_ATOMIC_CMPXCHG:
case TargetOpcode::G_FLOG2:
case TargetOpcode::G_FLOG:
case TargetOpcode::G_FLOG10:
case TargetOpcode::G_FEXP2:
case TargetOpcode::G_FEXP:
case TargetOpcode::G_FEXP10:
case TargetOpcode::G_FPOW:
case TargetOpcode::G_FFLOOR:
case TargetOpcode::G_BUILD_VECTOR:
case TargetOpcode::G_BUILD_VECTOR_TRUNC:
case TargetOpcode::G_MUL:
case TargetOpcode::G_CTLZ:
case TargetOpcode::G_CTTZ:
case TargetOpcode::G_CTLS:
case TargetOpcode::G_CTLZ_ZERO_POISON:
case TargetOpcode::G_STACKSAVE:
case TargetOpcode::G_GET_FPENV:
case TargetOpcode::G_SET_FPENV:
case TargetOpcode::G_TRAP:
case TargetOpcode::G_DEBUGTRAP:

if (ST.hasApertureRegs()) {
      ? AMDGPU::SRC_SHARED_BASE
      : AMDGPU::SRC_PRIVATE_BASE;
  assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
          !ST.hasGloballyAddressableScratch()) &&
         "Cannot use src_private_base with globally addressable scratch!");
  B.buildCopy({Dst}, {Register(ApertureRegNo)});
  return B.buildUnmerge(S32, Dst).getReg(1);

ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);

B.buildObjectPtrOffset(
    B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
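// Pointers produced by a frame index, global value, or block address can never
// be null, and a constant is non-null unless it matches the address space's
// null value, so the null-pointer check in an address-space cast can be
// skipped for such sources.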
switch (Def->getOpcode()) {
case AMDGPU::G_FRAME_INDEX:
case AMDGPU::G_GLOBAL_VALUE:
case AMDGPU::G_BLOCK_ADDR:
case AMDGPU::G_CONSTANT: {
  const ConstantInt *CI = Def->getOperand(1).getCImm();

assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
           Intrinsic::amdgcn_addrspacecast_nonnull));
    : MI.getOperand(1).getReg();
unsigned SrcAS = SrcTy.getAddressSpace();

MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));

auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
      ST.hasGloballyAddressableScratch()) {
    Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
        B.buildInstr(AMDGPU::S_MOV_B32, {S32},
                     {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
    MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
    return B.buildIntToPtr(Dst, Sub).getReg(0);
  return B.buildExtract(Dst, Src, 0).getReg(0);

castFlatToLocalOrPrivate(Dst);
MI.eraseFromParent();

auto SegmentNull = B.buildConstant(DstTy, NullVal);
auto FlatNull = B.buildConstant(SrcTy, 0);
auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
MI.eraseFromParent();

auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
      ST.hasGloballyAddressableScratch()) {
    ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
    if (ST.isWave64()) {
      ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
        B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
    Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
        B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
        B.buildInstr(AMDGPU::S_MOV_B64, {S64},
                     {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
    MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
    return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
  return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);

castLocalOrPrivateToFlat(Dst);
MI.eraseFromParent();

Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
    SegmentNull.getReg(0));
B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
MI.eraseFromParent();

    SrcTy.getSizeInBits() == 64) {
  B.buildExtract(Dst, Src, 0);
  MI.eraseFromParent();

uint32_t AddrHiVal = Info->get32BitAddressHighBits();
auto PtrLo = B.buildPtrToInt(S32, Src);
if (AddrHiVal == 0) {
  B.buildIntToPtr(Dst, Zext);
  auto HighAddr = B.buildConstant(S32, AddrHiVal);
  B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
MI.eraseFromParent();

MI.eraseFromParent();

assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
auto C1 = B.buildFConstant(Ty, C1Val);
auto CopySign = B.buildFCopysign(Ty, C1, Src);
auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
auto C2 = B.buildFConstant(Ty, C2Val);
auto Fabs = B.buildFAbs(Ty, Src);
B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
MI.eraseFromParent();
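// f64 ceil: ceil(x) = trunc(x) + ((x > 0 && x != trunc(x)) ? 1.0 : 0.0).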
auto Trunc = B.buildIntrinsicTrunc(S64, Src);
const auto Zero = B.buildFConstant(S64, 0.0);
const auto One = B.buildFConstant(S64, 1.0);
auto And = B.buildAnd(S1, Lt0, NeTrunc);
auto Add = B.buildSelect(S64, And, One, Zero);
B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
MI.eraseFromParent();
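// frem is expanded as src0 - trunc(src0 / src1) * src1, with the
// multiply-subtract folded into a single FMA of the negated truncated
// quotient.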
Register Src0Reg = MI.getOperand(1).getReg();
Register Src1Reg = MI.getOperand(2).getReg();
auto Flags = MI.getFlags();
auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
auto Neg = B.buildFNeg(Ty, Trunc, Flags);
B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
MI.eraseFromParent();

const unsigned FractBits = 52;
const unsigned ExpBits = 11;
auto Const0 = B.buildConstant(S32, FractBits - 32);
auto Const1 = B.buildConstant(S32, ExpBits);
auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));
return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));

auto Unmerge = B.buildUnmerge({S32, S32}, Src);
const unsigned FractBits = 52;
const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
const auto Zero32 = B.buildConstant(S32, 0);
auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
auto Shr = B.buildAShr(S64, FractMask, Exp);
auto Not = B.buildNot(S64, Shr);
auto Tmp0 = B.buildAnd(S64, Src, Not);
auto FiftyOne = B.buildConstant(S32, FractBits - 1);
auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
MI.eraseFromParent();
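// i64 -> f64: convert the two 32-bit halves separately and recombine them as
// fp(hi) * 2^32 + fp(lo) using ldexp.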
auto Unmerge = B.buildUnmerge({S32, S32}, Src);
auto ThirtyTwo = B.buildConstant(S32, 32);
auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                    : B.buildUITOFP(S64, Unmerge.getReg(1));
auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
B.buildFAdd(Dst, LdExp, CvtLo);
MI.eraseFromParent();

auto One = B.buildConstant(S32, 1);
auto ThirtyOne = B.buildConstant(S32, 31);
auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
    .addUse(Unmerge.getReg(1));
auto LS2 = B.buildSub(S32, LS, One);
ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
auto Norm = B.buildShl(S64, Src, ShAmt);
auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
B.buildFLdexp(Dst, FVal, Scale);
MI.eraseFromParent();
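// f32/f64 -> i64: the truncated input is split into high and low 32-bit words
// by scaling down, flooring, and folding the high word back out with an FMA;
// each word is then converted separately, and for the signed case the sign is
// reapplied to the merged result.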
unsigned Flags = MI.getFlags();
auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
Trunc = B.buildFAbs(S32, Trunc, Flags);
K0 = B.buildFConstant(
K1 = B.buildFConstant(
K0 = B.buildFConstant(
K1 = B.buildFConstant(
auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
    : B.buildFPTOUI(S32, FloorMul);
auto Lo = B.buildFPTOUI(S32, Fma);
Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
B.buildMergeLikeInstr(Dst, {Lo, Hi});
MI.eraseFromParent();

unsigned StartIdx = Offset / 32;
auto Unmerge = B.buildUnmerge(LLT::scalar(32), SrcReg);
if (DstCount == 1) {
  B.buildIntToPtr(DstReg, Unmerge.getReg(StartIdx));
for (unsigned I = 0; I < DstCount; ++I)
  MergeVec.push_back(Unmerge.getReg(StartIdx + I));
B.buildMergeLikeInstr(DstReg, MergeVec);
MI.eraseFromParent();

Register InsertSrc = MI.getOperand(2).getReg();
if (Offset % 32 != 0 || DstSize % 32 != 0 || InsertSize % 32 != 0)
unsigned DstCount = DstSize / 32;
unsigned InsertCount = InsertSize / 32;
unsigned StartIdx = Offset / 32;
auto SrcUnmerge = B.buildUnmerge(S32, SrcReg);
for (unsigned I = 0; I < StartIdx; ++I)
if (InsertCount == 1) {
  InsertSrc = B.buildPtrToInt(S32, InsertSrc).getReg(0);
auto InsertUnmerge = B.buildUnmerge(S32, InsertSrc);
for (unsigned I = 0; I < InsertCount; ++I)
for (unsigned I = StartIdx + InsertCount; I < DstCount; ++I)
B.buildMergeLikeInstr(DstReg, MergeVec);
MI.eraseFromParent();

auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
B.buildIntToPtr(Dst, IntElt);
MI.eraseFromParent();

std::optional<ValueAndVReg> MaybeIdxVal =
const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
auto Unmerge = B.buildUnmerge(EltTy, Vec);
B.buildCopy(Dst, Unmerge.getReg(IdxVal));
MI.eraseFromParent();

auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
auto IntIns = B.buildPtrToInt(IntTy, Ins);
auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
B.buildIntToPtr(Dst, IntVecDest);
MI.eraseFromParent();

std::optional<ValueAndVReg> MaybeIdxVal =
const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
if (IdxVal < NumElts) {
  for (unsigned i = 0; i < NumElts; ++i)
  B.buildUnmerge(SrcRegs, Vec);
  SrcRegs[IdxVal] = MI.getOperand(2).getReg();
  B.buildMergeLikeInstr(Dst, SrcRegs);
MI.eraseFromParent();
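// sin/cos: pre-scale the input by 1/2pi (taking the hardware fract first on
// subtargets with the reduced trig range) and emit the amdgcn.sin or
// amdgcn.cos intrinsic.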
unsigned Flags = MI.getFlags();
if (ST.hasTrigReducedRange()) {
  auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
  TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
      .addUse(MulVal.getReg(0))
  TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
MI.eraseFromParent();

    unsigned GAFlags) const {
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
if (ST.has64BitLiterals()) {
  B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
  B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);
if (!B.getMRI()->getRegClassOrNull(PCReg))
  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
B.buildExtract(DstReg, PCReg, 0);

if (RequiresHighHalf && ST.has64BitLiterals()) {
  MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
  B.buildInstr(AMDGPU::S_MOV_B64)
MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
B.buildInstr(AMDGPU::S_MOV_B32)
if (RequiresHighHalf) {
         "Must provide a 64-bit pointer type!");
  MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_MOV_B32)
  MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
  B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
  if (AddrDst != DstReg)
    B.buildCast(DstReg, AddrDst);
} else if (AddrLo != DstReg) {
  B.buildCast(DstReg, AddrLo);

GV->getName() != "llvm.amdgcn.module.lds" &&
    Fn, "local memory global used by non-kernel function",
B.buildUndef(DstReg);
MI.eraseFromParent();

auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
B.buildIntToPtr(DstReg, Sz);
MI.eraseFromParent();

MI.eraseFromParent();
if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
  MI.eraseFromParent();
MI.eraseFromParent();
MI.eraseFromParent();

if (Ty.getSizeInBits() == 32) {
  auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
  B.buildExtract(DstReg, Load, 0);
  B.buildLoad(DstReg, GOTAddr, *GOTMMO);
MI.eraseFromParent();
auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
MI.getOperand(1).setReg(Cast.getReg(0));
if (MI.getOpcode() != AMDGPU::G_LOAD)

if (WideMemSize == ValSize) {
  MI.setMemRefs(MF, {WideMMO});
if (ValSize > WideMemSize)
  WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
  B.buildTrunc(ValReg, WideLoad).getReg(0);
  WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
  B.buildExtract(ValReg, WideLoad, 0);
  WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
  B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
MI.eraseFromParent();

Register DataReg = MI.getOperand(0).getReg();
       "this should not have been custom lowered");
Register PackedVal = B.buildBuildVector(VecTy, {NewVal, CmpVal}).getReg(0);
B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .setMemRefs(MI.memoperands());
MI.eraseFromParent();

switch (DefMI->getOpcode()) {
case TargetOpcode::G_INTRINSIC: {
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_log:
  case Intrinsic::amdgcn_log_clamp:
  case Intrinsic::amdgcn_exp2:
  case Intrinsic::amdgcn_sqrt:
case TargetOpcode::G_FSQRT:
case TargetOpcode::G_FFREXP: {
  if (DefMI->getOperand(0).getReg() == Src)
case TargetOpcode::G_FPEXT: {

std::pair<Register, Register>
    unsigned Flags) const {
auto SmallestNormal = B.buildFConstant(
auto IsLtSmallestNormal =
auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
auto One = B.buildFConstant(F32, 1.0);
    B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};

LLT Ty = B.getMRI()->getType(Dst);
unsigned Flags = MI.getFlags();
auto Ext = B.buildFPExt(F32, Src, Flags);
auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
    .addUse(Ext.getReg(0))
B.buildFPTrunc(Dst, Log2, Flags);
MI.eraseFromParent();

B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
MI.eraseFromParent();

auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
    .addUse(ScaledInput)
auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
auto Zero = B.buildFConstant(Ty, 0.0);
    B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
B.buildFSub(Dst, Log2, ResultOffset, Flags);
MI.eraseFromParent();
auto FMul = B.buildFMul(Ty, X, Y, Flags);
return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);

const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
unsigned Flags = MI.getFlags();

auto PromoteSrc = B.buildFPExt(F32, X);
B.buildFPTrunc(Dst, LogVal);
MI.eraseFromParent();

B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);

if (ST.hasFastFMAF32()) {
  const float c_log10 = 0x1.344134p-2f;
  const float cc_log10 = 0x1.09f79ep-26f;
  const float c_log = 0x1.62e42ep-1f;
  const float cc_log = 0x1.efa39ep-25f;

  auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
  auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
  R = B.buildFMul(Ty, Y, C, NewFlags).getReg(0);
  auto NegR = B.buildFNeg(Ty, R, NewFlags);
  auto FMA0 = B.buildFMA(Ty, Y, C, NegR, NewFlags);
  auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, NewFlags);
  R = B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0);

  const float ch_log10 = 0x1.344000p-2f;
  const float ct_log10 = 0x1.3509f6p-18f;
  const float ch_log = 0x1.62e000p-1f;
  const float ct_log = 0x1.0bfbe8p-15f;

  auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
  auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
  auto MaskConst = B.buildConstant(Ty, 0xfffff000);
  auto YH = B.buildAnd(Ty, Y, MaskConst);
  auto YT = B.buildFSub(Ty, Y, YH, Flags);
  auto YTCT = B.buildFMul(Ty, YT, CT, NewFlags);
      getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags);
  R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, NewFlags);

const bool IsFiniteOnly =
if (!IsFiniteOnly) {
  auto Fabs = B.buildFAbs(Ty, Y);
  R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);

auto Zero = B.buildFConstant(Ty, 0.0);
    B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
B.buildFSub(Dst, R, Shift, Flags);
B.buildCopy(Dst, R);
MI.eraseFromParent();
    unsigned Flags) const {
const double Log2BaseInverted =
LLT Ty = B.getMRI()->getType(Dst);

auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
auto Zero = B.buildFConstant(Ty, 0.0);
    B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
if (ST.hasFastFMAF32())
  B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
  auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
  B.buildFAdd(Dst, Mul, ResultOffset, Flags);

    ? B.buildFLog2(Ty, Src, Flags)
    : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);

unsigned Flags = MI.getFlags();
LLT Ty = B.getMRI()->getType(Dst);
auto Ext = B.buildFPExt(F32, Src, Flags);
auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
    .addUse(Ext.getReg(0))
B.buildFPTrunc(Dst, Log2, Flags);
MI.eraseFromParent();

MI.eraseFromParent();

auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
    RangeCheckConst, Flags);
auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
auto Zero = B.buildFConstant(Ty, 0.0);
auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
    .addUse(AddInput.getReg(0))
auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
auto One = B.buildFConstant(Ty, 1.0);
auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
B.buildFMul(Dst, Exp2, ResultScale, Flags);
MI.eraseFromParent();
    const SrcOp &Src, unsigned Flags) {
LLT Ty = Dst.getLLTTy(*B.getMRI());
return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst})
    .addUse(Src.getReg())
return B.buildFExp2(Dst, Src, Flags);

    bool IsExp10) const {
LLT Ty = B.getMRI()->getType(X);
auto Const = B.buildFConstant(Ty, IsExp10 ? 0x1.a934f0p+1f : numbers::log2e);
auto Mul = B.buildFMul(Ty, X, Const, Flags);

LLT Ty = B.getMRI()->getType(Dst);
auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
    .addUse(ExpInput.getReg(0))
auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);

    unsigned Flags) const {
LLT Ty = B.getMRI()->getType(Dst);
auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
auto Mul1 = B.buildFMul(Ty, X, K1, Flags);
auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
auto Mul0 = B.buildFMul(Ty, X, K0, Flags);
auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
B.buildFMul(Dst, Exp2_0, Exp2_1, Flags);

auto Threshold = B.buildFConstant(Ty, -0x1.2f7030p+5f);
auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+5f);
auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X);
auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
auto Mul1 = B.buildFMul(Ty, AdjustedX, K1, Flags);
auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
auto Mul0 = B.buildFMul(Ty, AdjustedX, K0, Flags);
auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
auto MulExps = B.buildFMul(Ty, Exp2_0, Exp2_1, Flags);
auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.9f623ep-107f);
auto AdjustedResult = B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags);
B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps);
if (MI.getOpcode() == TargetOpcode::G_FEXP2) {
  Dn = B.buildFRint(S64, X, Flags).getReg(0);
  F = B.buildFSub(S64, X, Dn, Flags).getReg(0);
  auto C1 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
  auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
  auto Mul2 = B.buildFMul(S64, F, C2, Flags).getReg(0);
  T = B.buildFMA(S64, F, C1, Mul2, Flags).getReg(0);
} else if (MI.getOpcode() == TargetOpcode::G_FEXP10) {
  auto C1 = B.buildFConstant(S64, APFloat(0x1.a934f0979a371p+1));
  auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
  Dn = B.buildFRint(S64, Mul, Flags).getReg(0);
  auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
  auto C2 = B.buildFConstant(S64, APFloat(-0x1.9dc1da994fd21p-59));
  auto C3 = B.buildFConstant(S64, APFloat(0x1.34413509f79ffp-2));
  auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
  F = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);
  auto C4 = B.buildFConstant(S64, APFloat(0x1.26bb1bbb55516p+1));
  auto C5 = B.buildFConstant(S64, APFloat(-0x1.f48ad494ea3e9p-53));
  auto MulF = B.buildFMul(S64, F, C5, Flags).getReg(0);
  T = B.buildFMA(S64, F, C4, MulF, Flags).getReg(0);
  auto C1 = B.buildFConstant(S64, APFloat(0x1.71547652b82fep+0));
  auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
  Dn = B.buildFRint(S64, Mul, Flags).getReg(0);
  auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
  auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
  auto C3 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
  auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
  T = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);

auto P = B.buildFConstant(S64, 0x1.ade156a5dcb37p-26);
P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.28af3fca7ab0cp-22),
P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.71dee623fde64p-19),
P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01997c89e6b0p-16),
P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01a014761f6ep-13),
P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.6c16c1852b7b0p-10),
P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.1111111122322p-7), Flags);
P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.55555555502a1p-5), Flags);
P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.5555555555511p-3), Flags);
P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.000000000000bp-1), Flags);

auto One = B.buildFConstant(S64, 1.0);
P = B.buildFMA(S64, T, P, One, Flags);
P = B.buildFMA(S64, T, P, One, Flags);

auto DnInt = B.buildFPTOSI(S32, Dn);
auto Z = B.buildFLdexp(S64, P, DnInt, Flags);
Z = B.buildSelect(S64, CondHi, Z, PInf, Flags);
B.buildSelect(MI.getOperand(0).getReg(), CondLo, Z, Zero, Flags);
MI.eraseFromParent();
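// General f32 exp/exp10 lowering: f16 inputs are promoted to f32, and the
// input is otherwise split into high and low products so that exp2 of the
// rounded high part can be recombined via ldexp, followed by explicit
// underflow/overflow clamping.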
const unsigned Flags = MI.getFlags();
const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
MI.eraseFromParent();

auto Ext = B.buildFPExt(F32, X, Flags);
B.buildFPTrunc(Dst, Lowered, Flags);
MI.eraseFromParent();

MI.eraseFromParent();

const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;

if (ST.hasFastFMAF32()) {
  const float cc_exp = 0x1.4ae0bep-26f;
  const float c_exp10 = 0x1.a934f0p+1f;
  const float cc_exp10 = 0x1.2f346ep-24f;

  auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
  PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
  auto NegPH = B.buildFNeg(Ty, PH, Flags);
  auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
  auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
  PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);

  const float ch_exp = 0x1.714000p+0f;
  const float cl_exp = 0x1.47652ap-12f;
  const float ch_exp10 = 0x1.a92000p+1f;
  const float cl_exp10 = 0x1.4f0978p-11f;

  auto MaskConst = B.buildConstant(Ty, 0xfffff000);
  auto XH = B.buildAnd(Ty, X, MaskConst);
  auto XL = B.buildFSub(Ty, X, XH, Flags);
  auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
  PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
  auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
  auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
      getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
  PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);

auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
    .addUse(A.getReg(0))
auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);

auto UnderflowCheckConst =
    B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
auto Zero = B.buildFConstant(Ty, 0.0);
R = B.buildSelect(Ty, Underflow, Zero, R);

auto OverflowCheckConst =
    B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
R = B.buildSelect(Ty, Overflow, Inf, R, Flags);

B.buildCopy(Dst, R);
MI.eraseFromParent();
unsigned Flags = MI.getFlags();
LLT Ty = B.getMRI()->getType(Dst);

  auto Log = B.buildFLog2(F32, Src0, Flags);
  auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
      .addUse(Log.getReg(0))
  B.buildFExp2(Dst, Mul, Flags);
} else if (Ty == F16) {
  auto Log = B.buildFLog2(F16, Src0, Flags);
  auto Ext0 = B.buildFPExt(F32, Log, Flags);
  auto Ext1 = B.buildFPExt(F32, Src1, Flags);
  auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
      .addUse(Ext0.getReg(0))
      .addUse(Ext1.getReg(0))
  B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
MI.eraseFromParent();

ModSrc = SrcFNeg->getOperand(1).getReg();
ModSrc = SrcFAbs->getOperand(1).getReg();
ModSrc = SrcFAbs->getOperand(1).getReg();

Register OrigSrc = MI.getOperand(1).getReg();
unsigned Flags = MI.getFlags();
       "this should not have been custom lowered");
auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
  B.buildFMinNumIEEE(Min, Fract, Const, Flags);
  B.buildFMinNum(Min, Fract, Const, Flags);
CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
MI.eraseFromParent();

if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
  Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
  Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
B.buildBitcast(Dst, Merge);
MI.eraseFromParent();
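// Wide multiplication: the result is accumulated 32 bits at a time with
// G_AMDGPU_MAD_U64_U32 chains; carries from even- and odd-aligned partial
// products are tracked separately so the chains can be interleaved.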
    bool UsePartialMad64_32,
    bool SeparateOddAlignedProducts) const {

auto getZero32 = [&]() -> Register {
  Zero32 = B.buildConstant(S32, 0).getReg(0);
auto getZero64 = [&]() -> Register {
  Zero64 = B.buildConstant(S64, 0).getReg(0);

for (unsigned i = 0; i < Src0.size(); ++i) {

if (CarryIn.empty())
bool HaveCarryOut = true;
if (CarryIn.size() == 1) {
  LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
  CarryAccum = getZero32();
  CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
  for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
        B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
  LocalAccum = getZero32();
  HaveCarryOut = false;
    B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
LocalAccum = Add.getReg(0);

auto buildMadChain =
  assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
         (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));

  if (LocalAccum.size() == 1 &&
      (!UsePartialMad64_32 || !CarryIn.empty())) {
      unsigned j1 = DstIndex - j0;
      if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
      auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
        LocalAccum[0] = Mul.getReg(0);
        if (CarryIn.empty()) {
          LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
              B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
    } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));

  if (j0 <= DstIndex) {
    bool HaveSmallAccum = false;
    if (LocalAccum[0]) {
      if (LocalAccum.size() == 1) {
        Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
        HaveSmallAccum = true;
      } else if (LocalAccum[1]) {
        Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
        HaveSmallAccum = false;
        Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
        HaveSmallAccum = true;
      assert(LocalAccum.size() == 1 || !LocalAccum[1]);
      HaveSmallAccum = true;

      unsigned j1 = DstIndex - j0;
      if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
      auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
                              {Src0[j0], Src1[j1], Tmp});
      Tmp = Mad.getReg(0);
      if (!HaveSmallAccum)
        CarryOut.push_back(Mad.getReg(1));
      HaveSmallAccum = false;
    } while (j0 <= DstIndex);

    auto Unmerge = B.buildUnmerge(S32, Tmp);
    LocalAccum[0] = Unmerge.getReg(0);
    if (LocalAccum.size() > 1)
      LocalAccum[1] = Unmerge.getReg(1);
for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
  Carry OddCarryIn = std::move(OddCarry);
  Carry EvenCarryIn = std::move(EvenCarry);

  if (2 * i < Accum.size()) {
    auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
    EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);

    if (!SeparateOddAlignedProducts) {
      auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
      OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
      bool IsHighest = 2 * i >= Accum.size();
          .take_front(IsHighest ? 1 : 2);
      OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);

        Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
        Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
        Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
      Accum[2 * i - 1] = Lo->getOperand(0).getReg();

      auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
                             Lo->getOperand(1).getReg());
      Accum[2 * i] = Hi.getReg(0);
      SeparateOddCarry = Hi.getReg(1);

  if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
    EvenCarryIn.push_back(CarryOut);

  if (2 * i < Accum.size()) {
    if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
      OddCarry.push_back(CarryOut);
assert(ST.hasMad64_32());
assert(MI.getOpcode() == TargetOpcode::G_MUL);

unsigned Size = Ty.getSizeInBits();
if (ST.hasVectorMulU64() && Size == 64)
unsigned NumParts = Size / 32;

const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();

for (unsigned i = 0; i < NumParts; ++i) {
B.buildUnmerge(Src0Parts, Src0);
B.buildUnmerge(Src1Parts, Src1);
buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
              SeparateOddAlignedProducts);
B.buildMergeLikeInstr(DstReg, AccumRegs);
MI.eraseFromParent();

unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
                      ? AMDGPU::G_AMDGPU_FFBH_U32
                      : AMDGPU::G_AMDGPU_FFBL_B32;
auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
MI.eraseFromParent();

TypeSize NumBits = SrcTy.getSizeInBits();
auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
auto Shift = B.buildShl(S32, Extend, ShiftAmt);
auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
B.buildTrunc(Dst, Ctlz);
MI.eraseFromParent();

assert(SrcTy == S32 && "legalizeCTLS only supports s32");
unsigned BitWidth = SrcTy.getSizeInBits();
auto Sffbh = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}).addUse(Src);
B.buildSub(Dst, Clamped, B.buildConstant(S32, 1));
MI.eraseFromParent();

if (MI.getOpcode() != TargetOpcode::G_XOR)
return ConstVal == -1;

Register CondDef = MI.getOperand(0).getReg();
if (UseMI->getParent() != Parent ||
    UseMI->getOpcode() != AMDGPU::G_BRCOND)
UncondBrTarget = &*NextMBB;
if (Next->getOpcode() != AMDGPU::G_BR)

    *ArgRC, B.getDebugLoc(), ArgTy);
const unsigned Mask = Arg->getMask();
auto ShiftAmt = B.buildConstant(S32, Shift);
AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
B.buildCopy(DstReg, LiveIn);

if (!ST.hasClusters()) {
  MI.eraseFromParent();

auto One = B.buildConstant(S32, 1);
auto ClusterSizeXYZ = B.buildAdd(S32, ClusterMaxIdXYZ, One);
auto GlobalIdXYZ = B.buildAdd(S32, ClusterWorkGroupIdXYZ,
                              B.buildMul(S32, ClusterIdXYZ, ClusterSizeXYZ));
B.buildCopy(DstReg, GlobalIdXYZ);
MI.eraseFromParent();

B.buildCopy(DstReg, ClusterIdXYZ);
MI.eraseFromParent();

unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
MRI.setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
B.buildInstr(AMDGPU::S_GETREG_B32_const)
    .addImm(ClusterIdField);
auto Zero = B.buildConstant(S32, 0);
B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
MI.eraseFromParent();
auto LoadConstant = [&](unsigned N) {
  B.buildConstant(DstReg, N);

if (ST.hasArchitectedSGPRs() &&
  Arg = &WorkGroupIDX;
  ArgRC = &AMDGPU::SReg_32RegClass;
  Arg = &WorkGroupIDY;
  ArgRC = &AMDGPU::SReg_32RegClass;
  Arg = &WorkGroupIDZ;
  ArgRC = &AMDGPU::SReg_32RegClass;
  if (HasFixedDims && ClusterDims.getDims()[0] == 1)
    return LoadConstant(0);
  Arg = &ClusterWorkGroupIDX;
  ArgRC = &AMDGPU::SReg_32RegClass;
  if (HasFixedDims && ClusterDims.getDims()[1] == 1)
    return LoadConstant(0);
  Arg = &ClusterWorkGroupIDY;
  ArgRC = &AMDGPU::SReg_32RegClass;
  if (HasFixedDims && ClusterDims.getDims()[2] == 1)
    return LoadConstant(0);
  Arg = &ClusterWorkGroupIDZ;
  ArgRC = &AMDGPU::SReg_32RegClass;
    return LoadConstant(ClusterDims.getDims()[0] - 1);
  Arg = &ClusterWorkGroupMaxIDX;
  ArgRC = &AMDGPU::SReg_32RegClass;
    return LoadConstant(ClusterDims.getDims()[1] - 1);
  Arg = &ClusterWorkGroupMaxIDY;
  ArgRC = &AMDGPU::SReg_32RegClass;
    return LoadConstant(ClusterDims.getDims()[2] - 1);
  Arg = &ClusterWorkGroupMaxIDZ;
  ArgRC = &AMDGPU::SReg_32RegClass;
  Arg = &ClusterWorkGroupMaxFlatID;
  ArgRC = &AMDGPU::SReg_32RegClass;
  return LoadConstant(0);

B.buildUndef(DstReg);
if (!Arg->isRegister() || !Arg->getRegister().isValid())
MI.eraseFromParent();

B.buildConstant(MI.getOperand(0).getReg(), C);
MI.eraseFromParent();

unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
B.buildUndef(DstReg);
MI.eraseFromParent();
if (Arg->isMasked()) {
MI.eraseFromParent();

Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);

    Align Alignment) const {
       "unexpected kernarg parameter type");
MI.eraseFromParent();
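// Unsigned 32-bit division: start from a hardware reciprocal (RCP_IFLAG),
// refine it with one Newton-Raphson step, then fix the quotient and remainder
// with conditional +1 / -Y adjustments.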
auto FloatY = B.buildUITOFP(S32, Y);
auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
auto Z = B.buildFPTOUI(S32, ScaledY);

auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
auto NegYZ = B.buildMul(S32, NegY, Z);
Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

auto Q = B.buildUMulH(S32, X, Z);
auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

auto One = B.buildConstant(S32, 1);
Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
  B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
  B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
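// The unsigned 64-bit path follows the same reciprocal-refinement scheme, but
// each step is carried out on 32-bit halves with explicit carry/borrow
// propagation before the final quotient and remainder selects.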
5180   auto Unmerge = B.buildUnmerge(S32, Val);
5182   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
5183   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
5185   auto Mad = B.buildFMAD(
5189   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
5190   auto Mul1 = B.buildFMul(
5194   auto Mul2 = B.buildFMul(
5196   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
5199   auto Mad2 = B.buildFMAD(
5203   auto ResultLo = B.buildFPTOUI(S32, Mad2);
5204   auto ResultHi = B.buildFPTOUI(S32, Trunc);
5206 return {ResultLo.getReg(0), ResultHi.getReg(0)};
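// Note: the long sequence below is the 64-bit unsigned division expansion:
// two Newton-Raphson-style refinements of the 64-bit reciprocal
// (MulLo1/MulHi1/Add1 and MulLo2/MulHi2/Add2), a candidate quotient
// MulHi3 = umulh(Numer, Add2), and then up to two conditional quotient+1 /
// remainder-Denom corrections (Add3/Sub2 and Add4/Sub3) chosen by the
// carry-propagating comparisons.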
5221   auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
5223   auto Zero64 = B.buildConstant(S64, 0);
5224   auto NegDenom = B.buildSub(S64, Zero64, Denom);
5226   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
5227   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
5229   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
5230   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
5231   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
5233   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
5234   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
5235   auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
5237   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
5238   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
5239   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
5240   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
5241   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
5243   auto Zero32 = B.buildConstant(S32, 0);
5244   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
5245   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
5246   auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
5248   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
5249   Register NumerLo = UnmergeNumer.getReg(0);
5250   Register NumerHi = UnmergeNumer.getReg(1);
5252   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
5253   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
5254   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
5255   Register Mul3_Lo = UnmergeMul3.getReg(0);
5256   Register Mul3_Hi = UnmergeMul3.getReg(1);
5257   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
5258   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
5259   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
5260   auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
5262   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
5263   Register DenomLo = UnmergeDenom.getReg(0);
5264   Register DenomHi = UnmergeDenom.getReg(1);
5267   auto C1 = B.buildSExt(S32, CmpHi);
5270   auto C2 = B.buildSExt(S32, CmpLo);
5273   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
5280   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
5281   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
5282   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
5283   auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
5285   auto One64 = B.buildConstant(S64, 1);
5286   auto Add3 = B.buildAdd(S64, MulHi3, One64);
5292   auto C6 = B.buildSelect(
5296   auto Add4 = B.buildAdd(S64, Add3, One64);
5297   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
5299   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
5300   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
5301   auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
5307   auto Sel1 = B.buildSelect(
5314   auto Sel2 = B.buildSelect(
5325   switch (MI.getOpcode()) {
5328   case AMDGPU::G_UDIV: {
5329     DstDivReg = MI.getOperand(0).getReg();
5332   case AMDGPU::G_UREM: {
5333     DstRemReg = MI.getOperand(0).getReg();
5336   case AMDGPU::G_UDIVREM: {
5337     DstDivReg = MI.getOperand(0).getReg();
5338     DstRemReg = MI.getOperand(1).getReg();
5345   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5346   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
5347   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
5357 MI.eraseFromParent();
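// Note: signed division is reduced to the unsigned expansion above. The
// fragments below take absolute values branch-free (abs = (x + sign) ^ sign,
// where sign = x >> (bits - 1)) and then restore the result signs: the
// quotient sign is sign(LHS) ^ sign(RHS) and the remainder sign is sign(LHS).
// For example, -7 / 2: |-7| / 2 = 3, sign = -1, and (3 ^ -1) - (-1) = -3.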
5368   if (Ty != S32 && Ty != S64)
5371   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5372   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
5373   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
5375   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
5376   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
5377   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
5379   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
5380   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
5382   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
5383   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
5385   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
5386   switch (MI.getOpcode()) {
5389   case AMDGPU::G_SDIV: {
5390     DstDivReg = MI.getOperand(0).getReg();
5394   case AMDGPU::G_SREM: {
5395     DstRemReg = MI.getOperand(0).getReg();
5399   case AMDGPU::G_SDIVREM: {
5400     DstDivReg = MI.getOperand(0).getReg();
5401     DstRemReg = MI.getOperand(1).getReg();
5414     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
5415     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
5416     B.buildSub(DstDivReg, SignXor, Sign);
5420     auto Sign = LHSign.getReg(0);
5421     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
5422 B.buildSub(DstRemReg, SignXor, Sign);
5425 MI.eraseFromParent();
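// Note: the next block handles the fast / relaxed-accuracy fdiv lowering: a
// constant numerator of 1.0 or -1.0 folds straight to amdgcn_rcp (with an
// fneg of the denominator for -1.0); otherwise the quotient is formed as
// LHS * rcp(RHS), optionally refining the reciprocal with the Tmp0/Tmp1 FMA
// chain when a more accurate result is requested.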
5441   if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
5452     if (CLHS->isExactlyValue(1.0)) {
5453       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5457       MI.eraseFromParent();
5462     if (CLHS->isExactlyValue(-1.0)) {
5463       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
5464       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5465           .addUse(FNeg.getReg(0))
5468       MI.eraseFromParent();
5475   if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
5480   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5483   B.buildFMul(Res, LHS, RCP, Flags);
5485   MI.eraseFromParent();
5500   if (!AllowInaccurateRcp)
5508     X = B.buildFConstant(ResTy, 1.0).getReg(0);
5510   Register NegY = IsNegRcp ? Y : B.buildFNeg(ResTy, Y).getReg(0);
5511   auto One = B.buildFConstant(ResTy, 1.0);
5513   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5517     R = B.buildFNeg(ResTy, R);
5519   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
5520   R = B.buildFMA(ResTy, Tmp0, R, R);
5522   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
5523   R = B.buildFMA(ResTy, Tmp1, R, R);
5527     B.buildCopy(Res, R);
5528     MI.eraseFromParent();
5532   auto Ret = B.buildFMul(ResTy, X, R);
5533   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
5535   B.buildFMA(Res, Tmp2, R, Ret);
5536   MI.eraseFromParent();
5568   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
5569   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
5570   auto NegRHSExt = B.buildFNeg(S32, RHSExt);
5571   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5572                  .addUse(RHSExt.getReg(0))
5574   auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
5576   if (ST.hasMadMacF32Insts()) {
5577     Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5578     Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
5579     Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5581     Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5582     Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
5583     Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5585   auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
5586   Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
5587   Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
5588   auto RDst = B.buildFPTrunc(S16, Quot, Flags);
5589 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5590 .addUse(RDst.getReg(0))
5595 MI.eraseFromParent();
5608 unsigned SPDenormMode =
5611 if (ST.hasDenormModeInst()) {
5613     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
5615 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
5616 B.buildInstr(AMDGPU::S_DENORM_MODE)
5617 .addImm(NewDenormModeValue);
5620 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
5621 .addImm(SPDenormMode)
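// Note: f32 fdiv below follows the div_scale / div_fmas / div_fixup recipe:
// scale the numerator and denominator, refine the reciprocal of the scaled
// denominator with the Fma0..Fma4 chain, and let div_fixup undo the scaling
// and patch up special cases. The S_GETREG / S_SETREG (or S_DENORM_MODE)
// fragments around the chain appear to temporarily enable f32 denormals when
// the current mode would otherwise flush them.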
5643   auto One = B.buildFConstant(S32, 1.0f);
5645   auto DenominatorScaled =
5646       B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5651   auto NumeratorScaled =
5652       B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5658   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5659                        .addUse(DenominatorScaled.getReg(0))
5661   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
5664   const bool HasDynamicDenormals =
5669   if (!PreservesDenormals) {
5670     if (HasDynamicDenormals) {
5672       B.buildInstr(AMDGPU::S_GETREG_B32)
5673           .addDef(SavedSPDenormMode)
5679   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
5680   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
5681   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
5682   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
5683   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
5684   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
5686   if (!PreservesDenormals) {
5687     if (HasDynamicDenormals) {
5688       assert(SavedSPDenormMode);
5689       B.buildInstr(AMDGPU::S_SETREG_B32)
5690           .addReg(SavedSPDenormMode)
5696   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
5697                   .addUse(Fma4.getReg(0))
5698                   .addUse(Fma1.getReg(0))
5699                   .addUse(Fma3.getReg(0))
5700                   .addUse(NumeratorScaled.getReg(1))
5703   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5704       .addUse(Fmas.getReg(0))
5709 MI.eraseFromParent();
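// Note: the f64 path below is the same div_scale / div_fmas / div_fixup
// structure as the f32 path, plus a workaround that recomputes the div_scale
// condition bit from unmerged 32-bit halves on subtargets where the scale
// condition output is not directly usable.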
5728   auto One = B.buildFConstant(S64, 1.0);
5730   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5736   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
5738   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
5739                  .addUse(DivScale0.getReg(0))
5742   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
5743   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
5744   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
5746   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5752   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
5753   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
5754   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
5757   if (!ST.hasUsableDivScaleConditionOutput()) {
5763     auto NumUnmerge = B.buildUnmerge(S32, LHS);
5764     auto DenUnmerge = B.buildUnmerge(S32, RHS);
5765     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5766     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5769                              Scale1Unmerge.getReg(1));
5771                              Scale0Unmerge.getReg(1));
5772     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
5774     Scale = DivScale1.getReg(1);
5777   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5778                   .addUse(Fma4.getReg(0))
5779                   .addUse(Fma3.getReg(0))
5780                   .addUse(Mul.getReg(0))
5784   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5785       .addUse(Fmas.getReg(0))
5790   MI.eraseFromParent();
5805   auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5808   auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5812   if (ST.hasFractBug()) {
5813     auto Fabs = B.buildFAbs(Ty, Val);
5817     auto Zero = B.buildConstant(InstrExpTy, 0);
5818     Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5819     Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5822 B.buildCopy(Res0, Mant);
5823 B.buildSExtOrTrunc(Res1, Exp);
5825 MI.eraseFromParent();
5840   auto Abs = B.buildFAbs(S32, RHS, Flags);
5843   auto C0 = B.buildFConstant(S32, 0x1p+96f);
5844   auto C1 = B.buildFConstant(S32, 0x1p-32f);
5845   auto C2 = B.buildFConstant(S32, 1.0f);
5848   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5850   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5852   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5853                  .addUse(Mul0.getReg(0))
5856   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5858   B.buildFMul(Res, Sel, Mul1, Flags);
5860 MI.eraseFromParent();
5869   unsigned Flags = MI.getFlags();
5870   assert(!ST.has16BitInsts());
5872   auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
5873   auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5874                   .addUse(Ext.getReg(0))
5876   B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
5877 MI.eraseFromParent();
5887   const unsigned Flags = MI.getFlags();
5896 MI.eraseFromParent();
5900   auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5902   auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5903   auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5904   auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5909         .addUse(SqrtX.getReg(0))
5912     auto NegOne = B.buildConstant(I32, -1);
5913     auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5915     auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5916     auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5918     auto PosOne = B.buildConstant(I32, 1);
5919     auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5921     auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5922     auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5924     auto Zero = B.buildFConstant(F32, 0.0f);
5928         B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5932         B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5935         B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5936     B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5938     auto Half = B.buildFConstant(F32, 0.5f);
5939     auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5940     auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5941     auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5942     SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5943     SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5944     auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5945     auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5946     SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5949   auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5951   auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5953   SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5956   B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5958 MI.eraseFromParent();
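// Note: f64 sqrt below scales very small inputs up by 2^256 via ldexp, seeds
// with amdgcn_rsq, runs FMA-based refinement steps (SqrtH*/SqrtS*/SqrtD*),
// rescales the result by 2^-128, and finally selects the original input for
// +/-0 and infinity.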
5993   unsigned Flags = MI.getFlags();
5998   auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
6000     ZeroInt = B.buildConstant(S32, 0).getReg(0);
6004     auto ScaleUpFactor = B.buildConstant(S32, 256);
6005     auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
6006     SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags).getReg(0);
6009   auto SqrtY = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX);
6011   auto Half = B.buildFConstant(F64, 0.5);
6012   auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
6013   auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
6015   auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
6016   auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
6018   auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
6019   auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
6021   auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
6022   auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
6024   auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
6026   Register SqrtRet = SqrtS2.getReg(0);
6028   auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
6029   auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
6030   auto SqrtD2 = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
6033     auto ScaleDownFactor = B.buildConstant(S32, -128);
6034     auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
6035     SqrtRet = B.buildFLdexp(F64, SqrtD2, ScaleDown, Flags).getReg(0);
6040   auto ZeroFP = B.buildFConstant(F64, 0.0);
6049   B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
6051 MI.eraseFromParent();
6082   auto Flags = MI.getFlags();
6094   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
6104   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
6105                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
6110 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
6112 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
6113 MI.eraseFromParent();
6125 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6126 IID == Intrinsic::amdgcn_permlanex16;
6127 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6128 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6132     auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
6134 case Intrinsic::amdgcn_readfirstlane:
6135 case Intrinsic::amdgcn_permlane64:
6136 return LaneOp.getReg(0);
6137 case Intrinsic::amdgcn_readlane:
6138 case Intrinsic::amdgcn_set_inactive:
6139 case Intrinsic::amdgcn_set_inactive_chain_arg:
6140 return LaneOp.addUse(Src1).getReg(0);
6141 case Intrinsic::amdgcn_writelane:
6142 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
6143 case Intrinsic::amdgcn_permlane16:
6144 case Intrinsic::amdgcn_permlanex16: {
6146       int64_t Src4 = MI.getOperand(6).getImm();
6147       int64_t Src5 = MI.getOperand(7).getImm();
6148 return LaneOp.addUse(Src1)
6155 case Intrinsic::amdgcn_mov_dpp8:
6156       return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
6157 case Intrinsic::amdgcn_update_dpp:
6158       return LaneOp.addUse(Src1)
6159           .addImm(MI.getOperand(4).getImm())
6160           .addImm(MI.getOperand(5).getImm())
6161           .addImm(MI.getOperand(6).getImm())
6162           .addImm(MI.getOperand(7).getImm())
6172 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6173 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6174     Src1 = MI.getOperand(3).getReg();
6175     if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
6176       Src2 = MI.getOperand(4).getReg();
6181 unsigned Size = Ty.getSizeInBits();
6183 unsigned SplitSize = 32;
6184   if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
6185 ST.hasDPALU_DPP() &&
6189   if (Size == SplitSize) {
6195       Src0 = B.buildAnyExt(S32, Src0).getReg(0);
6197       if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6200       if (IID == Intrinsic::amdgcn_writelane)
6203     Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
6204     B.buildTrunc(DstReg, LaneOpDst);
6205     MI.eraseFromParent();
6209   if (Size % SplitSize != 0)
6213   bool NeedsBitcast = false;
6214   if (Ty.isVector()) {
6217     if (EltSize == SplitSize) {
6218       PartialResTy = EltTy;
6219     } else if (EltSize == 16 || EltSize == 32) {
6220       unsigned NElem = SplitSize / EltSize;
6224       NeedsBitcast = true;
6229   unsigned NumParts = Size / SplitSize;
6233   if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6234     Src1Parts = B.buildUnmerge(PartialResTy, Src1);
6236   if (IID == Intrinsic::amdgcn_writelane)
6237     Src2Parts = B.buildUnmerge(PartialResTy, Src2);
6239   for (unsigned i = 0; i < NumParts; ++i) {
6240     Src0 = Src0Parts.getReg(i);
6242     if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6243       Src1 = Src1Parts.getReg(i);
6245     if (IID == Intrinsic::amdgcn_writelane)
6246       Src2 = Src2Parts.getReg(i);
6248     PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
6252     B.buildBitcast(DstReg, B.buildMergeLikeInstr(
6255 B.buildMergeLikeInstr(DstReg, PartialRes);
6257 MI.eraseFromParent();
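// Note: to summarize the lane-intrinsic legalization above: values of exactly
// SplitSize bits (or sub-32-bit values any-extended to 32) use a single lane
// op, while wider values are unmerged into SplitSize-bit pieces (64-bit pieces
// for DPALU-capable update_dpp), one lane op is emitted per piece, and the
// pieces are merged back, with a bitcast when the piece type differs from the
// original vector element type.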
6265 ST.getTargetLowering()->getImplicitParameterOffset(
6275 B.buildObjectPtrOffset(DstReg, KernargPtrReg,
6276                          B.buildConstant(IdxTy, Offset).getReg(0));
6287   Register Pointer = MI.getOperand(2).getReg();
6289   Register NumRecords = MI.getOperand(4).getReg();
6295   B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6297   auto ExtStride = B.buildAnyExt(S32, Stride);
6299   if (ST.has45BitNumRecordsBufferResource()) {
6304     auto PointerInt = B.buildPtrToInt(PtrIntTy, Pointer);
6305     auto ExtPointer = B.buildAnyExtOrTrunc(S64, PointerInt);
6306     auto NumRecordsLHS = B.buildShl(S64, NumRecords, B.buildConstant(S32, 57));
6307     Register LowHalf = B.buildOr(S64, ExtPointer, NumRecordsLHS).getReg(0);
6311     auto NumRecordsRHS = B.buildLShr(S64, NumRecords, B.buildConstant(S32, 7));
6312     auto ShiftedStride = B.buildShl(S32, ExtStride, B.buildConstant(S32, 12));
6313     auto ExtShiftedStride =
6314         B.buildMergeValues(S64, {Zero, ShiftedStride.getReg(0)});
6315     auto ShiftedFlags = B.buildShl(S32, Flags, B.buildConstant(S32, 28));
6316     auto ExtShiftedFlags =
6317         B.buildMergeValues(S64, {Zero, ShiftedFlags.getReg(0)});
6318     auto CombinedFields = B.buildOr(S64, NumRecordsRHS, ExtShiftedStride);
6320         B.buildOr(S64, CombinedFields, ExtShiftedFlags).getReg(0);
6321     B.buildMergeValues(Result, {LowHalf, HighHalf});
6323     NumRecords = B.buildTrunc(S32, NumRecords).getReg(0);
6324     auto Unmerge = B.buildUnmerge(S32, Pointer);
6325     auto LowHalf = Unmerge.getReg(0);
6326     auto HighHalf = Unmerge.getReg(1);
6328     auto AndMask = B.buildConstant(S32, 0x0000ffff);
6329     auto Masked = B.buildAnd(S32, HighHalf, AndMask);
6330     auto ShiftConst = B.buildConstant(S32, 16);
6331     auto ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
6332     auto NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
6333     Register NewHighHalfReg = NewHighHalf.getReg(0);
6334     B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
6337 MI.eraseFromParent();
6354 MI.eraseFromParent();
6362 std::optional<uint32_t> KnownSize =
6364 if (KnownSize.has_value())
6365 B.buildConstant(DstReg, *KnownSize);
6383 MI.eraseFromParent();
6390   unsigned AddrSpace) const {
6392   auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg());
6396 ST.hasGloballyAddressableScratch()) {
6398     B.buildInstr(AMDGPU::S_MOV_B32, {S32},
6399                  {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
6401     MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
6403     Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0);
6405         B.buildConstant(S32, 1u << 26));
6410 MI.eraseFromParent();
6420 std::pair<Register, unsigned>
6432   bool CheckNUW = ST.hasGFX1250Insts();
6434       MRI, OrigOffset, nullptr, CheckNUW);
6438     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
6448   unsigned Overflow = ImmOffset & ~MaxImm;
6449   ImmOffset -= Overflow;
6450   if ((int32_t)Overflow < 0) {
6451     Overflow += ImmOffset;
6455   if (Overflow != 0) {
6457       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
6459       auto OverflowVal = B.buildConstant(S32, Overflow);
6460       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
6465     BaseReg = B.buildConstant(S32, 0).getReg(0);
6467 return std::pair(BaseReg, ImmOffset);
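// Note: splitBufferOffsets above appears to separate a buffer offset into a
// register base plus an immediate that fits the buffer-instruction encoding;
// any overflow beyond the maximum immediate is folded back into the base
// register (materializing a constant or adding to the existing base), so the
// returned ImmOffset never exceeds MaxImm.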
6474   bool ImageStore) const {
6480   if (ST.hasUnpackedD16VMem()) {
6481     auto Unmerge = B.buildUnmerge(S16, Reg);
6484     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6485       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
6493   if (ImageStore && ST.hasImageStoreD16Bug()) {
6496       Reg = B.buildBitcast(S32, Reg).getReg(0);
6498       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
6505       auto Unmerge = B.buildUnmerge(S16, Reg);
6506       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6508       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
6516       auto Unmerge = B.buildUnmerge(S32, Reg);
6517       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6519       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
6536   bool IsFormat) const {
6548     VData = B.buildBitcast(Ty, VData).getReg(0);
6556   if (Ty.isVector()) {
6557     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
6569   bool IsFormat) const {
6576   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6591   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6594   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6598     VIndex = MI.getOperand(3).getReg();
6601     VIndex = B.buildConstant(S32, 0).getReg(0);
6604   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6605   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6609     Format = MI.getOperand(5 + OpOffset).getImm();
6613   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6619     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
6620                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
6621   } else if (IsFormat) {
6622     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
6623                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
6627       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
6630       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
6633       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
6638   auto MIB = B.buildInstr(Opc)
6649   MIB.addImm(AuxiliaryData)
6650       .addImm(HasVIndex ? -1 : 0)
6651       .addMemOperand(MMO);
6653 MI.eraseFromParent();
6659   unsigned ImmOffset, unsigned Format,
6662   auto MIB = B.buildInstr(Opc)
6673 MIB.addImm(AuxiliaryData)
6674 .addImm(HasVIndex ? -1 : 0)
6675 .addMemOperand(MMO);
6681   bool IsTyped) const {
6695   assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
6696   bool IsTFE = MI.getNumExplicitDefs() == 2;
6698     StatusDst = MI.getOperand(1).getReg();
6703   Register RSrc = MI.getOperand(2 + OpOffset).getReg();
6706   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6709   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
6712     VIndex = MI.getOperand(3 + OpOffset).getReg();
6715     VIndex = B.buildConstant(S32, 0).getReg(0);
6718   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6719   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6723     Format = MI.getOperand(5 + OpOffset).getImm();
6727   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6737     Dst = MI.getOperand(0).getReg();
6738     B.setInsertPt(B.getMBB(), MI);
6745     Dst = MI.getOperand(0).getReg();
6746     B.setInsertPt(B.getMBB(), MI);
6750   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6751   const bool Unpacked = ST.hasUnpackedD16VMem();
6761     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6762                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6763   } else if (IsFormat) {
6767       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6769       Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6770                   : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6775       Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6776                   : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6779       Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6780                   : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6783       Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6784                   : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6790     unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
6791     unsigned NumLoadDWords = NumValueDWords + 1;
6793     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
6795                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6797       Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
6798       B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6799       B.buildTrunc(Dst, ExtDst);
6800     } else if (NumValueDWords == 1) {
6801       B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6804       for (unsigned I = 0; I != NumValueDWords; ++I)
6805         LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
6807       B.buildUnmerge(LoadElts, LoadDstReg);
6809       B.buildMergeLikeInstr(Dst, LoadElts);
6812              (IsD16 && !Ty.isVector())) {
6813     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
6815                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6816     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6817     B.buildTrunc(Dst, LoadDstReg);
6818   } else if (Unpacked && IsD16 && Ty.isVector()) {
6820     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6822                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6823     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6825     auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
6827     for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6828       Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
6829     B.buildMergeLikeInstr(Dst, Repack);
6832                     AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6835 MI.eraseFromParent();
6841 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6842 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6843 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6844 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6845 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6846 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6847 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6848 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6849 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6850 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6851 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6852 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6853 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6854 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6855 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6856 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6857 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6858 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6859 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6860 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6861 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6862 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6863 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6864 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6865 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6866 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6867 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6868 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6869 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6870 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6871 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6872 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6873 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6874 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6875 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6876 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6877 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6878 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6879 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6880 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6881 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6882 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6883 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6884 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6885 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6886 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6887 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6888 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6889 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6890 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6891 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6892 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6893 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6894 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6895 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6896 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6897 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6898 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6899 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6900 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6901 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6902 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6903 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6904 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6905 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6906 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6907 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6908 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6909 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6910 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6911 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6912 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6913 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6914 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6915 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6916 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6917 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6918 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6919 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6920 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6921 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
6922 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
6923 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
6924 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
6925 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
6926 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6927 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
6928 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6929 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
6930 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6939 const bool IsCmpSwap =
6940 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6941 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6942 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6943 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6954     CmpVal = MI.getOperand(3).getReg();
6959   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
6960   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6963   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6966     VIndex = MI.getOperand(4 + OpOffset).getReg();
6969     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
6972   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
6973   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
6974   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
6993 .addImm(AuxiliaryData)
6994 .addImm(HasVIndex ? -1 : 0)
6995 .addMemOperand(MMO);
6997 MI.eraseFromParent();
7007   bool IsA16, bool IsG16) {
7023       (B.getMRI()->getType(AddrReg) == S16)) {
7028         B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7032            "Bias needs to be converted to 16 bit in A16 mode");
7034       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
7040       if (((I + 1) >= EndIdx) ||
7047           !MI.getOperand(ArgOffset + I + 1).isReg()) {
7049             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7054                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
7065   int DimIdx, int NumVAddrs) {
7069   for (int I = 0; I != NumVAddrs; ++I) {
7071     if (SrcOp.isReg()) {
7077   int NumAddrRegs = AddrRegs.size();
7078   if (NumAddrRegs != 1) {
7081   MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
7084   for (int I = 1; I != NumVAddrs; ++I) {
7087     MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
7109   const unsigned NumDefs = MI.getNumExplicitDefs();
7110   const unsigned ArgOffset = NumDefs + 1;
7111   bool IsTFE = NumDefs == 2;
7129     VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
7133   const bool IsAtomicPacked16Bit =
7134       (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7135        BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7143       ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
7144   const bool IsA16 = AddrTy == S16;
7145   const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
7148   if (!BaseOpcode->Atomic) {
7149     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
7152     } else if (DMask != 0) {
7154     } else if (!IsTFE && !BaseOpcode->Store) {
7156       B.buildUndef(MI.getOperand(0));
7157       MI.eraseFromParent();
7165   const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
7166                                      : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
7167   const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
7168                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
7169   unsigned NewOpcode = LoadOpcode;
7170   if (BaseOpcode->Store)
7171     NewOpcode = StoreOpcode;
7173     NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
7176   MI.setDesc(B.getTII().get(NewOpcode));
7180   if (IsTFE && DMask == 0) {
7183     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
7186   if (BaseOpcode->Atomic) {
7191     if (Ty.isVector() && !IsAtomicPacked16Bit)
7198       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
7199       MI.getOperand(2).setReg(Concat.getReg(0));
7200       MI.getOperand(3).setReg(AMDGPU::NoRegister);
7204   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
7207   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
7213   if (IsA16 && !ST.hasA16()) {
7218   const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
7219   const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
7221   if (IsA16 || IsG16) {
7229     const bool UseNSA = ST.hasNSAEncoding() &&
7230                         PackedRegs.size() >= ST.getNSAThreshold(MF) &&
7231                         (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
7232     const bool UsePartialNSA =
7233         UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
7235     if (UsePartialNSA) {
7239       auto Concat = B.buildConcatVectors(
7240           PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
7241       PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
7242       PackedRegs.resize(NSAMaxSize);
7243     } else if (!UseNSA && PackedRegs.size() > 1) {
7245       auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
7246       PackedRegs[0] = Concat.getReg(0);
7250     const unsigned NumPacked = PackedRegs.size();
7253       if (!SrcOp.isReg()) {
7263         SrcOp.setReg(AMDGPU::NoRegister);
7280     const bool UseNSA = ST.hasNSAEncoding() &&
7281                         CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
7282                         (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
7283     const bool UsePartialNSA =
7284         UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
7286     if (UsePartialNSA) {
7288           ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
7290     } else if (!UseNSA && Intr->NumVAddrs > 1) {
7305     if (!Ty.isVector() || !IsD16)
7309     if (RepackedReg != VData) {
7310       MI.getOperand(1).setReg(RepackedReg);
7318   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
7321   if (NumElts < DMaskLanes)
7324   if (NumElts > 4 || DMaskLanes > 4)
7334   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
7335   const LLT AdjustedTy =
7351   if (IsD16 && ST.hasUnpackedD16VMem()) {
7358   unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
7359   unsigned RoundedSize = 32 * RoundedElts;
7363     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
7368   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
7374   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
7378   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
7379   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
7383   MI.getOperand(0).setReg(NewResultReg);
7391     Dst1Reg = MI.getOperand(1).getReg();
7396     MI.removeOperand(1);
7400     B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
7409   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
7411   if (ResultNumRegs == 1) {
7413     ResultRegs[0] = NewResultReg;
7416     for (int I = 0; I != NumDataRegs; ++I)
7418     B.buildUnmerge(ResultRegs, NewResultReg);
7423     ResultRegs.resize(NumDataRegs);
7428   if (IsD16 && !Ty.isVector()) {
7429     B.buildTrunc(DstReg, ResultRegs[0]);
7434   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
7435     B.buildBitcast(DstReg, ResultRegs[0]);
7447     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
7449       Reg = B.buildBitcast(V2S16, Reg).getReg(0);
7450     } else if (ST.hasUnpackedD16VMem()) {
7452       Reg = B.buildTrunc(S16, Reg).getReg(0);
7456   auto padWithUndef = [&](LLT Ty, int NumElts) {
7460     for (int I = 0; I != NumElts; ++I)
7467     padWithUndef(ResTy, NumElts - ResultRegs.size());
7468     B.buildBuildVector(DstReg, ResultRegs);
7472   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
7473   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
7479   if (ResultRegs.size() == 1) {
7480     NewResultReg = ResultRegs[0];
7481   } else if (ResultRegs.size() == 2) {
7483     NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
7491     B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
7493     B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
7498   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
7499 B.buildConcatVectors(DstReg, ResultRegs);
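// Note: the repacking code above converts the raw MIMG result registers back
// to the originally requested type: padding with undef, truncating to s16 on
// unpacked-D16 subtargets, or bitcasting through v2s16 / v4s16 as needed, so
// the D16 and TFE variants all end up producing the expected result type.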
7508   Register OrigDst = MI.getOperand(0).getReg();
7510   LLT Ty = B.getMRI()->getType(OrigDst);
7511   unsigned Size = Ty.getSizeInBits();
7514   if (Size < 32 && ST.hasScalarSubwordLoads()) {
7516     Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
7517                     : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
7520     Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
7522     Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
7531   B.setInsertPt(B.getMBB(), MI);
7536   B.setInsertPt(B.getMBB(), MI);
7542   MI.setDesc(B.getTII().get(Opc));
7543   MI.removeOperand(1);
7546   const unsigned MemSize = (Size + 7) / 8;
7547   const Align MemAlign = B.getDataLayout().getABITypeAlign(
7554   MI.addMemOperand(MF, MMO);
7555   if (Dst != OrigDst) {
7556     MI.getOperand(0).setReg(Dst);
7557     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
7558     B.buildTrunc(OrigDst, Dst);
7580   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7581 MI.removeOperand(0);
7591 if (!ST.hasTrapHandler() ||
7595 return ST.supportsGetDoorbellID() ?
7608 MI.eraseFromParent();
7618   BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7620   BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
7624 MI.eraseFromParent();
7633 Register SGPR01(AMDGPU::SGPR0_SGPR1);
7640       ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
7660   B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
7663   Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
7664 B.buildCopy(SGPR01, Temp);
7665 B.buildInstr(AMDGPU::S_TRAP)
7668 MI.eraseFromParent();
7679 B.buildCopy(SGPR01, LiveIn);
7680 B.buildInstr(AMDGPU::S_TRAP)
7684 MI.eraseFromParent();
7693 if (ST.hasPrivEnabledTrap2NopBug()) {
7694     ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
7696     MI.eraseFromParent();
7700 B.buildInstr(AMDGPU::S_TRAP)
7702 MI.eraseFromParent();
7711 if (!ST.hasTrapHandler() ||
7715         Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning));
7718 B.buildInstr(AMDGPU::S_TRAP)
7722 MI.eraseFromParent();
7735   Register NodePtr = MI.getOperand(2).getReg();
7736   Register RayExtent = MI.getOperand(3).getReg();
7737   Register RayOrigin = MI.getOperand(4).getReg();
7739   Register RayInvDir = MI.getOperand(6).getReg();
7742   if (!ST.hasGFX10_AEncoding()) {
7745         Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7754 const unsigned NumVDataDwords = 4;
7755 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7756 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7758 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7760 const unsigned BaseOpcodes[2][2] = {
7761 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7762 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7763 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7767 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7768 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7769 : AMDGPU::MIMGEncGfx10NSA,
7770 NumVDataDwords, NumVAddrDwords);
7774 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7775 : AMDGPU::MIMGEncGfx10Default,
7776 NumVDataDwords, NumVAddrDwords);
7781 if (UseNSA && IsGFX11Plus) {
7783       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7784       auto Merged = B.buildMergeLikeInstr(
7785           V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7786       Ops.push_back(Merged.getReg(0));
7789     Ops.push_back(NodePtr);
7790     Ops.push_back(RayExtent);
7791     packLanes(RayOrigin);
7794       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7795       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7796       auto MergedDir = B.buildMergeLikeInstr(
7799               S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
7800                                                  UnmergeRayDir.getReg(0)}))
7803               S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
7804                                                  UnmergeRayDir.getReg(1)}))
7807               S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
7808                                                  UnmergeRayDir.getReg(2)}))
7810       Ops.push_back(MergedDir.getReg(0));
7813       packLanes(RayInvDir);
7817       auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
7818       Ops.push_back(Unmerge.getReg(0));
7819       Ops.push_back(Unmerge.getReg(1));
7821       Ops.push_back(NodePtr);
7823     Ops.push_back(RayExtent);
7826       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7827       Ops.push_back(Unmerge.getReg(0));
7828       Ops.push_back(Unmerge.getReg(1));
7829       Ops.push_back(Unmerge.getReg(2));
7832     packLanes(RayOrigin);
7834       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7835       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7839       B.buildMergeLikeInstr(R1,
7840                             {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7841       B.buildMergeLikeInstr(
7842           R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7843       B.buildMergeLikeInstr(
7844           R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7850       packLanes(RayInvDir);
7857     Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
7859     Ops.push_back(MergedOps);
7862   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7871 .addImm(IsA16 ? 1 : 0)
7874 MI.eraseFromParent();
7884   Register DstOrigin = MI.getOperand(1).getReg();
7886   Register NodePtr = MI.getOperand(4).getReg();
7887   Register RayExtent = MI.getOperand(5).getReg();
7888   Register InstanceMask = MI.getOperand(6).getReg();
7889   Register RayOrigin = MI.getOperand(7).getReg();
7891   Register Offsets = MI.getOperand(9).getReg();
7892   Register TDescr = MI.getOperand(10).getReg();
7894 if (!ST.hasBVHDualAndBVH8Insts()) {
7897         Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7902 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7903 const unsigned NumVDataDwords = 10;
7904 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7906 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7907 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7908 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
7911   auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
7912       V2S32, {RayExtent, B.buildAnyExt(S32, InstanceMask)});
7914 B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7915 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7921 .addUse(RayExtentInstanceMaskVec.getReg(0))
7928 MI.eraseFromParent();
7937 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7938 MI.eraseFromParent();
7945 if (!ST.hasArchitectedSGPRs())
7949   auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
7950   auto LSB = B.buildConstant(S32, 25);
7951   auto Width = B.buildConstant(S32, 5);
7952 B.buildUbfx(DstReg, TTMP8, LSB, Width);
7953 MI.eraseFromParent();
7961   unsigned Width) const {
7965   MRI.setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
7966 B.buildInstr(AMDGPU::S_GETREG_B32_const)
7969 MI.eraseFromParent();
7987   B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7991   B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7994 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
7995 MI.eraseFromParent();
8006   auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
8010 .addReg(Unmerge.getReg(0));
8014 .addReg(Unmerge.getReg(1));
8015 MI.eraseFromParent();
8027 case Intrinsic::sponentry:
8033 B.buildInstr(AMDGPU::G_AMDGPU_SPONENTRY).addDef(TmpReg);
8036 B.buildIntToPtr(DstReg, TmpReg);
8037 MI.eraseFromParent();
8039     int FI = B.getMF().getFrameInfo().CreateFixedObject(
8041     B.buildFrameIndex(MI.getOperand(0), FI);
8042 MI.eraseFromParent();
8045 case Intrinsic::amdgcn_if:
8046 case Intrinsic::amdgcn_else: {
8049     bool Negated = false;
8061 std::swap(CondBrTarget, UncondBrTarget);
8063     B.setInsertPt(B.getMBB(), BrCond->getIterator());
8064 if (IntrID == Intrinsic::amdgcn_if) {
8065 B.buildInstr(AMDGPU::SI_IF)
8068 .addMBB(UncondBrTarget);
8070 B.buildInstr(AMDGPU::SI_ELSE)
8073 .addMBB(UncondBrTarget);
8082 B.buildBr(*CondBrTarget);
8087 MI.eraseFromParent();
8088 BrCond->eraseFromParent();
8094 case Intrinsic::amdgcn_loop: {
8097     bool Negated = false;
8107 std::swap(CondBrTarget, UncondBrTarget);
8109     B.setInsertPt(B.getMBB(), BrCond->getIterator());
8110 B.buildInstr(AMDGPU::SI_LOOP)
8112 .addMBB(UncondBrTarget);
8117 B.buildBr(*CondBrTarget);
8119 MI.eraseFromParent();
8120 BrCond->eraseFromParent();
8127 case Intrinsic::amdgcn_addrspacecast_nonnull:
8129 case Intrinsic::amdgcn_make_buffer_rsrc:
8131 case Intrinsic::amdgcn_kernarg_segment_ptr:
8134       B.buildConstant(MI.getOperand(0).getReg(), 0);
8135 MI.eraseFromParent();
8141 case Intrinsic::amdgcn_implicitarg_ptr:
8143 case Intrinsic::amdgcn_workitem_id_x:
8146 case Intrinsic::amdgcn_workitem_id_y:
8149 case Intrinsic::amdgcn_workitem_id_z:
8152 case Intrinsic::amdgcn_workgroup_id_x:
8157 case Intrinsic::amdgcn_workgroup_id_y:
8162 case Intrinsic::amdgcn_workgroup_id_z:
8167 case Intrinsic::amdgcn_cluster_id_x:
8168 return ST.hasClusters() &&
8171 case Intrinsic::amdgcn_cluster_id_y:
8172 return ST.hasClusters() &&
8175 case Intrinsic::amdgcn_cluster_id_z:
8176 return ST.hasClusters() &&
8179 case Intrinsic::amdgcn_cluster_workgroup_id_x:
8180 return ST.hasClusters() &&
8183 case Intrinsic::amdgcn_cluster_workgroup_id_y:
8184 return ST.hasClusters() &&
8187 case Intrinsic::amdgcn_cluster_workgroup_id_z:
8188 return ST.hasClusters() &&
8191 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
8192 return ST.hasClusters() &&
8194 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
8195 return ST.hasClusters() &&
8198 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
8199 return ST.hasClusters() &&
8202 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
8203 return ST.hasClusters() &&
8206 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
8207 return ST.hasClusters() &&
8211 case Intrinsic::amdgcn_wave_id:
8213 case Intrinsic::amdgcn_lds_kernel_id:
8216 case Intrinsic::amdgcn_dispatch_ptr:
8219 case Intrinsic::amdgcn_queue_ptr:
8222 case Intrinsic::amdgcn_implicit_buffer_ptr:
8225 case Intrinsic::amdgcn_dispatch_id:
8228 case Intrinsic::r600_read_ngroups_x:
8232 case Intrinsic::r600_read_ngroups_y:
8235 case Intrinsic::r600_read_ngroups_z:
8238 case Intrinsic::r600_read_local_size_x:
8241 case Intrinsic::r600_read_local_size_y:
8245 case Intrinsic::r600_read_local_size_z:
8248 case Intrinsic::amdgcn_fdiv_fast:
8250 case Intrinsic::amdgcn_is_shared:
8252 case Intrinsic::amdgcn_is_private:
8254 case Intrinsic::amdgcn_wavefrontsize: {
8255     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
8256 MI.eraseFromParent();
8259 case Intrinsic::amdgcn_s_buffer_load:
8261 case Intrinsic::amdgcn_raw_buffer_store:
8262 case Intrinsic::amdgcn_raw_ptr_buffer_store:
8263 case Intrinsic::amdgcn_struct_buffer_store:
8264 case Intrinsic::amdgcn_struct_ptr_buffer_store:
8266 case Intrinsic::amdgcn_raw_buffer_store_format:
8267 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
8268 case Intrinsic::amdgcn_struct_buffer_store_format:
8269 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
8271 case Intrinsic::amdgcn_raw_tbuffer_store:
8272 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
8273 case Intrinsic::amdgcn_struct_tbuffer_store:
8274 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
8276 case Intrinsic::amdgcn_raw_buffer_load:
8277 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8278 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8279 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8280 case Intrinsic::amdgcn_struct_buffer_load:
8281 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8282 case Intrinsic::amdgcn_struct_atomic_buffer_load:
8283 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
8285 case Intrinsic::amdgcn_raw_buffer_load_format:
8286 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
8287 case Intrinsic::amdgcn_struct_buffer_load_format:
8288 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
8290 case Intrinsic::amdgcn_raw_tbuffer_load:
8291 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
8292 case Intrinsic::amdgcn_struct_tbuffer_load:
8293 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
8295 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8296 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8297 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8298 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8299 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8300 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8301 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8302 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8303 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8304 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8305 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8306 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8307 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8308 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8309 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8310 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8311 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8312 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8313 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8314 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8315 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8316 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8317 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8318 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8319 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8320 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8321 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8322 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8323 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8324 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8325 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8326 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8327 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8328 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8329 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8330 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8331 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8332 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8333 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8334 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8335 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8336 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8337 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8338 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8339 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8340 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8341 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8342 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8343 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
8344 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
8345 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
8346 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
8347 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8348 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8349 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8350 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8351 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8352 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8353 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8354 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8355 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
8356 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
8357 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
8358 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
8359 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8360 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
8361 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8362 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
8363 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8364 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8365 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8366 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8368 case Intrinsic::amdgcn_rsq_clamp:
8370 case Intrinsic::amdgcn_image_bvh_intersect_ray:
8372 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
8373 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
8375 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
8376 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
8377 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
8378 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
8379 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
8380 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
8381 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
8382 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
8386     if (IndexArgTy != S64) {
8387       auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(S64, Index)
8388                                             : B.buildAnyExt(S64, Index);
8389 MI.getOperand(5).setReg(NewIndex.getReg(0));
8393 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8394 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8395 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8396 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8397 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8398 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8399 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8400 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8404     MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
  case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
  case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
    Register Index = MI.getOperand(7).getReg();
    LLT IdxTy =
        IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8 ? S64 : S32;
    LLT IndexArgTy = MRI.getType(Index);
    if (IndexArgTy != IdxTy) {
      auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(IdxTy, Index)
                                            : B.buildAnyExt(IdxTy, Index);
      MI.getOperand(7).setReg(NewIndex.getReg(0));
    }
    return true;
  }
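  // fmed3 is rewritten into the target generic opcode G_AMDGPU_FMED3 (with the
  // intrinsic-ID operand dropped) so that later combines and instruction
  // selection can match it like any other generic instruction.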
  case Intrinsic::amdgcn_fmed3: {
    GISelChangeObserver &Observer = Helper.Observer;
    Observer.changingInstr(MI);
    MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
    MI.removeOperand(1);
    Observer.changedInstr(MI);
    return true;
  }
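  // The lane-access and DPP intrinsics share one lowering path; legalizeLaneOp
  // takes care of value types that the underlying 32-bit lane instructions do
  // not handle natively.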
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlane64:
  case Intrinsic::amdgcn_set_inactive:
  case Intrinsic::amdgcn_set_inactive_chain_arg:
  case Intrinsic::amdgcn_mov_dpp8:
  case Intrinsic::amdgcn_update_dpp:
    return legalizeLaneOp(Helper, MI, IntrID);
  case Intrinsic::amdgcn_s_buffer_prefetch_data:
    return legalizeSBufferPrefetch(Helper, MI);
  case Intrinsic::amdgcn_dead: {
    // The results of a dead intrinsic are never meaningful; replace each
    // definition with an undef value and drop the call.
    for (const MachineOperand &Def : MI.defs())
      B.buildUndef(Def.getReg());
    MI.eraseFromParent();
    return true;
  }
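  // The cooperative atomic load/store intrinsics become ordinary generic
  // loads and stores; the single memory operand the IRTranslator attached is
  // carried over unchanged (hence the asserts below).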
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
    MI.eraseFromParent();
    return true;
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin());
    MI.eraseFromParent();
    return true;
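  // The monitored loads keep their memory operand but are rewritten into the
  // target-specific generic pseudos G_AMDGPU_FLAT_LOAD_MONITOR and
  // G_AMDGPU_GLOBAL_LOAD_MONITOR for instruction selection.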
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildInstr(AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR)
        .add(MI.getOperand(0))
        .add(MI.getOperand(2))
        .addMemOperand(*MI.memoperands_begin());
    MI.eraseFromParent();
    return true;
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildInstr(AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR)
        .add(MI.getOperand(0))
        .add(MI.getOperand(2))
        .addMemOperand(*MI.memoperands_begin());
    MI.eraseFromParent();
    return true;