37#include "llvm/IR/IntrinsicsAMDGPU.h"
38#include "llvm/IR/IntrinsicsR600.h"
40#define DEBUG_TYPE "amdgpu-legalinfo"
50 "amdgpu-global-isel-new-legality",
51 cl::desc(
"Use GlobalISel desired legality, rather than try to use"
52 "rules compatible with selection patterns"),
67 unsigned Bits = Ty.getSizeInBits();
77 const LLT Ty = Query.Types[TypeIdx];
83 return Ty.getNumElements() % 2 != 0 &&
84 EltSize > 1 && EltSize < 32 &&
85 Ty.getSizeInBits() % 32 != 0;
91 const LLT Ty = Query.Types[TypeIdx];
98 const LLT Ty = Query.Types[TypeIdx];
100 return EltTy.
getSizeInBits() == 16 && Ty.getNumElements() > 2;
106 const LLT Ty = Query.Types[TypeIdx];
108 return std::pair(TypeIdx,
115 const LLT Ty = Query.Types[TypeIdx];
117 unsigned Size = Ty.getSizeInBits();
118 unsigned Pieces = (
Size + 63) / 64;
119 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
129 const LLT Ty = Query.Types[TypeIdx];
132 const int Size = Ty.getSizeInBits();
134 const int NextMul32 = (
Size + 31) / 32;
138 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
146 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
147 return std::make_pair(TypeIdx,
LLT::scalar(MemSize));
154 const LLT Ty = Query.Types[TypeIdx];
156 const unsigned EltSize = Ty.getElementType().getSizeInBits();
159 assert(EltSize == 32 || EltSize == 64);
164 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
168 return std::pair(TypeIdx,
183 const unsigned NumElems = Ty.getElementCount().getFixedValue();
188 const unsigned Size = Ty.getSizeInBits();
201 const LLT Ty = Query.Types[TypeIdx];
208 const LLT Ty = Query.Types[TypeIdx];
209 unsigned Size = Ty.getSizeInBits();
218 const LLT QueryTy = Query.Types[TypeIdx];
225 const LLT QueryTy = Query.Types[TypeIdx];
232 const LLT QueryTy = Query.Types[TypeIdx];
238 return ((ST.useRealTrue16Insts() &&
Size == 16) ||
Size % 32 == 0) &&
244 return EltSize == 16 || EltSize % 32 == 0;
248 const int EltSize = Ty.getElementType().getSizeInBits();
249 return EltSize == 32 || EltSize == 64 ||
250 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
251 EltSize == 128 || EltSize == 256;
280 LLT Ty = Query.Types[TypeIdx];
288 const LLT QueryTy = Query.Types[TypeIdx];
373 if (Ty.isPointerOrPointerVector())
374 Ty = Ty.changeElementType(
LLT::scalar(Ty.getScalarSizeInBits()));
378 (ST.useRealTrue16Insts() && Ty ==
S16) ||
393 const LLT Ty = Query.Types[TypeIdx];
394 return !Ty.
isVector() && Ty.getSizeInBits() > 32 &&
395 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
403 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
413 bool IsLoad,
bool IsAtomic) {
417 return ST.hasFlatScratchEnabled() ? 128 : 32;
419 return ST.useDS128() ? 128 : 64;
430 return IsLoad ? 512 : 128;
435 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
444 const bool IsLoad = Query.
Opcode != AMDGPU::G_STORE;
446 unsigned RegSize = Ty.getSizeInBits();
449 unsigned AS = Query.
Types[1].getAddressSpace();
456 if (Ty.isVector() && MemSize !=
RegSize)
463 if (IsLoad && MemSize <
Size)
464 MemSize = std::max(MemSize,
Align);
484 if (!ST.hasDwordx3LoadStores())
497 if (AlignBits < MemSize) {
500 Align(AlignBits / 8)))
530 const unsigned Size = Ty.getSizeInBits();
531 if (Ty.isPointerVector())
541 unsigned EltSize = Ty.getScalarSizeInBits();
542 return EltSize != 32 && EltSize != 64;
556 const unsigned Size = Ty.getSizeInBits();
557 if (
Size != MemSizeInBits)
558 return Size <= 32 && Ty.isVector();
564 return Ty.isVector() && (!MemTy.
isVector() || MemTy == Ty) &&
573 uint64_t AlignInBits,
unsigned AddrSpace,
583 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
594 if (AlignInBits < RoundedSize)
601 RoundedSize, AddrSpace,
Align(AlignInBits / 8),
613 Query.
Types[1].getAddressSpace(), Opcode);
633 const unsigned NumParts =
PointerTy.getSizeInBits() / 32;
637 std::array<Register, 4> VectorElems;
638 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
639 for (
unsigned I = 0;
I < NumParts; ++
I)
641 B.buildExtractVectorElementConstant(
S32, VectorReg,
I).getReg(0);
642 B.buildMergeValues(MO, VectorElems);
647 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
648 auto Scalar =
B.buildBitcast(ScalarTy, BitcastReg);
649 B.buildIntToPtr(MO, Scalar);
669 const unsigned NumParts =
PointerTy.getSizeInBits() / 32;
670 auto Unmerged =
B.buildUnmerge(
LLT::scalar(32), Pointer);
671 for (
unsigned I = 0;
I < NumParts; ++
I)
673 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
675 Register Scalar =
B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
676 return B.buildBitcast(VectorTy, Scalar).getReg(0);
695 auto GetAddrSpacePtr = [&TM](
unsigned AS) {
708 const LLT BufferStridedPtr =
711 const LLT CodePtr = FlatPtr;
713 const std::initializer_list<LLT> AddrSpaces64 = {
714 GlobalPtr, ConstantPtr, FlatPtr
717 const std::initializer_list<LLT> AddrSpaces32 = {
718 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
721 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
723 const std::initializer_list<LLT> FPTypesBase = {
727 const std::initializer_list<LLT> FPTypes16 = {
731 const std::initializer_list<LLT> FPTypesPK16 = {
735 const LLT MinScalarFPTy = ST.has16BitInsts() ?
S16 :
S32;
756 if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) {
758 if (ST.hasScalarAddSub64()) {
761 .clampMaxNumElementsStrict(0,
S16, 2)
769 .clampMaxNumElementsStrict(0,
S16, 2)
776 if (ST.hasScalarSMulU64()) {
779 .clampMaxNumElementsStrict(0,
S16, 2)
787 .clampMaxNumElementsStrict(0,
S16, 2)
797 .minScalarOrElt(0,
S16)
802 }
else if (ST.has16BitInsts()) {
836 .widenScalarToNextMultipleOf(0, 32)
846 if (ST.hasMad64_32())
851 if (ST.hasIntClamp()) {
874 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
884 if (ST.hasVOP3PInsts()) {
886 .clampMaxNumElements(0,
S8, 2)
907 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
919 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
926 .clampScalar(0,
S16,
S64);
959 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
960 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
967 if (ST.has16BitInsts()) {
968 if (ST.hasVOP3PInsts())
971 FPOpActions.legalFor({
S16});
973 TrigActions.customFor({
S16});
974 FDIVActions.customFor({
S16});
977 if (ST.hasPackedFP32Ops()) {
978 FPOpActions.legalFor({
V2S32});
979 FPOpActions.clampMaxNumElementsStrict(0,
S32, 2);
982 auto &MinNumMaxNumIeee =
985 if (ST.hasVOP3PInsts()) {
986 MinNumMaxNumIeee.legalFor(FPTypesPK16)
988 .clampMaxNumElements(0,
S16, 2)
991 }
else if (ST.has16BitInsts()) {
992 MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0,
S16,
S64).scalarize(0);
994 MinNumMaxNumIeee.legalFor(FPTypesBase)
1000 {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
1002 if (ST.hasVOP3PInsts()) {
1003 MinNumMaxNum.customFor(FPTypesPK16)
1005 .clampMaxNumElements(0,
S16, 2)
1006 .clampScalar(0,
S16,
S64)
1008 }
else if (ST.has16BitInsts()) {
1009 MinNumMaxNum.customFor(FPTypes16)
1010 .clampScalar(0,
S16,
S64)
1013 MinNumMaxNum.customFor(FPTypesBase)
1014 .clampScalar(0,
S32,
S64)
1018 if (ST.hasVOP3PInsts())
1034 .legalFor(FPTypesPK16)
1039 if (ST.has16BitInsts()) {
1073 if (ST.hasFractBug()) {
1107 if (ST.hasCvtPkF16F32Inst()) {
1109 .clampMaxNumElements(0,
S16, 2);
1113 FPTruncActions.scalarize(0).lower();
1121 if (ST.has16BitInsts()) {
1141 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1142 FMad.customFor({
S32,
S16});
1143 else if (ST.hasMadMacF32Insts())
1144 FMad.customFor({
S32});
1145 else if (ST.hasMadF16())
1146 FMad.customFor({
S16});
1151 if (ST.has16BitInsts()) {
1154 FRem.minScalar(0,
S32)
1163 .clampMaxNumElements(0,
S16, 2)
1182 if (ST.has16BitInsts())
1193 if (ST.has16BitInsts())
1207 if (ST.has16BitInsts())
1218 .clampScalar(0,
S16,
S64)
1233 .clampScalar(0,
S16,
S64)
1237 if (ST.has16BitInsts()) {
1239 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1241 .clampScalar(0,
S16,
S64)
1245 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1247 .clampScalar(0,
S32,
S64)
1251 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1254 .clampScalar(0,
S32,
S64)
1266 .scalarSameSizeAs(1, 0)
1282 {
S1}, {
S32,
S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1283 .legalForCartesianProduct(
1284 {
S32}, {
S32,
S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1285 if (ST.has16BitInsts()) {
1286 CmpBuilder.legalFor({{
S1,
S16}});
1297 {
S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1299 if (ST.hasSALUFloatInsts())
1309 if (ST.has16BitInsts())
1310 ExpOps.customFor({{
S32}, {
S16}});
1312 ExpOps.customFor({
S32});
1313 ExpOps.clampScalar(0, MinScalarFPTy,
S32)
1321 .
legalFor(ST.has16BitInsts(), {S16})
1327 .
legalFor(ST.has16BitInsts(), {S16})
1341 .clampScalar(0,
S32,
S32)
1348 if (ST.has16BitInsts())
1351 .widenScalarToNextPow2(1)
1357 .lowerFor({
S1,
S16})
1358 .widenScalarToNextPow2(1)
1385 .clampScalar(0,
S32,
S32)
1401 .clampScalar(0,
S32,
S64)
1405 if (ST.has16BitInsts()) {
1408 .clampMaxNumElementsStrict(0,
S16, 2)
1415 if (ST.hasVOP3PInsts()) {
1418 .clampMaxNumElements(0,
S16, 2)
1423 if (ST.hasIntMinMax64()) {
1426 .clampMaxNumElements(0,
S16, 2)
1434 .clampMaxNumElements(0,
S16, 2)
1443 .widenScalarToNextPow2(0)
1471 .legalForCartesianProduct(AddrSpaces32, {
S32})
1487 .legalForCartesianProduct(AddrSpaces32, {
S32})
1504 const auto needToSplitMemOp = [=](
const LegalityQuery &Query,
1505 bool IsLoad) ->
bool {
1509 unsigned MemSize = Query.
MMODescrs[0].MemoryTy.getSizeInBits();
1523 unsigned NumRegs = (MemSize + 31) / 32;
1525 if (!ST.hasDwordx3LoadStores())
1536 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1537 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1538 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1544 for (
unsigned Op : {G_LOAD, G_STORE}) {
1545 const bool IsStore =
Op == G_STORE;
1550 Actions.legalForTypesWithMemDesc({{
S32, GlobalPtr,
S32, GlobalAlign32},
1553 {
S64, GlobalPtr,
S64, GlobalAlign32},
1556 {
S32, GlobalPtr,
S8, GlobalAlign8},
1557 {
S32, GlobalPtr,
S16, GlobalAlign16},
1559 {
S32, LocalPtr,
S32, 32},
1560 {
S64, LocalPtr,
S64, 32},
1562 {
S32, LocalPtr,
S8, 8},
1563 {
S32, LocalPtr,
S16, 16},
1566 {
S32, PrivatePtr,
S32, 32},
1567 {
S32, PrivatePtr,
S8, 8},
1568 {
S32, PrivatePtr,
S16, 16},
1571 {
S32, ConstantPtr,
S32, GlobalAlign32},
1574 {
S64, ConstantPtr,
S64, GlobalAlign32},
1575 {
V2S32, ConstantPtr,
V2S32, GlobalAlign32}});
1584 Actions.unsupportedIf(
1585 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1599 Actions.customIf(
typeIs(1, Constant32Ptr));
1625 return !Query.
Types[0].isVector() &&
1626 needToSplitMemOp(Query,
Op == G_LOAD);
1628 [=](
const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1633 unsigned MemSize = Query.
MMODescrs[0].MemoryTy.getSizeInBits();
1636 if (DstSize > MemSize)
1642 if (MemSize > MaxSize)
1650 return Query.
Types[0].isVector() &&
1651 needToSplitMemOp(Query,
Op == G_LOAD);
1653 [=](
const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1667 unsigned MemSize = Query.
MMODescrs[0].MemoryTy.getSizeInBits();
1668 if (MemSize > MaxSize) {
1672 if (MaxSize % EltSize == 0) {
1678 unsigned NumPieces = MemSize / MaxSize;
1682 if (NumPieces == 1 || NumPieces >= NumElts ||
1683 NumElts % NumPieces != 0)
1684 return std::pair(0, EltTy);
1692 return std::pair(0, EltTy);
1707 return std::pair(0, EltTy);
1712 .widenScalarToNextPow2(0)
1719 .legalForTypesWithMemDesc({{
S32, GlobalPtr,
S8, 8},
1720 {
S32, GlobalPtr,
S16, 2 * 8},
1721 {
S32, LocalPtr,
S8, 8},
1722 {
S32, LocalPtr,
S16, 16},
1723 {
S32, PrivatePtr,
S8, 8},
1724 {
S32, PrivatePtr,
S16, 16},
1725 {
S32, ConstantPtr,
S8, 8},
1726 {
S32, ConstantPtr,
S16, 2 * 8}})
1732 if (ST.hasFlatAddressSpace()) {
1733 ExtLoads.legalForTypesWithMemDesc(
1734 {{
S32, FlatPtr,
S8, 8}, {
S32, FlatPtr,
S16, 16}});
1749 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1750 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1751 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1752 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1753 .legalFor({{
S32, GlobalPtr}, {
S32, LocalPtr},
1754 {
S64, GlobalPtr}, {
S64, LocalPtr},
1755 {
S32, RegionPtr}, {
S64, RegionPtr}});
1756 if (ST.hasFlatAddressSpace()) {
1757 Atomics.legalFor({{
S32, FlatPtr}, {
S64, FlatPtr}});
1762 .legalFor({{
S32, GlobalPtr}, {
S32, LocalPtr}, {
S32, RegionPtr}});
1763 if (ST.hasFlatAddressSpace()) {
1764 Atomics32.legalFor({{
S32, FlatPtr}});
1769 if (ST.hasLDSFPAtomicAddF32()) {
1770 Atomic.legalFor({{
S32, LocalPtr}, {
S32, RegionPtr}});
1771 if (ST.hasLdsAtomicAddF64())
1772 Atomic.legalFor({{
S64, LocalPtr}});
1773 if (ST.hasAtomicDsPkAdd16Insts())
1774 Atomic.legalFor({{
V2F16, LocalPtr}, {
V2BF16, LocalPtr}});
1776 if (ST.hasAtomicFaddInsts())
1777 Atomic.legalFor({{
S32, GlobalPtr}});
1778 if (ST.hasFlatAtomicFaddF32Inst())
1779 Atomic.legalFor({{
S32, FlatPtr}});
1781 if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {
1792 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1793 ST.hasAtomicBufferGlobalPkAddF16Insts())
1794 Atomic.legalFor({{
V2F16, GlobalPtr}, {
V2F16, BufferFatPtr}});
1795 if (ST.hasAtomicGlobalPkAddBF16Inst())
1796 Atomic.legalFor({{
V2BF16, GlobalPtr}});
1797 if (ST.hasAtomicFlatPkAdd16Insts())
1798 Atomic.legalFor({{
V2F16, FlatPtr}, {
V2BF16, FlatPtr}});
1803 auto &AtomicFMinFMax =
1805 .legalFor({{
F32, LocalPtr}, {
F64, LocalPtr}});
1807 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1809 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1810 AtomicFMinFMax.
legalFor({{
F64, GlobalPtr}, {
F64, BufferFatPtr}});
1811 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1813 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1820 {
S32, FlatPtr}, {
S64, FlatPtr}})
1821 .legalFor({{
S32, LocalPtr}, {
S64, LocalPtr},
1822 {
S32, RegionPtr}, {
S64, RegionPtr}});
1828 LocalPtr, FlatPtr, PrivatePtr,
1832 .clampScalar(0,
S16,
S64)
1847 if (ST.has16BitInsts()) {
1848 if (ST.hasVOP3PInsts()) {
1850 .clampMaxNumElements(0,
S16, 2);
1852 Shifts.legalFor({{
S16,
S16}});
1855 Shifts.widenScalarIf(
1860 const LLT AmountTy = Query.
Types[1];
1865 Shifts.clampScalar(1,
S32,
S32);
1866 Shifts.widenScalarToNextPow2(0, 16);
1867 Shifts.clampScalar(0,
S16,
S64);
1877 Shifts.clampScalar(1,
S32,
S32);
1878 Shifts.widenScalarToNextPow2(0, 32);
1879 Shifts.clampScalar(0,
S32,
S64);
1888 for (
unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1889 unsigned VecTypeIdx =
Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1890 unsigned EltTypeIdx =
Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1891 unsigned IdxTypeIdx = 2;
1895 const LLT EltTy = Query.
Types[EltTypeIdx];
1896 const LLT VecTy = Query.
Types[VecTypeIdx];
1897 const LLT IdxTy = Query.
Types[IdxTypeIdx];
1899 const bool isLegalVecType =
1909 return (EltSize == 32 || EltSize == 64) &&
1925 const LLT EltTy = Query.
Types[EltTypeIdx];
1926 const LLT VecTy = Query.
Types[VecTypeIdx];
1930 const unsigned TargetEltSize =
1931 DstEltSize % 64 == 0 ? 64 : 32;
1932 return std::pair(VecTypeIdx,
1936 .clampScalar(EltTypeIdx,
S32,
S64)
1950 const LLT &EltTy = Query.
Types[1].getElementType();
1951 return Query.
Types[0] != EltTy;
1954 for (
unsigned Op : {G_EXTRACT, G_INSERT}) {
1955 unsigned BigTyIdx =
Op == G_EXTRACT ? 1 : 0;
1956 unsigned LitTyIdx =
Op == G_EXTRACT ? 0 : 1;
1960 const LLT BigTy = Query.
Types[BigTyIdx];
1966 const LLT LitTy = Query.
Types[LitTyIdx];
1979 const LLT BigTy = Query.
Types[BigTyIdx];
1980 const LLT LitTy = Query.
Types[LitTyIdx];
1997 if (ST.hasScalarPackInsts()) {
2000 .minScalarOrElt(0,
S16)
2007 BuildVector.customFor({
V2S16,
S16});
2008 BuildVector.minScalarOrElt(0,
S32);
2027 for (
unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
2028 unsigned BigTyIdx =
Op == G_MERGE_VALUES ? 0 : 1;
2029 unsigned LitTyIdx =
Op == G_MERGE_VALUES ? 1 : 0;
2031 auto notValidElt = [=](
const LegalityQuery &Query,
unsigned TypeIdx) {
2032 const LLT Ty = Query.
Types[TypeIdx];
2033 if (Ty.isVector()) {
2048 const LLT BigTy = Query.
Types[BigTyIdx];
2068 return notValidElt(Query, LitTyIdx);
2073 return notValidElt(Query, BigTyIdx);
2078 if (
Op == G_MERGE_VALUES) {
2079 Builder.widenScalarIf(
2082 const LLT Ty = Query.
Types[LitTyIdx];
2083 return Ty.getSizeInBits() < 32;
2090 const LLT Ty = Query.
Types[BigTyIdx];
2091 return Ty.getSizeInBits() % 16 != 0;
2096 const LLT &Ty = Query.
Types[BigTyIdx];
2097 unsigned NewSizeInBits = 1 <<
Log2_32_Ceil(Ty.getSizeInBits() + 1);
2098 if (NewSizeInBits >= 256) {
2099 unsigned RoundedTo =
alignTo<64>(Ty.getSizeInBits() + 1);
2100 if (RoundedTo < NewSizeInBits)
2101 NewSizeInBits = RoundedTo;
2103 return std::pair(BigTyIdx,
LLT::scalar(NewSizeInBits));
2114 .clampScalar(0,
S32,
S64);
2116 if (ST.hasVOP3PInsts()) {
2117 SextInReg.lowerFor({{
V2S16}})
2121 .clampMaxNumElementsStrict(0,
S16, 2);
2122 }
else if (ST.has16BitInsts()) {
2123 SextInReg.lowerFor({{
S32}, {
S64}, {
S16}});
2127 SextInReg.lowerFor({{
S32}, {
S64}});
2140 FSHRActionDefs.legalFor({{
S32,
S32}})
2141 .clampMaxNumElementsStrict(0,
S16, 2);
2142 if (ST.hasVOP3PInsts())
2144 FSHRActionDefs.scalarize(0).lower();
2146 if (ST.hasVOP3PInsts()) {
2149 .clampMaxNumElementsStrict(0,
S16, 2)
2173 .clampScalar(1,
S32,
S32)
2182 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2183 G_READ_REGISTER, G_WRITE_REGISTER,
2188 if (ST.hasIEEEMinimumMaximumInsts()) {
2190 .legalFor(FPTypesPK16)
2193 }
else if (ST.hasVOP3PInsts()) {
2196 .clampMaxNumElementsStrict(0,
S16, 2)
2212 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2213 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2219 {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
2220 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
2221 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
2222 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
2228 verify(*ST.getInstrInfo());
2237 switch (
MI.getOpcode()) {
2238 case TargetOpcode::G_ADDRSPACE_CAST:
2240 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2242 case TargetOpcode::G_FCEIL:
2244 case TargetOpcode::G_FREM:
2246 case TargetOpcode::G_INTRINSIC_TRUNC:
2248 case TargetOpcode::G_SITOFP:
2250 case TargetOpcode::G_UITOFP:
2252 case TargetOpcode::G_FPTOSI:
2254 case TargetOpcode::G_FPTOUI:
2256 case TargetOpcode::G_FMINNUM:
2257 case TargetOpcode::G_FMAXNUM:
2258 case TargetOpcode::G_FMINIMUMNUM:
2259 case TargetOpcode::G_FMAXIMUMNUM:
2261 case TargetOpcode::G_EXTRACT:
2263 case TargetOpcode::G_INSERT:
2265 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2267 case TargetOpcode::G_INSERT_VECTOR_ELT:
2269 case TargetOpcode::G_FSIN:
2270 case TargetOpcode::G_FCOS:
2272 case TargetOpcode::G_GLOBAL_VALUE:
2274 case TargetOpcode::G_LOAD:
2275 case TargetOpcode::G_SEXTLOAD:
2276 case TargetOpcode::G_ZEXTLOAD:
2278 case TargetOpcode::G_STORE:
2280 case TargetOpcode::G_FMAD:
2282 case TargetOpcode::G_FDIV:
2284 case TargetOpcode::G_FFREXP:
2286 case TargetOpcode::G_FSQRT:
2288 case TargetOpcode::G_UDIV:
2289 case TargetOpcode::G_UREM:
2290 case TargetOpcode::G_UDIVREM:
2292 case TargetOpcode::G_SDIV:
2293 case TargetOpcode::G_SREM:
2294 case TargetOpcode::G_SDIVREM:
2296 case TargetOpcode::G_ATOMIC_CMPXCHG:
2298 case TargetOpcode::G_FLOG2:
2300 case TargetOpcode::G_FLOG:
2301 case TargetOpcode::G_FLOG10:
2303 case TargetOpcode::G_FEXP2:
2305 case TargetOpcode::G_FEXP:
2306 case TargetOpcode::G_FEXP10:
2308 case TargetOpcode::G_FPOW:
2310 case TargetOpcode::G_FFLOOR:
2312 case TargetOpcode::G_BUILD_VECTOR:
2313 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2315 case TargetOpcode::G_MUL:
2317 case TargetOpcode::G_CTLZ:
2318 case TargetOpcode::G_CTTZ:
2320 case TargetOpcode::G_CTLS:
2322 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2324 case TargetOpcode::G_STACKSAVE:
2326 case TargetOpcode::G_GET_FPENV:
2328 case TargetOpcode::G_SET_FPENV:
2330 case TargetOpcode::G_TRAP:
2332 case TargetOpcode::G_DEBUGTRAP:
2352 if (ST.hasApertureRegs()) {
2357 ? AMDGPU::SRC_SHARED_BASE
2358 : AMDGPU::SRC_PRIVATE_BASE;
2359 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
2360 !ST.hasGloballyAddressableScratch()) &&
2361 "Cannot use src_private_base with globally addressable scratch!");
2364 B.buildCopy({Dst}, {
Register(ApertureRegNo)});
2365 return B.buildUnmerge(
S32, Dst).getReg(1);
2380 ST.getTargetLowering()->getImplicitParameterOffset(
B.getMF(), Param);
2396 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
2399 return B.buildLoad(
S32, LoadAddr, *MMO).getReg(0);
2421 B.buildObjectPtrOffset(
2423 B.buildConstant(
LLT::scalar(64), StructOffset).getReg(0));
2424 return B.buildLoad(
S32, LoadAddr, *MMO).getReg(0);
2432 switch (Def->getOpcode()) {
2433 case AMDGPU::G_FRAME_INDEX:
2434 case AMDGPU::G_GLOBAL_VALUE:
2435 case AMDGPU::G_BLOCK_ADDR:
2437 case AMDGPU::G_CONSTANT: {
2438 const ConstantInt *CI = Def->getOperand(1).getCImm();
2455 assert(
MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2457 Intrinsic::amdgcn_addrspacecast_nonnull));
2462 :
MI.getOperand(1).getReg();
2466 unsigned SrcAS = SrcTy.getAddressSpace();
2476 MI.setDesc(
B.getTII().get(TargetOpcode::G_BITCAST));
2483 auto castFlatToLocalOrPrivate = [&](
const DstOp &Dst) ->
Register {
2485 ST.hasGloballyAddressableScratch()) {
2489 Register SrcLo =
B.buildExtract(
S32, Src, 0).getReg(0);
2491 B.buildInstr(AMDGPU::S_MOV_B32, {
S32},
2492 {
Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
2494 MRI.
setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
2496 return B.buildIntToPtr(Dst,
Sub).getReg(0);
2500 return B.buildExtract(Dst, Src, 0).getReg(0);
2506 castFlatToLocalOrPrivate(Dst);
2507 MI.eraseFromParent();
2513 auto SegmentNull =
B.buildConstant(DstTy, NullVal);
2514 auto FlatNull =
B.buildConstant(SrcTy, 0);
2517 auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
2521 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2523 MI.eraseFromParent();
2530 auto castLocalOrPrivateToFlat = [&](
const DstOp &Dst) ->
Register {
2533 Register SrcAsInt =
B.buildPtrToInt(
S32, Src).getReg(0);
2536 ST.hasGloballyAddressableScratch()) {
2541 ThreadID =
B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {
S32})
2545 if (ST.isWave64()) {
2546 ThreadID =
B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {
S32})
2552 B.buildConstant(
S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
2553 Register SrcHi =
B.buildShl(
S32, ThreadID, ShAmt).getReg(0);
2555 B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).
getReg(0);
2559 B.buildInstr(AMDGPU::S_MOV_B64, {
S64},
2560 {
Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
2562 MRI.
setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
2563 return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
2572 return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).
getReg(0);
2578 castLocalOrPrivateToFlat(Dst);
2579 MI.eraseFromParent();
2583 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2590 SegmentNull.getReg(0));
2592 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2594 MI.eraseFromParent();
2599 SrcTy.getSizeInBits() == 64) {
2601 B.buildExtract(Dst, Src, 0);
2602 MI.eraseFromParent();
2609 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2610 auto PtrLo =
B.buildPtrToInt(
S32, Src);
2611 if (AddrHiVal == 0) {
2613 B.buildIntToPtr(Dst, Zext);
2615 auto HighAddr =
B.buildConstant(
S32, AddrHiVal);
2616 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2619 MI.eraseFromParent();
2626 MI.eraseFromParent();
2635 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2640 auto C1 =
B.buildFConstant(Ty, C1Val);
2641 auto CopySign =
B.buildFCopysign(Ty, C1, Src);
2644 auto Tmp1 =
B.buildFAdd(Ty, Src, CopySign);
2645 auto Tmp2 =
B.buildFSub(Ty, Tmp1, CopySign);
2647 auto C2 =
B.buildFConstant(Ty, C2Val);
2648 auto Fabs =
B.buildFAbs(Ty, Src);
2651 B.buildSelect(
MI.getOperand(0).getReg(),
Cond, Src, Tmp2);
2652 MI.eraseFromParent();
2670 auto Trunc =
B.buildIntrinsicTrunc(
S64, Src);
2672 const auto Zero =
B.buildFConstant(
S64, 0.0);
2673 const auto One =
B.buildFConstant(
S64, 1.0);
2676 auto And =
B.buildAnd(
S1, Lt0, NeTrunc);
2677 auto Add =
B.buildSelect(
S64,
And, One, Zero);
2680 B.buildFAdd(
MI.getOperand(0).getReg(), Trunc,
Add);
2681 MI.eraseFromParent();
2689 Register Src0Reg =
MI.getOperand(1).getReg();
2690 Register Src1Reg =
MI.getOperand(2).getReg();
2691 auto Flags =
MI.getFlags();
2694 auto Div =
B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2695 auto Trunc =
B.buildIntrinsicTrunc(Ty, Div, Flags);
2696 auto Neg =
B.buildFNeg(Ty, Trunc, Flags);
2697 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2698 MI.eraseFromParent();
2704 const unsigned FractBits = 52;
2705 const unsigned ExpBits = 11;
2708 auto Const0 =
B.buildConstant(
S32, FractBits - 32);
2709 auto Const1 =
B.buildConstant(
S32, ExpBits);
2711 auto ExpPart =
B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {
S32})
2713 .addUse(Const0.getReg(0))
2714 .addUse(Const1.getReg(0));
2716 return B.buildSub(
S32, ExpPart,
B.buildConstant(
S32, 1023));
2730 auto Unmerge =
B.buildUnmerge({
S32,
S32}, Src);
2737 const unsigned FractBits = 52;
2740 const auto SignBitMask =
B.buildConstant(
S32, UINT32_C(1) << 31);
2741 auto SignBit =
B.buildAnd(
S32,
Hi, SignBitMask);
2743 const auto FractMask =
B.buildConstant(
S64, (UINT64_C(1) << FractBits) - 1);
2745 const auto Zero32 =
B.buildConstant(
S32, 0);
2748 auto SignBit64 =
B.buildMergeLikeInstr(
S64, {Zero32, SignBit});
2750 auto Shr =
B.buildAShr(
S64, FractMask, Exp);
2751 auto Not =
B.buildNot(
S64, Shr);
2752 auto Tmp0 =
B.buildAnd(
S64, Src, Not);
2753 auto FiftyOne =
B.buildConstant(
S32, FractBits - 1);
2758 auto Tmp1 =
B.buildSelect(
S64, ExpLt0, SignBit64, Tmp0);
2759 B.buildSelect(
MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2760 MI.eraseFromParent();
2776 auto Unmerge =
B.buildUnmerge({
S32,
S32}, Src);
2777 auto ThirtyTwo =
B.buildConstant(
S32, 32);
2780 auto CvtHi =
Signed ?
B.buildSITOFP(
S64, Unmerge.getReg(1))
2781 :
B.buildUITOFP(
S64, Unmerge.getReg(1));
2783 auto CvtLo =
B.buildUITOFP(
S64, Unmerge.getReg(0));
2784 auto LdExp =
B.buildFLdexp(
S64, CvtHi, ThirtyTwo);
2787 B.buildFAdd(Dst, LdExp, CvtLo);
2788 MI.eraseFromParent();
2794 auto One =
B.buildConstant(
S32, 1);
2798 auto ThirtyOne =
B.buildConstant(
S32, 31);
2799 auto X =
B.buildXor(
S32, Unmerge.getReg(0), Unmerge.getReg(1));
2800 auto OppositeSign =
B.buildAShr(
S32,
X, ThirtyOne);
2801 auto MaxShAmt =
B.buildAdd(
S32, ThirtyTwo, OppositeSign);
2802 auto LS =
B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {
S32})
2803 .addUse(Unmerge.getReg(1));
2804 auto LS2 =
B.buildSub(
S32, LS, One);
2805 ShAmt =
B.buildUMin(
S32, LS2, MaxShAmt);
2807 ShAmt =
B.buildCTLZ(
S32, Unmerge.getReg(1));
2808 auto Norm =
B.buildShl(
S64, Src, ShAmt);
2809 auto Unmerge2 =
B.buildUnmerge({
S32,
S32}, Norm);
2810 auto Adjust =
B.buildUMin(
S32, One, Unmerge2.getReg(0));
2811 auto Norm2 =
B.buildOr(
S32, Unmerge2.getReg(1), Adjust);
2812 auto FVal =
Signed ?
B.buildSITOFP(
S32, Norm2) :
B.buildUITOFP(
S32, Norm2);
2813 auto Scale =
B.buildSub(
S32, ThirtyTwo, ShAmt);
2814 B.buildFLdexp(Dst, FVal, Scale);
2815 MI.eraseFromParent();
2835 unsigned Flags =
MI.getFlags();
2846 auto Trunc =
B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2854 Sign =
B.buildAShr(
S32, Src,
B.buildConstant(
S32, 31));
2855 Trunc =
B.buildFAbs(
S32, Trunc, Flags);
2859 K0 =
B.buildFConstant(
2861 K1 =
B.buildFConstant(
2864 K0 =
B.buildFConstant(
2866 K1 =
B.buildFConstant(
2870 auto Mul =
B.buildFMul(SrcLT, Trunc, K0, Flags);
2871 auto FloorMul =
B.buildFFloor(SrcLT,
Mul, Flags);
2872 auto Fma =
B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2875 :
B.buildFPTOUI(
S32, FloorMul);
2876 auto Lo =
B.buildFPTOUI(
S32, Fma);
2880 Sign =
B.buildMergeLikeInstr(
S64, {Sign, Sign});
2882 B.buildSub(Dst,
B.buildXor(
S64,
B.buildMergeLikeInstr(
S64, {Lo, Hi}), Sign),
2885 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
2886 MI.eraseFromParent();
2918 unsigned StartIdx =
Offset / 32;
2920 auto Unmerge =
B.buildUnmerge(
LLT::scalar(32), SrcReg);
2922 if (DstCount == 1) {
2924 B.buildIntToPtr(DstReg, Unmerge.getReg(StartIdx));
2929 for (
unsigned I = 0;
I < DstCount; ++
I)
2930 MergeVec.
push_back(Unmerge.getReg(StartIdx +
I));
2931 B.buildMergeLikeInstr(DstReg, MergeVec);
2934 MI.eraseFromParent();
2944 Register InsertSrc =
MI.getOperand(2).getReg();
2953 if (
Offset % 32 != 0 || DstSize % 32 != 0 || InsertSize % 32 != 0)
2957 unsigned DstCount = DstSize / 32;
2958 unsigned InsertCount = InsertSize / 32;
2959 unsigned StartIdx =
Offset / 32;
2961 auto SrcUnmerge =
B.buildUnmerge(
S32, SrcReg);
2964 for (
unsigned I = 0;
I < StartIdx; ++
I)
2967 if (InsertCount == 1) {
2971 InsertSrc =
B.buildPtrToInt(
S32, InsertSrc).getReg(0);
2974 auto InsertUnmerge =
B.buildUnmerge(
S32, InsertSrc);
2975 for (
unsigned I = 0;
I < InsertCount; ++
I)
2979 for (
unsigned I = StartIdx + InsertCount;
I < DstCount; ++
I)
2982 B.buildMergeLikeInstr(DstReg, MergeVec);
2984 MI.eraseFromParent();
3011 auto IntVec =
B.buildPtrToInt(IntVecTy, Vec);
3012 auto IntElt =
B.buildExtractVectorElement(IntTy, IntVec,
MI.getOperand(2));
3013 B.buildIntToPtr(Dst, IntElt);
3015 MI.eraseFromParent();
3022 std::optional<ValueAndVReg> MaybeIdxVal =
3026 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3029 auto Unmerge =
B.buildUnmerge(EltTy, Vec);
3030 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
3035 MI.eraseFromParent();
3064 auto IntVecSource =
B.buildPtrToInt(IntVecTy, Vec);
3065 auto IntIns =
B.buildPtrToInt(IntTy, Ins);
3066 auto IntVecDest =
B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
3068 B.buildIntToPtr(Dst, IntVecDest);
3069 MI.eraseFromParent();
3076 std::optional<ValueAndVReg> MaybeIdxVal =
3081 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3084 if (IdxVal < NumElts) {
3086 for (
unsigned i = 0; i < NumElts; ++i)
3088 B.buildUnmerge(SrcRegs, Vec);
3090 SrcRegs[IdxVal] =
MI.getOperand(2).getReg();
3091 B.buildMergeLikeInstr(Dst, SrcRegs);
3096 MI.eraseFromParent();
3107 unsigned Flags =
MI.getFlags();
3111 if (ST.hasTrigReducedRange()) {
3112 auto MulVal =
B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
3113 TrigVal =
B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
3114 .addUse(MulVal.getReg(0))
3118 TrigVal =
B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
3121 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
3125 MI.eraseFromParent();
3133 unsigned GAFlags)
const {
3162 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
3164 if (ST.has64BitLiterals()) {
3168 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
3172 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);
3181 if (!
B.getMRI()->getRegClassOrNull(PCReg))
3182 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
3185 B.buildExtract(DstReg, PCReg, 0);
3195 if (RequiresHighHalf && ST.has64BitLiterals()) {
3197 MRI.
setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
3198 B.buildInstr(AMDGPU::S_MOV_B64)
3213 MRI.
setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
3216 B.buildInstr(AMDGPU::S_MOV_B32)
3221 if (RequiresHighHalf) {
3223 "Must provide a 64-bit pointer type!");
3226 MRI.
setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
3228 B.buildInstr(AMDGPU::S_MOV_B32)
3239 MRI.
setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
3241 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
3245 if (AddrDst != DstReg)
3246 B.buildCast(DstReg, AddrDst);
3247 }
else if (AddrLo != DstReg) {
3250 B.buildCast(DstReg, AddrLo);
3267 GV->
getName() !=
"llvm.amdgcn.module.lds" &&
3271 Fn,
"local memory global used by non-kernel function",
3280 B.buildUndef(DstReg);
3281 MI.eraseFromParent();
3305 auto Sz =
B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {
S32});
3306 B.buildIntToPtr(DstReg, Sz);
3307 MI.eraseFromParent();
3313 MI.eraseFromParent();
3317 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3319 MI.eraseFromParent();
3327 MI.eraseFromParent();
3333 MI.eraseFromParent();
3349 if (Ty.getSizeInBits() == 32) {
3351 auto Load =
B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3352 B.buildExtract(DstReg, Load, 0);
3354 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3356 MI.eraseFromParent();
3379 auto Cast =
B.buildAddrSpaceCast(ConstPtr, PtrReg);
3381 MI.getOperand(1).setReg(Cast.getReg(0));
3386 if (
MI.getOpcode() != AMDGPU::G_LOAD)
3412 if (WideMemSize == ValSize) {
3418 MI.setMemRefs(MF, {WideMMO});
3424 if (ValSize > WideMemSize)
3431 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3432 B.buildTrunc(ValReg, WideLoad).getReg(0);
3439 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3440 B.buildExtract(ValReg, WideLoad, 0);
3444 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3445 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3449 MI.eraseFromParent();
3462 Register DataReg =
MI.getOperand(0).getReg();
3507 "this should not have been custom lowered");
3512 Register PackedVal =
B.buildBuildVector(VecTy, { NewVal, CmpVal }).
getReg(0);
3514 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3518 .setMemRefs(
MI.memoperands());
3520 MI.eraseFromParent();
3528 switch (
DefMI->getOpcode()) {
3529 case TargetOpcode::G_INTRINSIC: {
3531 case Intrinsic::amdgcn_frexp_mant:
3532 case Intrinsic::amdgcn_log:
3533 case Intrinsic::amdgcn_log_clamp:
3534 case Intrinsic::amdgcn_exp2:
3535 case Intrinsic::amdgcn_sqrt:
3543 case TargetOpcode::G_FSQRT:
3545 case TargetOpcode::G_FFREXP: {
3546 if (
DefMI->getOperand(0).getReg() == Src)
3550 case TargetOpcode::G_FPEXT: {
3571std::pair<Register, Register>
3573 unsigned Flags)
const {
3578 auto SmallestNormal =
B.buildFConstant(
3580 auto IsLtSmallestNormal =
3583 auto Scale32 =
B.buildFConstant(
F32, 0x1.0p+32);
3584 auto One =
B.buildFConstant(
F32, 1.0);
3586 B.buildSelect(
F32, IsLtSmallestNormal, Scale32, One, Flags);
3587 auto ScaledInput =
B.buildFMul(
F32, Src, ScaleFactor, Flags);
3589 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3602 LLT Ty =
B.getMRI()->getType(Dst);
3603 unsigned Flags =
MI.getFlags();
3608 auto Ext =
B.buildFPExt(
F32, Src, Flags);
3609 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_log, {
F32})
3610 .addUse(Ext.getReg(0))
3612 B.buildFPTrunc(Dst,
Log2, Flags);
3613 MI.eraseFromParent();
3621 B.buildIntrinsic(Intrinsic::amdgcn_log, {
MI.getOperand(0)})
3624 MI.eraseFromParent();
3628 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3629 .addUse(ScaledInput)
3632 auto ThirtyTwo =
B.buildFConstant(Ty, 32.0);
3633 auto Zero =
B.buildFConstant(Ty, 0.0);
3635 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3636 B.buildFSub(Dst,
Log2, ResultOffset, Flags);
3638 MI.eraseFromParent();
3644 auto FMul =
B.buildFMul(Ty,
X,
Y, Flags);
3645 return B.buildFAdd(Ty,
FMul, Z, Flags).getReg(0);
3650 const bool IsLog10 =
MI.getOpcode() == TargetOpcode::G_FLOG10;
3651 assert(IsLog10 ||
MI.getOpcode() == TargetOpcode::G_FLOG);
3656 unsigned Flags =
MI.getFlags();
3669 auto PromoteSrc =
B.buildFPExt(
F32,
X);
3671 B.buildFPTrunc(Dst, LogVal);
3676 MI.eraseFromParent();
3685 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(
X).setMIFlags(Flags);
3688 if (ST.hasFastFMAF32()) {
3690 const float c_log10 = 0x1.344134p-2f;
3691 const float cc_log10 = 0x1.09f79ep-26f;
3694 const float c_log = 0x1.62e42ep-1f;
3695 const float cc_log = 0x1.efa39ep-25f;
3697 auto C =
B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3698 auto CC =
B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3702 R =
B.buildFMul(Ty,
Y,
C, NewFlags).getReg(0);
3703 auto NegR =
B.buildFNeg(Ty, R, NewFlags);
3704 auto FMA0 =
B.buildFMA(Ty,
Y,
C, NegR, NewFlags);
3705 auto FMA1 =
B.buildFMA(Ty,
Y, CC, FMA0, NewFlags);
3706 R =
B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0);
3709 const float ch_log10 = 0x1.344000p-2f;
3710 const float ct_log10 = 0x1.3509f6p-18f;
3713 const float ch_log = 0x1.62e000p-1f;
3714 const float ct_log = 0x1.0bfbe8p-15f;
3716 auto CH =
B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3717 auto CT =
B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3719 auto MaskConst =
B.buildConstant(Ty, 0xfffff000);
3720 auto YH =
B.buildAnd(Ty,
Y, MaskConst);
3721 auto YT =
B.buildFSub(Ty,
Y, YH, Flags);
3725 auto YTCT =
B.buildFMul(Ty, YT, CT, NewFlags);
3728 getMad(
B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags);
3730 R =
getMad(
B, Ty, YH.getReg(0),
CH.getReg(0), Mad1, NewFlags);
3733 const bool IsFiniteOnly =
3736 if (!IsFiniteOnly) {
3739 auto Fabs =
B.buildFAbs(Ty,
Y);
3742 R =
B.buildSelect(Ty, IsFinite, R,
Y, Flags).getReg(0);
3746 auto Zero =
B.buildFConstant(Ty, 0.0);
3748 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3749 auto Shift =
B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3750 B.buildFSub(Dst, R, Shift, Flags);
3752 B.buildCopy(Dst, R);
3755 MI.eraseFromParent();
3761 unsigned Flags)
const {
3762 const double Log2BaseInverted =
3765 LLT Ty =
B.getMRI()->getType(Dst);
3770 auto LogSrc =
B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3773 auto ScaledResultOffset =
B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3774 auto Zero =
B.buildFConstant(Ty, 0.0);
3776 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3777 auto Log2Inv =
B.buildFConstant(Ty, Log2BaseInverted);
3779 if (ST.hasFastFMAF32())
3780 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3782 auto Mul =
B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3783 B.buildFAdd(Dst,
Mul, ResultOffset, Flags);
3791 ?
B.buildFLog2(Ty, Src, Flags)
3792 :
B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3795 auto Log2BaseInvertedOperand =
B.buildFConstant(Ty, Log2BaseInverted);
3796 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3807 unsigned Flags =
MI.getFlags();
3808 LLT Ty =
B.getMRI()->getType(Dst);
3818 auto Ext =
B.buildFPExt(
F32, Src, Flags);
3819 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {
F32})
3820 .addUse(Ext.getReg(0))
3822 B.buildFPTrunc(Dst,
Log2, Flags);
3823 MI.eraseFromParent();
3833 MI.eraseFromParent();
3841 auto RangeCheckConst =
B.buildFConstant(Ty, -0x1.f80000p+6f);
3843 RangeCheckConst, Flags);
3845 auto SixtyFour =
B.buildFConstant(Ty, 0x1.0p+6f);
3846 auto Zero =
B.buildFConstant(Ty, 0.0);
3847 auto AddOffset =
B.buildSelect(
F32, NeedsScaling, SixtyFour, Zero, Flags);
3848 auto AddInput =
B.buildFAdd(
F32, Src, AddOffset, Flags);
3850 auto Exp2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3851 .addUse(AddInput.getReg(0))
3854 auto TwoExpNeg64 =
B.buildFConstant(Ty, 0x1.0p-64f);
3855 auto One =
B.buildFConstant(Ty, 1.0);
3856 auto ResultScale =
B.buildSelect(
F32, NeedsScaling, TwoExpNeg64, One, Flags);
3857 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3858 MI.eraseFromParent();
3863 const SrcOp &Src,
unsigned Flags) {
3864 LLT Ty = Dst.getLLTTy(*
B.getMRI());
3867 return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst})
3868 .addUse(Src.getReg())
3871 return B.buildFExp2(Dst, Src, Flags);
3877 bool IsExp10)
const {
3878 LLT Ty =
B.getMRI()->getType(
X);
3882 auto Const =
B.buildFConstant(Ty, IsExp10 ? 0x1.a934f0p+1f :
numbers::log2e);
3883 auto Mul =
B.buildFMul(Ty,
X, Const, Flags);
3890 LLT Ty =
B.getMRI()->getType(Dst);
3897 auto Threshold =
B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3900 auto ScaleOffset =
B.buildFConstant(Ty, 0x1.0p+6f);
3901 auto ScaledX =
B.buildFAdd(Ty,
X, ScaleOffset, Flags);
3902 auto AdjustedX =
B.buildSelect(Ty, NeedsScaling, ScaledX,
X, Flags);
3905 auto ExpInput =
B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3907 auto Exp2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3908 .addUse(ExpInput.getReg(0))
3911 auto ResultScaleFactor =
B.buildFConstant(Ty, 0x1.969d48p-93f);
3912 auto AdjustedResult =
B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3913 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3919 unsigned Flags)
const {
3920 LLT Ty =
B.getMRI()->getType(Dst);
3925 auto K0 =
B.buildFConstant(Ty, 0x1.a92000p+1f);
3926 auto K1 =
B.buildFConstant(Ty, 0x1.4f0978p-11f);
3928 auto Mul1 =
B.buildFMul(Ty,
X, K1, Flags);
3929 auto Exp2_1 =
buildExp(
B, Ty, Mul1, Flags);
3930 auto Mul0 =
B.buildFMul(Ty,
X, K0, Flags);
3931 auto Exp2_0 =
buildExp(
B, Ty, Mul0, Flags);
3932 B.buildFMul(Dst, Exp2_0, Exp2_1, Flags);
3942 auto Threshold =
B.buildFConstant(Ty, -0x1.2f7030p+5f);
3946 auto ScaleOffset =
B.buildFConstant(Ty, 0x1.0p+5f);
3947 auto ScaledX =
B.buildFAdd(Ty,
X, ScaleOffset, Flags);
3948 auto AdjustedX =
B.buildSelect(Ty, NeedsScaling, ScaledX,
X);
3950 auto K0 =
B.buildFConstant(Ty, 0x1.a92000p+1f);
3951 auto K1 =
B.buildFConstant(Ty, 0x1.4f0978p-11f);
3953 auto Mul1 =
B.buildFMul(Ty, AdjustedX, K1, Flags);
3954 auto Exp2_1 =
buildExp(
B, Ty, Mul1, Flags);
3955 auto Mul0 =
B.buildFMul(Ty, AdjustedX, K0, Flags);
3956 auto Exp2_0 =
buildExp(
B, Ty, Mul0, Flags);
3958 auto MulExps =
B.buildFMul(Ty, Exp2_0, Exp2_1, Flags);
3959 auto ResultScaleFactor =
B.buildFConstant(Ty, 0x1.9f623ep-107f);
3960 auto AdjustedResult =
B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags);
3962 B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps);
3981 if (
MI.getOpcode() == TargetOpcode::G_FEXP2) {
3983 Dn =
B.buildFRint(
S64,
X, Flags).getReg(0);
3985 F =
B.buildFSub(
S64,
X, Dn, Flags).getReg(0);
3987 auto C1 =
B.buildFConstant(
S64,
APFloat(0x1.62e42fefa39efp-1));
3988 auto C2 =
B.buildFConstant(
S64,
APFloat(0x1.abc9e3b39803fp-56));
3989 auto Mul2 =
B.buildFMul(
S64,
F, C2, Flags).getReg(0);
3990 T =
B.buildFMA(
S64,
F, C1, Mul2, Flags).getReg(0);
3992 }
else if (
MI.getOpcode() == TargetOpcode::G_FEXP10) {
3993 auto C1 =
B.buildFConstant(
S64,
APFloat(0x1.a934f0979a371p+1));
3994 auto Mul =
B.buildFMul(
S64,
X, C1, Flags).getReg(0);
3995 Dn =
B.buildFRint(
S64,
Mul, Flags).getReg(0);
3997 auto NegDn =
B.buildFNeg(
S64, Dn, Flags).getReg(0);
3998 auto C2 =
B.buildFConstant(
S64,
APFloat(-0x1.9dc1da994fd21p-59));
3999 auto C3 =
B.buildFConstant(
S64,
APFloat(0x1.34413509f79ffp-2));
4000 auto Inner =
B.buildFMA(
S64, NegDn, C3,
X, Flags).getReg(0);
4001 F =
B.buildFMA(
S64, NegDn, C2, Inner, Flags).getReg(0);
4003 auto C4 =
B.buildFConstant(
S64,
APFloat(0x1.26bb1bbb55516p+1));
4004 auto C5 =
B.buildFConstant(
S64,
APFloat(-0x1.f48ad494ea3e9p-53));
4005 auto MulF =
B.buildFMul(
S64,
F, C5, Flags).getReg(0);
4006 T =
B.buildFMA(
S64,
F, C4, MulF, Flags).getReg(0);
4009 auto C1 =
B.buildFConstant(
S64,
APFloat(0x1.71547652b82fep+0));
4010 auto Mul =
B.buildFMul(
S64,
X, C1, Flags).getReg(0);
4011 Dn =
B.buildFRint(
S64,
Mul, Flags).getReg(0);
4013 auto NegDn =
B.buildFNeg(
S64, Dn, Flags).getReg(0);
4014 auto C2 =
B.buildFConstant(
S64,
APFloat(0x1.abc9e3b39803fp-56));
4015 auto C3 =
B.buildFConstant(
S64,
APFloat(0x1.62e42fefa39efp-1));
4016 auto Inner =
B.buildFMA(
S64, NegDn, C3,
X, Flags).getReg(0);
4017 T =
B.buildFMA(
S64, NegDn, C2, Inner, Flags).getReg(0);
4021 auto P =
B.buildFConstant(
S64, 0x1.ade156a5dcb37p-26);
4022 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.28af3fca7ab0cp-22),
4024 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.71dee623fde64p-19),
4026 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.a01997c89e6b0p-16),
4028 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.a01a014761f6ep-13),
4030 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.6c16c1852b7b0p-10),
4032 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.1111111122322p-7), Flags);
4033 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.55555555502a1p-5), Flags);
4034 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.5555555555511p-3), Flags);
4035 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.000000000000bp-1), Flags);
4037 auto One =
B.buildFConstant(
S64, 1.0);
4038 P =
B.buildFMA(
S64,
T,
P, One, Flags);
4039 P =
B.buildFMA(
S64,
T,
P, One, Flags);
4042 auto DnInt =
B.buildFPTOSI(
S32, Dn);
4043 auto Z =
B.buildFLdexp(
S64,
P, DnInt, Flags);
4050 Z =
B.buildSelect(
S64, CondHi, Z, PInf, Flags);
4057 B.buildSelect(
MI.getOperand(0).getReg(), CondLo, Z, Zero, Flags);
4059 MI.eraseFromParent();
4067 const unsigned Flags =
MI.getFlags();
4079 const bool IsExp10 =
MI.getOpcode() == TargetOpcode::G_FEXP10;
4087 MI.eraseFromParent();
4098 auto Ext =
B.buildFPExt(
F32,
X, Flags);
4101 B.buildFPTrunc(Dst, Lowered, Flags);
4102 MI.eraseFromParent();
4113 MI.eraseFromParent();
4141 const unsigned FlagsNoContract = Flags &
~MachineInstr::FmContract;
4144 if (ST.hasFastFMAF32()) {
4146 const float cc_exp = 0x1.4ae0bep-26f;
4147 const float c_exp10 = 0x1.a934f0p+1f;
4148 const float cc_exp10 = 0x1.2f346ep-24f;
4150 auto C =
B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
4151 PH =
B.buildFMul(Ty,
X,
C, Flags).getReg(0);
4152 auto NegPH =
B.buildFNeg(Ty, PH, Flags);
4153 auto FMA0 =
B.buildFMA(Ty,
X,
C, NegPH, Flags);
4155 auto CC =
B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
4156 PL =
B.buildFMA(Ty,
X, CC, FMA0, Flags).getReg(0);
4158 const float ch_exp = 0x1.714000p+0f;
4159 const float cl_exp = 0x1.47652ap-12f;
4161 const float ch_exp10 = 0x1.a92000p+1f;
4162 const float cl_exp10 = 0x1.4f0978p-11f;
4164 auto MaskConst =
B.buildConstant(Ty, 0xfffff000);
4165 auto XH =
B.buildAnd(Ty,
X, MaskConst);
4166 auto XL =
B.buildFSub(Ty,
X, XH, Flags);
4168 auto CH =
B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
4169 PH =
B.buildFMul(Ty, XH,
CH, Flags).getReg(0);
4171 auto CL =
B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
4172 auto XLCL =
B.buildFMul(Ty, XL, CL, Flags);
4175 getMad(
B, Ty, XL.getReg(0),
CH.getReg(0), XLCL.getReg(0), Flags);
4176 PL =
getMad(
B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
4179 auto E =
B.buildIntrinsicRoundeven(Ty, PH, Flags);
4182 auto PHSubE =
B.buildFSub(Ty, PH, E, FlagsNoContract);
4183 auto A =
B.buildFAdd(Ty, PHSubE, PL, Flags);
4186 auto Exp2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
4187 .addUse(
A.getReg(0))
4189 auto R =
B.buildFLdexp(Ty, Exp2, IntE, Flags);
4191 auto UnderflowCheckConst =
4192 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
4193 auto Zero =
B.buildFConstant(Ty, 0.0);
4197 R =
B.buildSelect(Ty, Underflow, Zero, R);
4200 auto OverflowCheckConst =
4201 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
4206 R =
B.buildSelect(Ty, Overflow, Inf, R, Flags);
4209 B.buildCopy(Dst, R);
4210 MI.eraseFromParent();
4219 unsigned Flags =
MI.getFlags();
4220 LLT Ty =
B.getMRI()->getType(Dst);
4225 auto Log =
B.buildFLog2(
F32, Src0, Flags);
4226 auto Mul =
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {
F32})
4227 .addUse(Log.getReg(0))
4230 B.buildFExp2(Dst,
Mul, Flags);
4231 }
else if (Ty == F16) {
4233 auto Log =
B.buildFLog2(F16, Src0, Flags);
4234 auto Ext0 =
B.buildFPExt(
F32, Log, Flags);
4235 auto Ext1 =
B.buildFPExt(
F32, Src1, Flags);
4236 auto Mul =
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {
F32})
4237 .addUse(Ext0.getReg(0))
4238 .addUse(Ext1.getReg(0))
4240 B.buildFExp2(Dst,
B.buildFPTrunc(F16,
Mul), Flags);
4244 MI.eraseFromParent();
4252 ModSrc = SrcFNeg->getOperand(1).getReg();
4254 ModSrc = SrcFAbs->getOperand(1).getReg();
4256 ModSrc = SrcFAbs->getOperand(1).getReg();
4267 Register OrigSrc =
MI.getOperand(1).getReg();
4268 unsigned Flags =
MI.getFlags();
4270 "this should not have been custom lowered");
4280 auto Fract =
B.buildIntrinsic(Intrinsic::amdgcn_fract, {
F64})
4300 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
4302 B.buildFMinNum(Min, Fract, Const, Flags);
4307 CorrectedFract =
B.buildSelect(
F64, IsNan, ModSrc, Min, Flags).getReg(0);
4310 auto NegFract =
B.buildFNeg(
F64, CorrectedFract, Flags);
4311 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
4313 MI.eraseFromParent();
4329 if (
MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
4331 Src0 =
B.buildTrunc(
S16,
MI.getOperand(1).getReg()).getReg(0);
4332 Src1 =
B.buildTrunc(
S16,
MI.getOperand(2).getReg()).getReg(0);
4335 auto Merge =
B.buildMergeLikeInstr(
S32, {Src0, Src1});
4336 B.buildBitcast(Dst,
Merge);
4338 MI.eraseFromParent();
4355 bool UsePartialMad64_32,
4356 bool SeparateOddAlignedProducts)
const {
4371 auto getZero32 = [&]() ->
Register {
4373 Zero32 =
B.buildConstant(
S32, 0).getReg(0);
4376 auto getZero64 = [&]() ->
Register {
4378 Zero64 =
B.buildConstant(
S64, 0).getReg(0);
4383 for (
unsigned i = 0; i < Src0.
size(); ++i) {
4394 if (CarryIn.empty())
4397 bool HaveCarryOut =
true;
4399 if (CarryIn.size() == 1) {
4401 LocalAccum =
B.buildZExt(
S32, CarryIn[0]).getReg(0);
4405 CarryAccum = getZero32();
4407 CarryAccum =
B.buildZExt(
S32, CarryIn[0]).getReg(0);
4408 for (
unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
4410 B.buildUAdde(
S32,
S1, CarryAccum, getZero32(), CarryIn[i])
4415 LocalAccum = getZero32();
4416 HaveCarryOut =
false;
4421 B.buildUAdde(
S32,
S1, CarryAccum, LocalAccum, CarryIn.back());
4422 LocalAccum =
Add.getReg(0);
4436 auto buildMadChain =
4439 assert((DstIndex + 1 < Accum.
size() && LocalAccum.size() == 2) ||
4440 (DstIndex + 1 >= Accum.
size() && LocalAccum.size() == 1));
4447 if (LocalAccum.size() == 1 &&
4448 (!UsePartialMad64_32 || !CarryIn.empty())) {
4451 unsigned j1 = DstIndex - j0;
4452 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4456 auto Mul =
B.buildMul(
S32, Src0[j0], Src1[j1]);
4458 LocalAccum[0] =
Mul.getReg(0);
4460 if (CarryIn.empty()) {
4461 LocalAccum[0] =
B.buildAdd(
S32, LocalAccum[0],
Mul).getReg(0);
4464 B.buildUAdde(
S32,
S1, LocalAccum[0],
Mul, CarryIn.back())
4470 }
while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4474 if (j0 <= DstIndex) {
4475 bool HaveSmallAccum =
false;
4478 if (LocalAccum[0]) {
4479 if (LocalAccum.size() == 1) {
4480 Tmp =
B.buildAnyExt(
S64, LocalAccum[0]).getReg(0);
4481 HaveSmallAccum =
true;
4482 }
else if (LocalAccum[1]) {
4483 Tmp =
B.buildMergeLikeInstr(
S64, LocalAccum).getReg(0);
4484 HaveSmallAccum =
false;
4486 Tmp =
B.buildZExt(
S64, LocalAccum[0]).getReg(0);
4487 HaveSmallAccum =
true;
4490 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4492 HaveSmallAccum =
true;
4496 unsigned j1 = DstIndex - j0;
4497 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4501 auto Mad =
B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {
S64,
S1},
4502 {Src0[j0], Src1[j1], Tmp});
4503 Tmp = Mad.getReg(0);
4504 if (!HaveSmallAccum)
4505 CarryOut.push_back(Mad.getReg(1));
4506 HaveSmallAccum =
false;
4509 }
while (j0 <= DstIndex);
4511 auto Unmerge =
B.buildUnmerge(
S32, Tmp);
4512 LocalAccum[0] = Unmerge.getReg(0);
4513 if (LocalAccum.size() > 1)
4514 LocalAccum[1] = Unmerge.getReg(1);
4541 for (
unsigned i = 0; i <= Accum.
size() / 2; ++i) {
4542 Carry OddCarryIn = std::move(OddCarry);
4543 Carry EvenCarryIn = std::move(EvenCarry);
4548 if (2 * i < Accum.
size()) {
4549 auto LocalAccum = Accum.
drop_front(2 * i).take_front(2);
4550 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4555 if (!SeparateOddAlignedProducts) {
4556 auto LocalAccum = Accum.
drop_front(2 * i - 1).take_front(2);
4557 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4559 bool IsHighest = 2 * i >= Accum.
size();
4562 .take_front(IsHighest ? 1 : 2);
4563 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4569 Lo =
B.buildUAddo(
S32,
S1, Accum[2 * i - 1], SeparateOddOut[0]);
4571 Lo =
B.buildAdd(
S32, Accum[2 * i - 1], SeparateOddOut[0]);
4573 Lo =
B.buildUAdde(
S32,
S1, Accum[2 * i - 1], SeparateOddOut[0],
4576 Accum[2 * i - 1] =
Lo->getOperand(0).getReg();
4579 auto Hi =
B.buildUAdde(
S32,
S1, Accum[2 * i], SeparateOddOut[1],
4580 Lo->getOperand(1).getReg());
4581 Accum[2 * i] =
Hi.getReg(0);
4582 SeparateOddCarry =
Hi.getReg(1);
4589 if (
Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4590 EvenCarryIn.push_back(CarryOut);
4592 if (2 * i < Accum.
size()) {
4593 if (
Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4594 OddCarry.push_back(CarryOut);
4606 assert(ST.hasMad64_32());
4607 assert(
MI.getOpcode() == TargetOpcode::G_MUL);
4619 unsigned Size = Ty.getSizeInBits();
4620 if (ST.hasVectorMulU64() &&
Size == 64)
4623 unsigned NumParts =
Size / 32;
4635 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4639 for (
unsigned i = 0; i < NumParts; ++i) {
4643 B.buildUnmerge(Src0Parts, Src0);
4644 B.buildUnmerge(Src1Parts, Src1);
4647 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4648 SeparateOddAlignedProducts);
4650 B.buildMergeLikeInstr(DstReg, AccumRegs);
4651 MI.eraseFromParent();
4666 unsigned NewOpc =
MI.getOpcode() == AMDGPU::G_CTLZ
4667 ? AMDGPU::G_AMDGPU_FFBH_U32
4668 : AMDGPU::G_AMDGPU_FFBL_B32;
4669 auto Tmp =
B.buildInstr(NewOpc, {DstTy}, {Src});
4672 MI.eraseFromParent();
4682 TypeSize NumBits = SrcTy.getSizeInBits();
4686 auto ShiftAmt =
B.buildConstant(
S32, 32u - NumBits);
4687 auto Extend =
B.buildAnyExt(
S32, {Src}).
getReg(0u);
4688 auto Shift =
B.buildShl(
S32, Extend, ShiftAmt);
4689 auto Ctlz =
B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {
S32}, {Shift});
4690 B.buildTrunc(Dst, Ctlz);
4691 MI.eraseFromParent();
4702 assert(SrcTy ==
S32 &&
"legalizeCTLS only supports s32");
4703 unsigned BitWidth = SrcTy.getSizeInBits();
4705 auto Sffbh =
B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {
S32}).addUse(Src);
4707 B.buildSub(Dst, Clamped,
B.buildConstant(
S32, 1));
4708 MI.eraseFromParent();
4714 if (
MI.getOpcode() != TargetOpcode::G_XOR)
4717 return ConstVal == -1;
4724 Register CondDef =
MI.getOperand(0).getReg();
4743 if (
UseMI->getParent() != Parent ||
UseMI->getOpcode() != AMDGPU::G_BRCOND)
4752 UncondBrTarget = &*NextMBB;
4754 if (
Next->getOpcode() != AMDGPU::G_BR)
4773 *ArgRC,
B.getDebugLoc(), ArgTy);
4777 const unsigned Mask = Arg->
getMask();
4785 auto ShiftAmt =
B.buildConstant(
S32, Shift);
4786 AndMaskSrc =
B.buildLShr(
S32, LiveIn, ShiftAmt).getReg(0);
4789 B.buildAnd(DstReg, AndMaskSrc,
B.buildConstant(
S32, Mask >> Shift));
4791 B.buildCopy(DstReg, LiveIn);
4801 if (!ST.hasClusters()) {
4804 MI.eraseFromParent();
4824 auto One =
B.buildConstant(
S32, 1);
4825 auto ClusterSizeXYZ =
B.buildAdd(
S32, ClusterMaxIdXYZ, One);
4826 auto GlobalIdXYZ =
B.buildAdd(
S32, ClusterWorkGroupIdXYZ,
4827 B.buildMul(
S32, ClusterIdXYZ, ClusterSizeXYZ));
4834 B.buildCopy(DstReg, GlobalIdXYZ);
4835 MI.eraseFromParent();
4839 B.buildCopy(DstReg, ClusterIdXYZ);
4840 MI.eraseFromParent();
4845 unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
4847 MRI.
setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
4848 B.buildInstr(AMDGPU::S_GETREG_B32_const)
4850 .addImm(ClusterIdField);
4851 auto Zero =
B.buildConstant(
S32, 0);
4854 B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
4855 MI.eraseFromParent();
4897 auto LoadConstant = [&](
unsigned N) {
4898 B.buildConstant(DstReg,
N);
4902 if (ST.hasArchitectedSGPRs() &&
4909 Arg = &WorkGroupIDX;
4910 ArgRC = &AMDGPU::SReg_32RegClass;
4914 Arg = &WorkGroupIDY;
4915 ArgRC = &AMDGPU::SReg_32RegClass;
4919 Arg = &WorkGroupIDZ;
4920 ArgRC = &AMDGPU::SReg_32RegClass;
4924 if (HasFixedDims && ClusterDims.
getDims()[0] == 1)
4925 return LoadConstant(0);
4926 Arg = &ClusterWorkGroupIDX;
4927 ArgRC = &AMDGPU::SReg_32RegClass;
4931 if (HasFixedDims && ClusterDims.
getDims()[1] == 1)
4932 return LoadConstant(0);
4933 Arg = &ClusterWorkGroupIDY;
4934 ArgRC = &AMDGPU::SReg_32RegClass;
4938 if (HasFixedDims && ClusterDims.
getDims()[2] == 1)
4939 return LoadConstant(0);
4940 Arg = &ClusterWorkGroupIDZ;
4941 ArgRC = &AMDGPU::SReg_32RegClass;
4946 return LoadConstant(ClusterDims.
getDims()[0] - 1);
4947 Arg = &ClusterWorkGroupMaxIDX;
4948 ArgRC = &AMDGPU::SReg_32RegClass;
4953 return LoadConstant(ClusterDims.
getDims()[1] - 1);
4954 Arg = &ClusterWorkGroupMaxIDY;
4955 ArgRC = &AMDGPU::SReg_32RegClass;
4960 return LoadConstant(ClusterDims.
getDims()[2] - 1);
4961 Arg = &ClusterWorkGroupMaxIDZ;
4962 ArgRC = &AMDGPU::SReg_32RegClass;
4966 Arg = &ClusterWorkGroupMaxFlatID;
4967 ArgRC = &AMDGPU::SReg_32RegClass;
4982 return LoadConstant(0);
4987 B.buildUndef(DstReg);
4991 if (!Arg->isRegister() || !Arg->getRegister().isValid())
5003 MI.eraseFromParent();
5009 B.buildConstant(
MI.getOperand(0).getReg(),
C);
5010 MI.eraseFromParent();
5017 unsigned MaxID = ST.getMaxWorkitemID(
B.getMF().getFunction(), Dim);
5031 B.buildUndef(DstReg);
5032 MI.eraseFromParent();
5036 if (Arg->isMasked()) {
5050 MI.eraseFromParent();
5065 Register KernArgReg =
B.getMRI()->createGenericVirtualRegister(PtrTy);
5074 return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);
5082 Align Alignment)
const {
5086 "unexpected kernarg parameter type");
5093 MI.eraseFromParent();
5128 auto FloatY =
B.buildUITOFP(
S32,
Y);
5129 auto RcpIFlag =
B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {
S32}, {FloatY});
5131 auto ScaledY =
B.buildFMul(
S32, RcpIFlag, Scale);
5132 auto Z =
B.buildFPTOUI(
S32, ScaledY);
5135 auto NegY =
B.buildSub(
S32,
B.buildConstant(
S32, 0),
Y);
5136 auto NegYZ =
B.buildMul(
S32, NegY, Z);
5137 Z =
B.buildAdd(
S32, Z,
B.buildUMulH(
S32, Z, NegYZ));
5140 auto Q =
B.buildUMulH(
S32,
X, Z);
5141 auto R =
B.buildSub(
S32,
X,
B.buildMul(
S32, Q,
Y));
5144 auto One =
B.buildConstant(
S32, 1);
5147 Q =
B.buildSelect(
S32,
Cond,
B.buildAdd(
S32, Q, One), Q);
5153 B.buildSelect(DstDivReg,
Cond,
B.buildAdd(
S32, Q, One), Q);
5156 B.buildSelect(DstRemReg,
Cond,
B.buildSub(
S32, R,
Y), R);
5175 auto Unmerge =
B.buildUnmerge(
S32, Val);
5177 auto CvtLo =
B.buildUITOFP(
S32, Unmerge.getReg(0));
5178 auto CvtHi =
B.buildUITOFP(
S32, Unmerge.getReg(1));
5180 auto Mad =
B.buildFMAD(
5184 auto Rcp =
B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {
S32}, {Mad});
5185 auto Mul1 =
B.buildFMul(
5189 auto Mul2 =
B.buildFMul(
5191 auto Trunc =
B.buildIntrinsicTrunc(
S32, Mul2);
5194 auto Mad2 =
B.buildFMAD(
5198 auto ResultLo =
B.buildFPTOUI(
S32, Mad2);
5199 auto ResultHi =
B.buildFPTOUI(
S32, Trunc);
5201 return {ResultLo.getReg(0), ResultHi.getReg(0)};
5216 auto Rcp =
B.buildMergeLikeInstr(
S64, {RcpLo, RcpHi});
5218 auto Zero64 =
B.buildConstant(
S64, 0);
5219 auto NegDenom =
B.buildSub(
S64, Zero64, Denom);
5221 auto MulLo1 =
B.buildMul(
S64, NegDenom, Rcp);
5222 auto MulHi1 =
B.buildUMulH(
S64, Rcp, MulLo1);
5224 auto UnmergeMulHi1 =
B.buildUnmerge(
S32, MulHi1);
5225 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
5226 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
5228 auto Add1_Lo =
B.buildUAddo(
S32,
S1, RcpLo, MulHi1_Lo);
5229 auto Add1_Hi =
B.buildUAdde(
S32,
S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
5230 auto Add1 =
B.buildMergeLikeInstr(
S64, {Add1_Lo, Add1_Hi});
5232 auto MulLo2 =
B.buildMul(
S64, NegDenom, Add1);
5233 auto MulHi2 =
B.buildUMulH(
S64, Add1, MulLo2);
5234 auto UnmergeMulHi2 =
B.buildUnmerge(
S32, MulHi2);
5235 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
5236 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
5238 auto Zero32 =
B.buildConstant(
S32, 0);
5239 auto Add2_Lo =
B.buildUAddo(
S32,
S1, Add1_Lo, MulHi2_Lo);
5240 auto Add2_Hi =
B.buildUAdde(
S32,
S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
5241 auto Add2 =
B.buildMergeLikeInstr(
S64, {Add2_Lo, Add2_Hi});
5243 auto UnmergeNumer =
B.buildUnmerge(
S32, Numer);
5244 Register NumerLo = UnmergeNumer.getReg(0);
5245 Register NumerHi = UnmergeNumer.getReg(1);
5247 auto MulHi3 =
B.buildUMulH(
S64, Numer, Add2);
5248 auto Mul3 =
B.buildMul(
S64, Denom, MulHi3);
5249 auto UnmergeMul3 =
B.buildUnmerge(
S32, Mul3);
5250 Register Mul3_Lo = UnmergeMul3.getReg(0);
5251 Register Mul3_Hi = UnmergeMul3.getReg(1);
5252 auto Sub1_Lo =
B.buildUSubo(
S32,
S1, NumerLo, Mul3_Lo);
5253 auto Sub1_Hi =
B.buildUSube(
S32,
S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
5254 auto Sub1_Mi =
B.buildSub(
S32, NumerHi, Mul3_Hi);
5255 auto Sub1 =
B.buildMergeLikeInstr(
S64, {Sub1_Lo, Sub1_Hi});
5257 auto UnmergeDenom =
B.buildUnmerge(
S32, Denom);
5258 Register DenomLo = UnmergeDenom.getReg(0);
5259 Register DenomHi = UnmergeDenom.getReg(1);
5262 auto C1 =
B.buildSExt(
S32, CmpHi);
5265 auto C2 =
B.buildSExt(
S32, CmpLo);
5268 auto C3 =
B.buildSelect(
S32, CmpEq, C2, C1);
5275 auto Sub2_Lo =
B.buildUSubo(
S32,
S1, Sub1_Lo, DenomLo);
5276 auto Sub2_Mi =
B.buildUSube(
S32,
S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
5277 auto Sub2_Hi =
B.buildUSube(
S32,
S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
5278 auto Sub2 =
B.buildMergeLikeInstr(
S64, {Sub2_Lo, Sub2_Hi});
5280 auto One64 =
B.buildConstant(
S64, 1);
5281 auto Add3 =
B.buildAdd(
S64, MulHi3, One64);
5287 auto C6 =
B.buildSelect(
5291 auto Add4 =
B.buildAdd(
S64, Add3, One64);
5292 auto Sub3_Lo =
B.buildUSubo(
S32,
S1, Sub2_Lo, DenomLo);
5294 auto Sub3_Mi =
B.buildUSube(
S32,
S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
5295 auto Sub3_Hi =
B.buildUSube(
S32,
S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
5296 auto Sub3 =
B.buildMergeLikeInstr(
S64, {Sub3_Lo, Sub3_Hi});
5302 auto Sel1 =
B.buildSelect(
5309 auto Sel2 =
B.buildSelect(
5320 switch (
MI.getOpcode()) {
5323 case AMDGPU::G_UDIV: {
5324 DstDivReg =
MI.getOperand(0).getReg();
5327 case AMDGPU::G_UREM: {
5328 DstRemReg =
MI.getOperand(0).getReg();
5331 case AMDGPU::G_UDIVREM: {
5332 DstDivReg =
MI.getOperand(0).getReg();
5333 DstRemReg =
MI.getOperand(1).getReg();
5340 const unsigned FirstSrcOpIdx =
MI.getNumExplicitDefs();
5341 Register Num =
MI.getOperand(FirstSrcOpIdx).getReg();
5342 Register Den =
MI.getOperand(FirstSrcOpIdx + 1).getReg();
5352 MI.eraseFromParent();
5363 if (Ty !=
S32 && Ty !=
S64)
5366 const unsigned FirstSrcOpIdx =
MI.getNumExplicitDefs();
5367 Register LHS =
MI.getOperand(FirstSrcOpIdx).getReg();
5368 Register RHS =
MI.getOperand(FirstSrcOpIdx + 1).getReg();
5370 auto SignBitOffset =
B.buildConstant(
S32, Ty.getSizeInBits() - 1);
5371 auto LHSign =
B.buildAShr(Ty, LHS, SignBitOffset);
5372 auto RHSign =
B.buildAShr(Ty, RHS, SignBitOffset);
5374 LHS =
B.buildAdd(Ty, LHS, LHSign).getReg(0);
5375 RHS =
B.buildAdd(Ty, RHS, RHSign).getReg(0);
5377 LHS =
B.buildXor(Ty, LHS, LHSign).getReg(0);
5378 RHS =
B.buildXor(Ty, RHS, RHSign).getReg(0);
5380 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
5381 switch (
MI.getOpcode()) {
5384 case AMDGPU::G_SDIV: {
5385 DstDivReg =
MI.getOperand(0).getReg();
5389 case AMDGPU::G_SREM: {
5390 DstRemReg =
MI.getOperand(0).getReg();
5394 case AMDGPU::G_SDIVREM: {
5395 DstDivReg =
MI.getOperand(0).getReg();
5396 DstRemReg =
MI.getOperand(1).getReg();
5409 auto Sign =
B.buildXor(Ty, LHSign, RHSign).getReg(0);
5410 auto SignXor =
B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
5411 B.buildSub(DstDivReg, SignXor, Sign);
5415 auto Sign = LHSign.getReg(0);
5416 auto SignXor =
B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
5417 B.buildSub(DstRemReg, SignXor, Sign);
5420 MI.eraseFromParent();
5436 if (!AllowInaccurateRcp && ResTy !=
LLT::scalar(16))
5447 if (CLHS->isExactlyValue(1.0)) {
5448 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5452 MI.eraseFromParent();
5457 if (CLHS->isExactlyValue(-1.0)) {
5458 auto FNeg =
B.buildFNeg(ResTy, RHS, Flags);
5459 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5460 .addUse(FNeg.getReg(0))
5463 MI.eraseFromParent();
5470 if (!AllowInaccurateRcp && (ResTy !=
LLT::scalar(16) ||
5475 auto RCP =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5478 B.buildFMul(Res, LHS, RCP, Flags);
5480 MI.eraseFromParent();
5495 if (!AllowInaccurateRcp)
5503 X =
B.buildFConstant(ResTy, 1.0).getReg(0);
5505 Register NegY = IsNegRcp ?
Y :
B.buildFNeg(ResTy,
Y).getReg(0);
5506 auto One =
B.buildFConstant(ResTy, 1.0);
5508 auto R =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5512 R =
B.buildFNeg(ResTy, R);
5514 auto Tmp0 =
B.buildFMA(ResTy, NegY, R, One);
5515 R =
B.buildFMA(ResTy, Tmp0, R, R);
5517 auto Tmp1 =
B.buildFMA(ResTy, NegY, R, One);
5518 R =
B.buildFMA(ResTy, Tmp1, R, R);
5522 B.buildCopy(Res, R);
5523 MI.eraseFromParent();
5527 auto Ret =
B.buildFMul(ResTy,
X, R);
5528 auto Tmp2 =
B.buildFMA(ResTy, NegY, Ret,
X);
5530 B.buildFMA(Res, Tmp2, R, Ret);
5531 MI.eraseFromParent();
5563 auto LHSExt =
B.buildFPExt(
S32, LHS, Flags);
5564 auto RHSExt =
B.buildFPExt(
S32, RHS, Flags);
5565 auto NegRHSExt =
B.buildFNeg(
S32, RHSExt);
5566 auto Rcp =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {
S32})
5567 .addUse(RHSExt.getReg(0))
5569 auto Quot =
B.buildFMul(
S32, LHSExt, Rcp, Flags);
5571 if (ST.hasMadMacF32Insts()) {
5572 Err =
B.buildFMAD(
S32, NegRHSExt, Quot, LHSExt, Flags);
5573 Quot =
B.buildFMAD(
S32, Err, Rcp, Quot, Flags);
5574 Err =
B.buildFMAD(
S32, NegRHSExt, Quot, LHSExt, Flags);
5576 Err =
B.buildFMA(
S32, NegRHSExt, Quot, LHSExt, Flags);
5577 Quot =
B.buildFMA(
S32, Err, Rcp, Quot, Flags);
5578 Err =
B.buildFMA(
S32, NegRHSExt, Quot, LHSExt, Flags);
5580 auto Tmp =
B.buildFMul(
S32, Err, Rcp, Flags);
5581 Tmp =
B.buildAnd(
S32, Tmp,
B.buildConstant(
S32, 0xff800000));
5582 Quot =
B.buildFAdd(
S32, Tmp, Quot, Flags);
5583 auto RDst =
B.buildFPTrunc(
S16, Quot, Flags);
5584 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5585 .addUse(RDst.getReg(0))
5590 MI.eraseFromParent();
5603 unsigned SPDenormMode =
5606 if (ST.hasDenormModeInst()) {
5608 uint32_t DPDenormModeDefault =
Mode.fpDenormModeDPValue();
5610 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
5611 B.buildInstr(AMDGPU::S_DENORM_MODE)
5612 .addImm(NewDenormModeValue);
5615 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
5616 .addImm(SPDenormMode)
5638 auto One =
B.buildFConstant(
S32, 1.0f);
5640 auto DenominatorScaled =
5641 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {
S32,
S1})
5646 auto NumeratorScaled =
5647 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {
S32,
S1})
5653 auto ApproxRcp =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {
S32})
5654 .addUse(DenominatorScaled.getReg(0))
5656 auto NegDivScale0 =
B.buildFNeg(
S32, DenominatorScaled, Flags);
5659 const bool HasDynamicDenormals =
5664 if (!PreservesDenormals) {
5665 if (HasDynamicDenormals) {
5667 B.buildInstr(AMDGPU::S_GETREG_B32)
5668 .addDef(SavedSPDenormMode)
5674 auto Fma0 =
B.buildFMA(
S32, NegDivScale0, ApproxRcp, One, Flags);
5675 auto Fma1 =
B.buildFMA(
S32, Fma0, ApproxRcp, ApproxRcp, Flags);
5676 auto Mul =
B.buildFMul(
S32, NumeratorScaled, Fma1, Flags);
5677 auto Fma2 =
B.buildFMA(
S32, NegDivScale0,
Mul, NumeratorScaled, Flags);
5678 auto Fma3 =
B.buildFMA(
S32, Fma2, Fma1,
Mul, Flags);
5679 auto Fma4 =
B.buildFMA(
S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
5681 if (!PreservesDenormals) {
5682 if (HasDynamicDenormals) {
5683 assert(SavedSPDenormMode);
5684 B.buildInstr(AMDGPU::S_SETREG_B32)
5685 .addReg(SavedSPDenormMode)
5691 auto Fmas =
B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {
S32})
5692 .addUse(Fma4.getReg(0))
5693 .addUse(Fma1.getReg(0))
5694 .addUse(Fma3.getReg(0))
5695 .addUse(NumeratorScaled.getReg(1))
5698 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5699 .addUse(Fmas.getReg(0))
5704 MI.eraseFromParent();
5723 auto One =
B.buildFConstant(
S64, 1.0);
5725 auto DivScale0 =
B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {
S64,
S1})
5731 auto NegDivScale0 =
B.buildFNeg(
S64, DivScale0.getReg(0), Flags);
5733 auto Rcp =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {
S64})
5734 .addUse(DivScale0.getReg(0))
5737 auto Fma0 =
B.buildFMA(
S64, NegDivScale0, Rcp, One, Flags);
5738 auto Fma1 =
B.buildFMA(
S64, Rcp, Fma0, Rcp, Flags);
5739 auto Fma2 =
B.buildFMA(
S64, NegDivScale0, Fma1, One, Flags);
5741 auto DivScale1 =
B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {
S64,
S1})
5747 auto Fma3 =
B.buildFMA(
S64, Fma1, Fma2, Fma1, Flags);
5748 auto Mul =
B.buildFMul(
S64, DivScale1.getReg(0), Fma3, Flags);
5749 auto Fma4 =
B.buildFMA(
S64, NegDivScale0,
Mul, DivScale1.getReg(0), Flags);
5752 if (!ST.hasUsableDivScaleConditionOutput()) {
5758 auto NumUnmerge =
B.buildUnmerge(
S32, LHS);
5759 auto DenUnmerge =
B.buildUnmerge(
S32, RHS);
5760 auto Scale0Unmerge =
B.buildUnmerge(
S32, DivScale0);
5761 auto Scale1Unmerge =
B.buildUnmerge(
S32, DivScale1);
5764 Scale1Unmerge.getReg(1));
5766 Scale0Unmerge.getReg(1));
5767 Scale =
B.buildXor(
S1, CmpNum, CmpDen).getReg(0);
5769 Scale = DivScale1.getReg(1);
5772 auto Fmas =
B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {
S64})
5773 .addUse(Fma4.getReg(0))
5774 .addUse(Fma3.getReg(0))
5775 .addUse(
Mul.getReg(0))
5779 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup,
ArrayRef(Res))
5780 .addUse(Fmas.getReg(0))
5785 MI.eraseFromParent();
5800 auto Mant =
B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5803 auto Exp =
B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5807 if (ST.hasFractBug()) {
5808 auto Fabs =
B.buildFAbs(Ty, Val);
5812 auto Zero =
B.buildConstant(InstrExpTy, 0);
5813 Exp =
B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5814 Mant =
B.buildSelect(Ty, IsFinite, Mant, Val);
5817 B.buildCopy(Res0, Mant);
5818 B.buildSExtOrTrunc(Res1, Exp);
5820 MI.eraseFromParent();
5835 auto Abs =
B.buildFAbs(
S32, RHS, Flags);
5838 auto C0 =
B.buildFConstant(
S32, 0x1p+96f);
5839 auto C1 =
B.buildFConstant(
S32, 0x1p-32f);
5840 auto C2 =
B.buildFConstant(
S32, 1.0f);
5843 auto Sel =
B.buildSelect(
S32, CmpRes, C1, C2, Flags);
5845 auto Mul0 =
B.buildFMul(
S32, RHS, Sel, Flags);
5847 auto RCP =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {
S32})
5848 .addUse(Mul0.getReg(0))
5851 auto Mul1 =
B.buildFMul(
S32, LHS, RCP, Flags);
5853 B.buildFMul(Res, Sel, Mul1, Flags);
5855 MI.eraseFromParent();
5864 unsigned Flags =
MI.getFlags();
5865 assert(!ST.has16BitInsts());
5867 auto Ext =
B.buildFPExt(
F32,
MI.getOperand(1), Flags);
5868 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {
F32})
5869 .addUse(Ext.getReg(0))
5871 B.buildFPTrunc(
MI.getOperand(0),
Log2, Flags);
5872 MI.eraseFromParent();
5882 const unsigned Flags =
MI.getFlags();
5891 MI.eraseFromParent();
5895 auto ScaleThreshold =
B.buildFConstant(
F32, 0x1.0p-96f);
5897 auto ScaleUpFactor =
B.buildFConstant(
F32, 0x1.0p+32f);
5898 auto ScaledX =
B.buildFMul(
F32,
X, ScaleUpFactor, Flags);
5899 auto SqrtX =
B.buildSelect(
F32, NeedScale, ScaledX,
X, Flags);
5904 .addUse(SqrtX.getReg(0))
5907 auto NegOne =
B.buildConstant(I32, -1);
5908 auto SqrtSNextDown =
B.buildAdd(I32, SqrtS, NegOne);
5910 auto NegSqrtSNextDown =
B.buildFNeg(
F32, SqrtSNextDown, Flags);
5911 auto SqrtVP =
B.buildFMA(
F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5913 auto PosOne =
B.buildConstant(I32, 1);
5914 auto SqrtSNextUp =
B.buildAdd(I32, SqrtS, PosOne);
5916 auto NegSqrtSNextUp =
B.buildFNeg(
F32, SqrtSNextUp, Flags);
5917 auto SqrtVS =
B.buildFMA(
F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5919 auto Zero =
B.buildFConstant(
F32, 0.0f);
5923 B.buildSelect(
F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5927 B.buildSelect(
F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5930 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {
F32}).addReg(SqrtX.getReg(0));
5931 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5933 auto Half =
B.buildFConstant(
F32, 0.5f);
5934 auto SqrtH =
B.buildFMul(
F32, SqrtR, Half, Flags);
5935 auto NegSqrtH =
B.buildFNeg(
F32, SqrtH, Flags);
5936 auto SqrtE =
B.buildFMA(
F32, NegSqrtH, SqrtS, Half, Flags);
5937 SqrtH =
B.buildFMA(
F32, SqrtH, SqrtE, SqrtH, Flags);
5938 SqrtS =
B.buildFMA(
F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5939 auto NegSqrtS =
B.buildFNeg(
F32, SqrtS, Flags);
5940 auto SqrtD =
B.buildFMA(
F32, NegSqrtS, SqrtS, SqrtX, Flags);
5941 SqrtS =
B.buildFMA(
F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5944 auto ScaleDownFactor =
B.buildFConstant(
F32, 0x1.0p-16f);
5946 auto ScaledDown =
B.buildFMul(
F32, SqrtS, ScaleDownFactor, Flags);
5948 SqrtS =
B.buildSelect(
F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5951 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5953 MI.eraseFromParent();
5988 unsigned Flags =
MI.getFlags();
5993 auto ScaleConstant =
B.buildFConstant(
F64, 0x1.0p-767);
5995 ZeroInt =
B.buildConstant(
S32, 0).getReg(0);
5999 auto ScaleUpFactor =
B.buildConstant(
S32, 256);
6000 auto ScaleUp =
B.buildSelect(
S32, Scaling, ScaleUpFactor, ZeroInt);
6001 SqrtX =
B.buildFLdexp(
F64,
X, ScaleUp, Flags).getReg(0);
6004 auto SqrtY =
B.buildIntrinsic(Intrinsic::amdgcn_rsq, {
F64}).addReg(SqrtX);
6006 auto Half =
B.buildFConstant(
F64, 0.5);
6007 auto SqrtH0 =
B.buildFMul(
F64, SqrtY, Half);
6008 auto SqrtS0 =
B.buildFMul(
F64, SqrtX, SqrtY);
6010 auto NegSqrtH0 =
B.buildFNeg(
F64, SqrtH0);
6011 auto SqrtR0 =
B.buildFMA(
F64, NegSqrtH0, SqrtS0, Half);
6013 auto SqrtS1 =
B.buildFMA(
F64, SqrtS0, SqrtR0, SqrtS0);
6014 auto SqrtH1 =
B.buildFMA(
F64, SqrtH0, SqrtR0, SqrtH0);
6016 auto NegSqrtS1 =
B.buildFNeg(
F64, SqrtS1);
6017 auto SqrtD0 =
B.buildFMA(
F64, NegSqrtS1, SqrtS1, SqrtX);
6019 auto SqrtS2 =
B.buildFMA(
F64, SqrtD0, SqrtH1, SqrtS1);
6021 Register SqrtRet = SqrtS2.getReg(0);
6023 auto NegSqrtS2 =
B.buildFNeg(
F64, SqrtS2);
6024 auto SqrtD1 =
B.buildFMA(
F64, NegSqrtS2, SqrtS2, SqrtX);
6025 auto SqrtD2 =
B.buildFMA(
F64, SqrtD1, SqrtH1, SqrtS2);
6028 auto ScaleDownFactor =
B.buildConstant(
S32, -128);
6029 auto ScaleDown =
B.buildSelect(
S32, Scaling, ScaleDownFactor, ZeroInt);
6030 SqrtRet =
B.buildFLdexp(
F64, SqrtD2, ScaleDown, Flags).getReg(0);
6035 auto ZeroFP =
B.buildFConstant(
F64, 0.0);
6044 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
6046 MI.eraseFromParent();
6077 auto Flags =
MI.getFlags();
6089 auto Rsq =
B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
6099 auto ClampMax = UseIEEE ?
B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
6100 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
6105 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
6107 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
6108 MI.eraseFromParent();
6120 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6121 IID == Intrinsic::amdgcn_permlanex16;
6122 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6123 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6127 auto LaneOp =
B.buildIntrinsic(IID, {VT}).addUse(Src0);
6129 case Intrinsic::amdgcn_readfirstlane:
6130 case Intrinsic::amdgcn_permlane64:
6131 return LaneOp.getReg(0);
6132 case Intrinsic::amdgcn_readlane:
6133 case Intrinsic::amdgcn_set_inactive:
6134 case Intrinsic::amdgcn_set_inactive_chain_arg:
6135 return LaneOp.addUse(Src1).getReg(0);
6136 case Intrinsic::amdgcn_writelane:
6137 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
6138 case Intrinsic::amdgcn_permlane16:
6139 case Intrinsic::amdgcn_permlanex16: {
6141 int64_t Src4 =
MI.getOperand(6).getImm();
6142 int64_t Src5 =
MI.getOperand(7).getImm();
6143 return LaneOp.addUse(Src1)
6150 case Intrinsic::amdgcn_mov_dpp8:
6151 return LaneOp.addImm(
MI.getOperand(3).getImm()).getReg(0);
6152 case Intrinsic::amdgcn_update_dpp:
6153 return LaneOp.addUse(Src1)
6154 .addImm(
MI.getOperand(4).getImm())
6155 .addImm(
MI.getOperand(5).getImm())
6156 .addImm(
MI.getOperand(6).getImm())
6157 .addImm(
MI.getOperand(7).getImm())
6167 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6168 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6169 Src1 =
MI.getOperand(3).getReg();
6170 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
6171 Src2 =
MI.getOperand(4).getReg();
6176 unsigned Size = Ty.getSizeInBits();
6178 unsigned SplitSize = 32;
6179 if (IID == Intrinsic::amdgcn_update_dpp && (
Size % 64 == 0) &&
6180 ST.hasDPALU_DPP() &&
6184 if (
Size == SplitSize) {
6190 Src0 =
B.buildAnyExt(
S32, Src0).getReg(0);
6192 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6195 if (IID == Intrinsic::amdgcn_writelane)
6198 Register LaneOpDst = createLaneOp(Src0, Src1, Src2,
S32);
6199 B.buildTrunc(DstReg, LaneOpDst);
6200 MI.eraseFromParent();
6204 if (
Size % SplitSize != 0)
6208 bool NeedsBitcast =
false;
6209 if (Ty.isVector()) {
6212 if (EltSize == SplitSize) {
6213 PartialResTy = EltTy;
6214 }
else if (EltSize == 16 || EltSize == 32) {
6215 unsigned NElem = SplitSize / EltSize;
6219 NeedsBitcast =
true;
6224 unsigned NumParts =
Size / SplitSize;
6228 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6229 Src1Parts =
B.buildUnmerge(PartialResTy, Src1);
6231 if (IID == Intrinsic::amdgcn_writelane)
6232 Src2Parts =
B.buildUnmerge(PartialResTy, Src2);
6234 for (
unsigned i = 0; i < NumParts; ++i) {
6235 Src0 = Src0Parts.
getReg(i);
6237 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6238 Src1 = Src1Parts.
getReg(i);
6240 if (IID == Intrinsic::amdgcn_writelane)
6241 Src2 = Src2Parts.
getReg(i);
6243 PartialRes.
push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
6247 B.buildBitcast(DstReg,
B.buildMergeLikeInstr(
6250 B.buildMergeLikeInstr(DstReg, PartialRes);
6252 MI.eraseFromParent();
6260 ST.getTargetLowering()->getImplicitParameterOffset(
6270 B.buildObjectPtrOffset(DstReg, KernargPtrReg,
6271 B.buildConstant(IdxTy,
Offset).getReg(0));
6282 Register Pointer =
MI.getOperand(2).getReg();
6284 Register NumRecords =
MI.getOperand(4).getReg();
6290 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6292 auto ExtStride =
B.buildAnyExt(
S32, Stride);
6294 if (ST.has45BitNumRecordsBufferResource()) {
6299 auto PointerInt =
B.buildPtrToInt(PtrIntTy, Pointer);
6300 auto ExtPointer =
B.buildAnyExtOrTrunc(
S64, PointerInt);
6301 auto NumRecordsLHS =
B.buildShl(
S64, NumRecords,
B.buildConstant(
S32, 57));
6302 Register LowHalf =
B.buildOr(
S64, ExtPointer, NumRecordsLHS).getReg(0);
6306 auto NumRecordsRHS =
B.buildLShr(
S64, NumRecords,
B.buildConstant(
S32, 7));
6307 auto ShiftedStride =
B.buildShl(
S32, ExtStride,
B.buildConstant(
S32, 12));
6308 auto ExtShiftedStride =
6309 B.buildMergeValues(
S64, {Zero, ShiftedStride.getReg(0)});
6310 auto ShiftedFlags =
B.buildShl(
S32, Flags,
B.buildConstant(
S32, 28));
6311 auto ExtShiftedFlags =
6312 B.buildMergeValues(
S64, {Zero, ShiftedFlags.getReg(0)});
6313 auto CombinedFields =
B.buildOr(
S64, NumRecordsRHS, ExtShiftedStride);
6315 B.buildOr(
S64, CombinedFields, ExtShiftedFlags).getReg(0);
6316 B.buildMergeValues(Result, {LowHalf, HighHalf});
6318 NumRecords =
B.buildTrunc(
S32, NumRecords).getReg(0);
6319 auto Unmerge =
B.buildUnmerge(
S32, Pointer);
6320 auto LowHalf = Unmerge.getReg(0);
6321 auto HighHalf = Unmerge.getReg(1);
6323 auto AndMask =
B.buildConstant(
S32, 0x0000ffff);
6324 auto Masked =
B.buildAnd(
S32, HighHalf, AndMask);
6325 auto ShiftConst =
B.buildConstant(
S32, 16);
6326 auto ShiftedStride =
B.buildShl(
S32, ExtStride, ShiftConst);
6327 auto NewHighHalf =
B.buildOr(
S32,
Masked, ShiftedStride);
6328 Register NewHighHalfReg = NewHighHalf.getReg(0);
6329 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
6332 MI.eraseFromParent();
6349 MI.eraseFromParent();
6357 std::optional<uint32_t> KnownSize =
6359 if (KnownSize.has_value())
6360 B.buildConstant(DstReg, *KnownSize);
6378 MI.eraseFromParent();
6385 unsigned AddrSpace)
const {
6387 auto Unmerge =
B.buildUnmerge(
S32,
MI.getOperand(2).getReg());
6391 ST.hasGloballyAddressableScratch()) {
6393 B.buildInstr(AMDGPU::S_MOV_B32, {
S32},
6394 {
Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
6396 MRI.
setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
6398 Register XOR =
B.buildXor(
S32, Hi32, FlatScratchBaseHi).getReg(0);
6400 B.buildConstant(
S32, 1u << 26));
6405 MI.eraseFromParent();
6415std::pair<Register, unsigned>
6427 bool CheckNUW = ST.hasGFX1250Insts();
6429 MRI, OrigOffset,
nullptr, CheckNUW);
6433 BaseReg =
B.buildPtrToInt(MRI.
getType(OrigOffset), BaseReg).getReg(0);
6443 unsigned Overflow = ImmOffset & ~MaxImm;
6444 ImmOffset -= Overflow;
6445 if ((int32_t)Overflow < 0) {
6446 Overflow += ImmOffset;
6450 if (Overflow != 0) {
6452 BaseReg =
B.buildConstant(
S32, Overflow).getReg(0);
6454 auto OverflowVal =
B.buildConstant(
S32, Overflow);
6455 BaseReg =
B.buildAdd(
S32, BaseReg, OverflowVal).getReg(0);
6460 BaseReg =
B.buildConstant(
S32, 0).getReg(0);
6462 return std::pair(BaseReg, ImmOffset);
6469 bool ImageStore)
const {
6475 if (ST.hasUnpackedD16VMem()) {
6476 auto Unmerge =
B.buildUnmerge(
S16, Reg);
6479 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
6480 WideRegs.
push_back(
B.buildAnyExt(
S32, Unmerge.getReg(
I)).getReg(0));
6488 if (ImageStore && ST.hasImageStoreD16Bug()) {
6491 Reg =
B.buildBitcast(
S32, Reg).getReg(0);
6493 PackedRegs.
resize(2,
B.buildUndef(
S32).getReg(0));
6500 auto Unmerge =
B.buildUnmerge(
S16, Reg);
6501 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
6503 PackedRegs.
resize(6,
B.buildUndef(
S16).getReg(0));
6511 auto Unmerge =
B.buildUnmerge(
S32, Reg);
6512 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
6514 PackedRegs.
resize(4,
B.buildUndef(
S32).getReg(0));
6531 bool IsFormat)
const {
6543 VData =
B.buildBitcast(Ty, VData).getReg(0);
6551 if (Ty.isVector()) {
6552 if (Ty.getElementType() ==
S16 && Ty.getNumElements() <= 4) {
6564 bool IsFormat)
const {
6571 const bool IsD16 = IsFormat && (EltTy.
getSizeInBits() == 16);
6586 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6589 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps;
6593 VIndex =
MI.getOperand(3).getReg();
6596 VIndex =
B.buildConstant(
S32, 0).getReg(0);
6599 Register VOffset =
MI.getOperand(3 + OpOffset).getReg();
6600 Register SOffset =
MI.getOperand(4 + OpOffset).getReg();
6604 Format =
MI.getOperand(5 + OpOffset).getImm();
6608 unsigned AuxiliaryData =
MI.getOperand(5 + OpOffset).getImm();
6614 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
6615 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
6616 }
else if (IsFormat) {
6617 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
6618 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
6622 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
6625 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
6628 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
6633 auto MIB =
B.buildInstr(
Opc)
6644 MIB.addImm(AuxiliaryData)
6645 .addImm(HasVIndex ? -1 : 0)
6646 .addMemOperand(MMO);
6648 MI.eraseFromParent();
6654 unsigned ImmOffset,
unsigned Format,
6657 auto MIB =
B.buildInstr(
Opc)
6668 MIB.addImm(AuxiliaryData)
6669 .addImm(HasVIndex ? -1 : 0)
6670 .addMemOperand(MMO);
6676 bool IsTyped)
const {
6690 assert(
MI.getNumExplicitDefs() == 1 ||
MI.getNumExplicitDefs() == 2);
6691 bool IsTFE =
MI.getNumExplicitDefs() == 2;
6693 StatusDst =
MI.getOperand(1).getReg();
6698 Register RSrc =
MI.getOperand(2 + OpOffset).getReg();
6701 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6704 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps + OpOffset;
6707 VIndex =
MI.getOperand(3 + OpOffset).getReg();
6710 VIndex =
B.buildConstant(
S32, 0).getReg(0);
6713 Register VOffset =
MI.getOperand(3 + OpOffset).getReg();
6714 Register SOffset =
MI.getOperand(4 + OpOffset).getReg();
6718 Format =
MI.getOperand(5 + OpOffset).getImm();
6722 unsigned AuxiliaryData =
MI.getOperand(5 + OpOffset).getImm();
6732 Dst =
MI.getOperand(0).getReg();
6733 B.setInsertPt(
B.getMBB(),
MI);
6740 Dst =
MI.getOperand(0).getReg();
6741 B.setInsertPt(
B.getMBB(),
MI);
6745 const bool IsD16 = IsFormat && (EltTy.
getSizeInBits() == 16);
6746 const bool Unpacked = ST.hasUnpackedD16VMem();
6756 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6757 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6758 }
else if (IsFormat) {
6762 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6764 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6765 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6770 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6771 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6774 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6775 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6778 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6779 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6785 unsigned NumValueDWords =
divideCeil(Ty.getSizeInBits(), 32);
6786 unsigned NumLoadDWords = NumValueDWords + 1;
6788 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(LoadTy);
6790 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6792 Register ExtDst =
B.getMRI()->createGenericVirtualRegister(
S32);
6793 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6794 B.buildTrunc(Dst, ExtDst);
6795 }
else if (NumValueDWords == 1) {
6796 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6799 for (
unsigned I = 0;
I != NumValueDWords; ++
I)
6800 LoadElts.
push_back(
B.getMRI()->createGenericVirtualRegister(
S32));
6802 B.buildUnmerge(LoadElts, LoadDstReg);
6804 B.buildMergeLikeInstr(Dst, LoadElts);
6807 (IsD16 && !Ty.isVector())) {
6808 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(
S32);
6810 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6811 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6812 B.buildTrunc(Dst, LoadDstReg);
6813 }
else if (Unpacked && IsD16 && Ty.isVector()) {
6815 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6817 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6818 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6820 auto Unmerge =
B.buildUnmerge(
S32, LoadDstReg);
6822 for (
unsigned I = 0,
N = Unmerge->getNumOperands() - 1;
I !=
N; ++
I)
6823 Repack.
push_back(
B.buildTrunc(EltTy, Unmerge.getReg(
I)).getReg(0));
6824 B.buildMergeLikeInstr(Dst, Repack);
6827 AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6830 MI.eraseFromParent();
6836 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6837 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6838 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6839 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6840 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6841 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6842 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6843 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6844 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6845 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6846 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6847 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6848 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6849 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6850 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6851 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6852 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6853 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6854 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6855 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6856 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6857 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6858 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6859 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6860 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6861 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6862 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6863 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6864 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6865 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6866 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6867 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6868 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6869 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6870 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6871 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6872 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6873 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6874 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6875 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6876 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6877 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6878 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6879 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6880 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6881 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6882 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6883 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6884 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6885 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6886 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6887 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6888 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6889 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6890 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6891 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6892 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6893 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6894 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6895 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6896 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6897 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6898 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6899 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6900 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6901 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6902 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6903 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6904 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6905 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6906 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6907 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6908 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6909 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6910 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6911 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6912 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6913 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6914 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6915 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6916 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
6917 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
6918 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
6919 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
6920 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
6921 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6922 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
6923 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6924 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
6925 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6934 const bool IsCmpSwap =
6935 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6936 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6937 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6938 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6949 CmpVal =
MI.getOperand(3).getReg();
6954 Register RSrc =
MI.getOperand(3 + OpOffset).getReg();
6955 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6958 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps;
6961 VIndex =
MI.getOperand(4 + OpOffset).getReg();
6964 VIndex =
B.buildConstant(
LLT::scalar(32), 0).getReg(0);
6967 Register VOffset =
MI.getOperand(4 + OpOffset).getReg();
6968 Register SOffset =
MI.getOperand(5 + OpOffset).getReg();
6969 unsigned AuxiliaryData =
MI.getOperand(6 + OpOffset).getImm();
6988 .addImm(AuxiliaryData)
6989 .addImm(HasVIndex ? -1 : 0)
6990 .addMemOperand(MMO);
6992 MI.eraseFromParent();
7002 bool IsA16,
bool IsG16) {
7018 (
B.getMRI()->getType(AddrReg) ==
S16)) {
7023 B.buildBuildVector(
V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7027 "Bias needs to be converted to 16 bit in A16 mode");
7029 AddrReg =
B.buildBitcast(
V2S16, AddrReg).getReg(0);
7035 if (((
I + 1) >= EndIdx) ||
7042 !
MI.getOperand(ArgOffset +
I + 1).isReg()) {
7044 B.buildBuildVector(
V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7049 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
7060 int DimIdx,
int NumVAddrs) {
7064 for (
int I = 0;
I != NumVAddrs; ++
I) {
7066 if (
SrcOp.isReg()) {
7072 int NumAddrRegs = AddrRegs.
size();
7073 if (NumAddrRegs != 1) {
7076 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
7079 for (
int I = 1;
I != NumVAddrs; ++
I) {
7082 MI.getOperand(DimIdx +
I).setReg(AMDGPU::NoRegister);
7104 const unsigned NumDefs =
MI.getNumExplicitDefs();
7105 const unsigned ArgOffset = NumDefs + 1;
7106 bool IsTFE = NumDefs == 2;
7124 VData =
MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
7128 const bool IsAtomicPacked16Bit =
7129 (BaseOpcode->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7130 BaseOpcode->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7138 ST.hasG16() ? (BaseOpcode->
Gradients && GradTy ==
S16) : GradTy ==
S16;
7139 const bool IsA16 = AddrTy ==
S16;
7140 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() ==
S16;
7143 if (!BaseOpcode->
Atomic) {
7144 DMask =
MI.getOperand(ArgOffset + Intr->
DMaskIndex).getImm();
7147 }
else if (DMask != 0) {
7149 }
else if (!IsTFE && !BaseOpcode->
Store) {
7151 B.buildUndef(
MI.getOperand(0));
7152 MI.eraseFromParent();
7160 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
7161 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
7162 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
7163 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
7164 unsigned NewOpcode = LoadOpcode;
7165 if (BaseOpcode->
Store)
7166 NewOpcode = StoreOpcode;
7168 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
7171 MI.setDesc(
B.getTII().get(NewOpcode));
7175 if (IsTFE && DMask == 0) {
7178 MI.getOperand(ArgOffset + Intr->
DMaskIndex).setImm(DMask);
7181 if (BaseOpcode->
Atomic) {
7186 if (Ty.isVector() && !IsAtomicPacked16Bit)
7193 auto Concat =
B.buildBuildVector(PackedTy, {VData0, VData1});
7194 MI.getOperand(2).setReg(
Concat.getReg(0));
7195 MI.getOperand(3).setReg(AMDGPU::NoRegister);
7199 unsigned CorrectedNumVAddrs = Intr->
NumVAddrs;
7202 if (BaseOpcode->
Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
7208 if (IsA16 && !ST.hasA16()) {
7213 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->
Sampler);
7214 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
7216 if (IsA16 || IsG16) {
7224 const bool UseNSA = ST.hasNSAEncoding() &&
7225 PackedRegs.
size() >= ST.getNSAThreshold(MF) &&
7226 (PackedRegs.
size() <= NSAMaxSize || HasPartialNSA);
7227 const bool UsePartialNSA =
7228 UseNSA && HasPartialNSA && PackedRegs.
size() > NSAMaxSize;
7230 if (UsePartialNSA) {
7234 auto Concat =
B.buildConcatVectors(
7235 PackedAddrTy,
ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
7236 PackedRegs[NSAMaxSize - 1] =
Concat.getReg(0);
7237 PackedRegs.
resize(NSAMaxSize);
7238 }
else if (!UseNSA && PackedRegs.
size() > 1) {
7240 auto Concat =
B.buildConcatVectors(PackedAddrTy, PackedRegs);
7241 PackedRegs[0] =
Concat.getReg(0);
7245 const unsigned NumPacked = PackedRegs.
size();
7248 if (!
SrcOp.isReg()) {
7258 SrcOp.setReg(AMDGPU::NoRegister);
7275 const bool UseNSA = ST.hasNSAEncoding() &&
7276 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
7277 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
7278 const bool UsePartialNSA =
7279 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
7281 if (UsePartialNSA) {
7283 ArgOffset + Intr->
VAddrStart + NSAMaxSize - 1,
7285 }
else if (!UseNSA && Intr->
NumVAddrs > 1) {
7300 if (!Ty.isVector() || !IsD16)
7304 if (RepackedReg != VData) {
7305 MI.getOperand(1).setReg(RepackedReg);
7313 const int NumElts = Ty.
isVector() ? Ty.getNumElements() : 1;
7316 if (NumElts < DMaskLanes)
7319 if (NumElts > 4 || DMaskLanes > 4)
7329 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
7330 const LLT AdjustedTy =
7346 if (IsD16 && ST.hasUnpackedD16VMem()) {
7353 unsigned RoundedElts = (AdjustedTy.
getSizeInBits() + 31) / 32;
7354 unsigned RoundedSize = 32 * RoundedElts;
7358 RegTy = !IsTFE && EltSize == 16 ?
V2S16 :
S32;
7363 if (!IsTFE && (RoundedTy == Ty || !Ty.
isVector()))
7369 B.setInsertPt(*
MI.getParent(), ++
MI.getIterator());
7373 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
7374 const int ResultNumRegs = LoadResultTy.
getSizeInBits() / 32;
7378 MI.getOperand(0).setReg(NewResultReg);
7386 Dst1Reg =
MI.getOperand(1).getReg();
7391 MI.removeOperand(1);
7395 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
7404 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
7406 if (ResultNumRegs == 1) {
7408 ResultRegs[0] = NewResultReg;
7411 for (
int I = 0;
I != NumDataRegs; ++
I)
7413 B.buildUnmerge(ResultRegs, NewResultReg);
7418 ResultRegs.
resize(NumDataRegs);
7423 if (IsD16 && !Ty.isVector()) {
7424 B.buildTrunc(DstReg, ResultRegs[0]);
7429 if (Ty ==
V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
7430 B.buildBitcast(DstReg, ResultRegs[0]);
7442 if (RegTy !=
V2S16 && !ST.hasUnpackedD16VMem()) {
7444 Reg =
B.buildBitcast(
V2S16, Reg).getReg(0);
7445 }
else if (ST.hasUnpackedD16VMem()) {
7447 Reg =
B.buildTrunc(
S16, Reg).getReg(0);
7451 auto padWithUndef = [&](
LLT Ty,
int NumElts) {
7455 for (
int I = 0;
I != NumElts; ++
I)
7462 padWithUndef(ResTy, NumElts - ResultRegs.
size());
7463 B.buildBuildVector(DstReg, ResultRegs);
7467 assert(!ST.hasUnpackedD16VMem() && ResTy ==
V2S16);
7468 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
7474 if (ResultRegs.
size() == 1) {
7475 NewResultReg = ResultRegs[0];
7476 }
else if (ResultRegs.
size() == 2) {
7478 NewResultReg =
B.buildConcatVectors(
V4S16, ResultRegs).getReg(0);
7486 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
7488 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
7493 padWithUndef(ResTy, RegsToCover - ResultRegs.
size());
7494 B.buildConcatVectors(DstReg, ResultRegs);
7503 Register OrigDst =
MI.getOperand(0).getReg();
7505 LLT Ty =
B.getMRI()->getType(OrigDst);
7506 unsigned Size = Ty.getSizeInBits();
7509 if (
Size < 32 && ST.hasScalarSubwordLoads()) {
7511 Opc =
Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
7512 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
7515 Dst =
B.getMRI()->createGenericVirtualRegister(
LLT::scalar(32));
7517 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
7526 B.setInsertPt(
B.getMBB(),
MI);
7531 B.setInsertPt(
B.getMBB(),
MI);
7537 MI.setDesc(
B.getTII().get(
Opc));
7538 MI.removeOperand(1);
7541 const unsigned MemSize = (
Size + 7) / 8;
7542 const Align MemAlign =
B.getDataLayout().getABITypeAlign(
7549 MI.addMemOperand(MF, MMO);
7550 if (Dst != OrigDst) {
7551 MI.getOperand(0).setReg(Dst);
7552 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
7553 B.buildTrunc(OrigDst, Dst);
7575 MI.setDesc(
B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7576 MI.removeOperand(0);
7586 if (!ST.hasTrapHandler() ||
7590 return ST.supportsGetDoorbellID() ?
7603 MI.eraseFromParent();
7613 BuildMI(*TrapBB, TrapBB->
end(),
DL,
B.getTII().get(AMDGPU::S_ENDPGM))
7615 BuildMI(BB, &
MI,
DL,
B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
7619 MI.eraseFromParent();
7628 Register SGPR01(AMDGPU::SGPR0_SGPR1);
7635 ST.getTargetLowering()->getImplicitParameterOffset(
B.getMF(), Param);
7655 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
7658 Register Temp =
B.buildLoad(
S64, LoadAddr, *MMO).getReg(0);
7659 B.buildCopy(SGPR01, Temp);
7660 B.buildInstr(AMDGPU::S_TRAP)
7663 MI.eraseFromParent();
7674 B.buildCopy(SGPR01, LiveIn);
7675 B.buildInstr(AMDGPU::S_TRAP)
7679 MI.eraseFromParent();
7688 if (ST.hasPrivEnabledTrap2NopBug()) {
7689 ST.getInstrInfo()->insertSimulatedTrap(MRI,
B.getMBB(),
MI,
7691 MI.eraseFromParent();
7695 B.buildInstr(AMDGPU::S_TRAP)
7697 MI.eraseFromParent();
7706 if (!ST.hasTrapHandler() ||
7710 Fn,
"debugtrap handler not supported",
MI.getDebugLoc(),
DS_Warning));
7713 B.buildInstr(AMDGPU::S_TRAP)
7717 MI.eraseFromParent();
7730 Register NodePtr =
MI.getOperand(2).getReg();
7731 Register RayExtent =
MI.getOperand(3).getReg();
7732 Register RayOrigin =
MI.getOperand(4).getReg();
7734 Register RayInvDir =
MI.getOperand(6).getReg();
7737 if (!ST.hasGFX10_AEncoding()) {
7740 Fn,
"intrinsic not supported on subtarget",
MI.getDebugLoc()));
7749 const unsigned NumVDataDwords = 4;
7750 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7751 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7753 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7755 const unsigned BaseOpcodes[2][2] = {
7756 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7757 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7758 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7762 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7763 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7764 : AMDGPU::MIMGEncGfx10NSA,
7765 NumVDataDwords, NumVAddrDwords);
7769 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7770 : AMDGPU::MIMGEncGfx10Default,
7771 NumVDataDwords, NumVAddrDwords);
7776 if (UseNSA && IsGFX11Plus) {
7778 auto Unmerge =
B.buildUnmerge({
S32,
S32,
S32}, Src);
7779 auto Merged =
B.buildMergeLikeInstr(
7780 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7781 Ops.push_back(Merged.getReg(0));
7784 Ops.push_back(NodePtr);
7785 Ops.push_back(RayExtent);
7786 packLanes(RayOrigin);
7789 auto UnmergeRayDir =
B.buildUnmerge({
S16,
S16,
S16}, RayDir);
7790 auto UnmergeRayInvDir =
B.buildUnmerge({
S16,
S16,
S16}, RayInvDir);
7791 auto MergedDir =
B.buildMergeLikeInstr(
7794 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(0),
7795 UnmergeRayDir.getReg(0)}))
7798 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(1),
7799 UnmergeRayDir.getReg(1)}))
7802 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(2),
7803 UnmergeRayDir.getReg(2)}))
7805 Ops.push_back(MergedDir.getReg(0));
7808 packLanes(RayInvDir);
7812 auto Unmerge =
B.buildUnmerge({
S32,
S32}, NodePtr);
7813 Ops.push_back(Unmerge.getReg(0));
7814 Ops.push_back(Unmerge.getReg(1));
7816 Ops.push_back(NodePtr);
7818 Ops.push_back(RayExtent);
7821 auto Unmerge =
B.buildUnmerge({
S32,
S32,
S32}, Src);
7822 Ops.push_back(Unmerge.getReg(0));
7823 Ops.push_back(Unmerge.getReg(1));
7824 Ops.push_back(Unmerge.getReg(2));
7827 packLanes(RayOrigin);
7829 auto UnmergeRayDir =
B.buildUnmerge({
S16,
S16,
S16}, RayDir);
7830 auto UnmergeRayInvDir =
B.buildUnmerge({
S16,
S16,
S16}, RayInvDir);
7834 B.buildMergeLikeInstr(R1,
7835 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7836 B.buildMergeLikeInstr(
7837 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7838 B.buildMergeLikeInstr(
7839 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7845 packLanes(RayInvDir);
7852 Register MergedOps =
B.buildMergeLikeInstr(OpTy,
Ops).getReg(0);
7854 Ops.push_back(MergedOps);
7857 auto MIB =
B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7866 .addImm(IsA16 ? 1 : 0)
7869 MI.eraseFromParent();
7879 Register DstOrigin =
MI.getOperand(1).getReg();
7881 Register NodePtr =
MI.getOperand(4).getReg();
7882 Register RayExtent =
MI.getOperand(5).getReg();
7883 Register InstanceMask =
MI.getOperand(6).getReg();
7884 Register RayOrigin =
MI.getOperand(7).getReg();
7886 Register Offsets =
MI.getOperand(9).getReg();
7887 Register TDescr =
MI.getOperand(10).getReg();
7889 if (!ST.hasBVHDualAndBVH8Insts()) {
7892 Fn,
"intrinsic not supported on subtarget",
MI.getDebugLoc()));
7897 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7898 const unsigned NumVDataDwords = 10;
7899 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7901 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7902 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7903 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
7906 auto RayExtentInstanceMaskVec =
B.buildMergeLikeInstr(
7907 V2S32, {RayExtent,
B.buildAnyExt(
S32, InstanceMask)});
7909 B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7910 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7916 .addUse(RayExtentInstanceMaskVec.getReg(0))
7923 MI.eraseFromParent();
7932 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7933 MI.eraseFromParent();
7940 if (!ST.hasArchitectedSGPRs())
7944 auto TTMP8 =
B.buildCopy(
S32,
Register(AMDGPU::TTMP8));
7945 auto LSB =
B.buildConstant(
S32, 25);
7946 auto Width =
B.buildConstant(
S32, 5);
7947 B.buildUbfx(DstReg, TTMP8, LSB, Width);
7948 MI.eraseFromParent();
7956 unsigned Width)
const {
7960 MRI.
setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
7961 B.buildInstr(AMDGPU::S_GETREG_B32_const)
7964 MI.eraseFromParent();
7982 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {
S32},
7986 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {
S32},
7989 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
7990 MI.eraseFromParent();
8001 auto Unmerge =
B.buildUnmerge({
S32,
S32},
MI.getOperand(0));
8005 .addReg(Unmerge.getReg(0));
8009 .addReg(Unmerge.getReg(1));
8010 MI.eraseFromParent();
8022 case Intrinsic::sponentry:
8028 B.buildInstr(AMDGPU::G_AMDGPU_SPONENTRY).addDef(TmpReg);
8031 B.buildIntToPtr(DstReg, TmpReg);
8032 MI.eraseFromParent();
8034 int FI =
B.getMF().getFrameInfo().CreateFixedObject(
8036 B.buildFrameIndex(
MI.getOperand(0), FI);
8037 MI.eraseFromParent();
8040 case Intrinsic::amdgcn_if:
8041 case Intrinsic::amdgcn_else: {
8044 bool Negated =
false;
8056 std::swap(CondBrTarget, UncondBrTarget);
8058 B.setInsertPt(
B.getMBB(), BrCond->getIterator());
8059 if (IntrID == Intrinsic::amdgcn_if) {
8060 B.buildInstr(AMDGPU::SI_IF)
8063 .addMBB(UncondBrTarget);
8065 B.buildInstr(AMDGPU::SI_ELSE)
8068 .addMBB(UncondBrTarget);
8077 B.buildBr(*CondBrTarget);
8082 MI.eraseFromParent();
8083 BrCond->eraseFromParent();
8089 case Intrinsic::amdgcn_loop: {
8092 bool Negated =
false;
8102 std::swap(CondBrTarget, UncondBrTarget);
8104 B.setInsertPt(
B.getMBB(), BrCond->getIterator());
8105 B.buildInstr(AMDGPU::SI_LOOP)
8107 .addMBB(UncondBrTarget);
8112 B.buildBr(*CondBrTarget);
8114 MI.eraseFromParent();
8115 BrCond->eraseFromParent();
8122 case Intrinsic::amdgcn_addrspacecast_nonnull:
8124 case Intrinsic::amdgcn_make_buffer_rsrc:
8126 case Intrinsic::amdgcn_kernarg_segment_ptr:
8129 B.buildConstant(
MI.getOperand(0).getReg(), 0);
8130 MI.eraseFromParent();
8136 case Intrinsic::amdgcn_implicitarg_ptr:
8138 case Intrinsic::amdgcn_workitem_id_x:
8141 case Intrinsic::amdgcn_workitem_id_y:
8144 case Intrinsic::amdgcn_workitem_id_z:
8147 case Intrinsic::amdgcn_workgroup_id_x:
8152 case Intrinsic::amdgcn_workgroup_id_y:
8157 case Intrinsic::amdgcn_workgroup_id_z:
8162 case Intrinsic::amdgcn_cluster_id_x:
8163 return ST.hasClusters() &&
8166 case Intrinsic::amdgcn_cluster_id_y:
8167 return ST.hasClusters() &&
8170 case Intrinsic::amdgcn_cluster_id_z:
8171 return ST.hasClusters() &&
8174 case Intrinsic::amdgcn_cluster_workgroup_id_x:
8175 return ST.hasClusters() &&
8178 case Intrinsic::amdgcn_cluster_workgroup_id_y:
8179 return ST.hasClusters() &&
8182 case Intrinsic::amdgcn_cluster_workgroup_id_z:
8183 return ST.hasClusters() &&
8186 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
8187 return ST.hasClusters() &&
8189 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
8190 return ST.hasClusters() &&
8193 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
8194 return ST.hasClusters() &&
8197 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
8198 return ST.hasClusters() &&
8201 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
8202 return ST.hasClusters() &&
8206 case Intrinsic::amdgcn_wave_id:
8208 case Intrinsic::amdgcn_lds_kernel_id:
8211 case Intrinsic::amdgcn_dispatch_ptr:
8214 case Intrinsic::amdgcn_queue_ptr:
8217 case Intrinsic::amdgcn_implicit_buffer_ptr:
8220 case Intrinsic::amdgcn_dispatch_id:
8223 case Intrinsic::r600_read_ngroups_x:
8227 case Intrinsic::r600_read_ngroups_y:
8230 case Intrinsic::r600_read_ngroups_z:
8233 case Intrinsic::r600_read_local_size_x:
8236 case Intrinsic::r600_read_local_size_y:
8240 case Intrinsic::r600_read_local_size_z:
8243 case Intrinsic::amdgcn_fdiv_fast:
8245 case Intrinsic::amdgcn_is_shared:
8247 case Intrinsic::amdgcn_is_private:
8249 case Intrinsic::amdgcn_wavefrontsize: {
8250 B.buildConstant(
MI.getOperand(0), ST.getWavefrontSize());
8251 MI.eraseFromParent();
8254 case Intrinsic::amdgcn_s_buffer_load:
8256 case Intrinsic::amdgcn_raw_buffer_store:
8257 case Intrinsic::amdgcn_raw_ptr_buffer_store:
8258 case Intrinsic::amdgcn_struct_buffer_store:
8259 case Intrinsic::amdgcn_struct_ptr_buffer_store:
8261 case Intrinsic::amdgcn_raw_buffer_store_format:
8262 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
8263 case Intrinsic::amdgcn_struct_buffer_store_format:
8264 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
8266 case Intrinsic::amdgcn_raw_tbuffer_store:
8267 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
8268 case Intrinsic::amdgcn_struct_tbuffer_store:
8269 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
8271 case Intrinsic::amdgcn_raw_buffer_load:
8272 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8273 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8274 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8275 case Intrinsic::amdgcn_struct_buffer_load:
8276 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8277 case Intrinsic::amdgcn_struct_atomic_buffer_load:
8278 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
8280 case Intrinsic::amdgcn_raw_buffer_load_format:
8281 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
8282 case Intrinsic::amdgcn_struct_buffer_load_format:
8283 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
8285 case Intrinsic::amdgcn_raw_tbuffer_load:
8286 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
8287 case Intrinsic::amdgcn_struct_tbuffer_load:
8288 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
8290 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8291 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8292 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8293 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8294 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8295 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8296 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8297 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8298 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8299 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8300 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8301 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8302 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8303 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8304 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8305 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8306 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8307 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8308 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8309 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8310 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8311 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8312 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8313 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8314 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8315 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8316 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8317 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8318 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8319 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8320 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8321 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8322 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8323 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8324 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8325 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8326 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8327 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8328 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8329 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8330 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8331 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8332 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8333 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8334 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8335 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8336 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8337 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8338 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
8339 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
8340 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
8341 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
8342 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8343 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8344 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8345 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8346 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8347 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8348 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8349 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8350 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
8351 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
8352 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
8353 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
8354 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8355 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
8356 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8357 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
8358 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8359 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8360 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8361 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8363 case Intrinsic::amdgcn_rsq_clamp:
8365 case Intrinsic::amdgcn_image_bvh_intersect_ray:
8367 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
8368 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
8370 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
8371 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
8372 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
8373 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
8374 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
8375 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
8376 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
8377 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
8381 if (IndexArgTy !=
S64) {
8382 auto NewIndex = IndexArgTy.
isVector() ?
B.buildBitcast(
S64, Index)
8383 :
B.buildAnyExt(
S64, Index);
8384 MI.getOperand(5).setReg(NewIndex.getReg(0));
8388 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8389 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8390 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8391 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8392 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8393 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8394 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8395 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8399 MI.getOperand(5).setReg(
B.buildAnyExt(
S32, Index).getReg(0));
8402 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
8403 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
8404 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
8405 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
8406 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
8407 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
8408 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8409 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8410 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8412 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
8416 if (IndexArgTy != IdxTy) {
8417 auto NewIndex = IndexArgTy.
isVector() ?
B.buildBitcast(IdxTy, Index)
8418 :
B.buildAnyExt(IdxTy, Index);
8419 MI.getOperand(7).setReg(NewIndex.getReg(0));
8424 case Intrinsic::amdgcn_fmed3: {
8430 MI.setDesc(
B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
8431 MI.removeOperand(1);
8435 case Intrinsic::amdgcn_readlane:
8436 case Intrinsic::amdgcn_writelane:
8437 case Intrinsic::amdgcn_readfirstlane:
8438 case Intrinsic::amdgcn_permlane16:
8439 case Intrinsic::amdgcn_permlanex16:
8440 case Intrinsic::amdgcn_permlane64:
8441 case Intrinsic::amdgcn_set_inactive:
8442 case Intrinsic::amdgcn_set_inactive_chain_arg:
8443 case Intrinsic::amdgcn_mov_dpp8:
8444 case Intrinsic::amdgcn_update_dpp:
8446 case Intrinsic::amdgcn_s_buffer_prefetch_data:
8448 case Intrinsic::amdgcn_dead: {
8452 MI.eraseFromParent();
8455 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
8456 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
8457 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
8458 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8459 B.buildLoad(
MI.getOperand(0),
MI.getOperand(2), **
MI.memoperands_begin());
8460 MI.eraseFromParent();
8462 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
8463 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
8464 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
8465 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8466 B.buildStore(
MI.getOperand(2),
MI.getOperand(1), **
MI.memoperands_begin());
8467 MI.eraseFromParent();
8469 case Intrinsic::amdgcn_flat_load_monitor_b32:
8470 case Intrinsic::amdgcn_flat_load_monitor_b64:
8471 case Intrinsic::amdgcn_flat_load_monitor_b128:
8472 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8473 B.buildInstr(AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR)
8474 .add(
MI.getOperand(0))
8475 .add(
MI.getOperand(2))
8476 .addMemOperand(*
MI.memoperands_begin());
8477 MI.eraseFromParent();
8479 case Intrinsic::amdgcn_global_load_monitor_b32:
8480 case Intrinsic::amdgcn_global_load_monitor_b64:
8481 case Intrinsic::amdgcn_global_load_monitor_b128:
8482 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8483 B.buildInstr(AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR)
8484 .add(
MI.getOperand(0))
8485 .add(
MI.getOperand(2))
8486 .addMemOperand(*
MI.memoperands_begin());
8487 MI.eraseFromParent();
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID)
static LLT getBufferRsrcScalarType(const LLT Ty)
static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST, unsigned TypeIdx)
static cl::opt< bool > EnableNewLegality("amdgpu-global-isel-new-legality", cl::desc("Use GlobalISel desired legality, rather than try to use" "rules compatible with selection patterns"), cl::init(false), cl::ReallyHidden)
static MachineInstrBuilder buildExp(MachineIRBuilder &B, const DstOp &Dst, const SrcOp &Src, unsigned Flags)
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, unsigned Flags)
constexpr std::initializer_list< LLT > AllVectors
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)
static LegalityPredicate isSmallOddVector(unsigned TypeIdx)
static LegalizeMutation oneMoreElement(unsigned TypeIdx)
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size)
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags)
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, const LLT MemTy)
Return true if a load or store of the type should be lowered with a bitcast to a different type.
static constexpr unsigned FPEnvModeBitField
static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx)
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size)
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
static bool isRegisterVectorElementType(LLT EltTy)
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)
static LegalityPredicate isWideVec16(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllScalarTypes
static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllS32Vectors
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx)
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B)
Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is the form in which the valu...
static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty)
static std::pair< Register, Register > emitReciprocalU64(MachineIRBuilder &B, Register Val)
static LLT getBitcastRegisterType(const LLT Ty)
static LLT getBufferRsrcRegisterType(const LLT Ty)
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx)
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI)
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, MachineRegisterInfo &MRI, unsigned Idx)
Mutates IR (typically a load instruction) to use a <4 x s32> as the initial type of the operand idx an...
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C)
static constexpr unsigned SPDenormModeBitField
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, bool IsLoad, bool IsAtomic)
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllS64Vectors
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
static constexpr unsigned FPEnvTrapBitField
static constexpr unsigned MaxRegisterSize
static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size)
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static bool hasBufferRsrcWorkaround(const LLT Ty)
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
constexpr std::initializer_list< LLT > AllS16Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
static bool isRegisterVectorType(LLT Ty)
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
static bool isRegisterType(const GCNSubtarget &ST, LLT Ty)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Interface for Targets to specify which operations they can successfully select and how the others sho...
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
#define FP_DENORM_FLUSH_NONE
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static constexpr int Concat[]
bool legalizeConstHwRegRead(MachineInstr &MI, MachineIRBuilder &B, AMDGPU::Hwreg::Id HwReg, unsigned LowBit, unsigned Width) const
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeInsert(LegalizerHelper &Helper, MachineInstr &MI) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeBVHIntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferStore(MachineInstr &MI, LegalizerHelper &Helper, bool IsTyped, bool IsFormat) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSBufferPrefetch(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFExp10Unsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafeImpl(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags, bool IsExp10) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBVHDualOrBVH8IntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFEXPF64(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtract(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeBufferLoad(MachineInstr &MI, LegalizerHelper &Helper, bool IsFormat, bool IsTyped) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeCTLS(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkGroupId(MachineInstr &MI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy, bool IsFormat) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
void buildLoadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
bool isModuleEntryFunction() const
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool isBottomOfStack() const
bool isEntryFunction() const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const fltSemantics & IEEEsingle()
static const fltSemantics & IEEEdouble()
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
size_t size() const
size - Get the array size.
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
@ ICMP_SLT
signed less than
@ FCMP_OLT
0 1 0 0 True if ordered and less than
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
@ ICMP_UGE
unsigned greater or equal
@ ICMP_SGT
signed greater than
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
@ ICMP_ULT
unsigned less than
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
ConstantFP - Floating Point Values [float, double].
LLVM_ABI bool isExactlyValue(const APFloat &V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly not equal, like -0.0 and 0.0.
This is the shared class of boolean and integer constants.
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the type of this constant.
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
Simple wrapper observer that takes several observers, and calls each one for each event.
KnownBits getKnownBits(Register R)
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
LLT getScalarType() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI void computeTables()
Compute any ancillary tables needed to quickly decide how an operation should be handled.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & unsupported()
The instruction is unsupported.
LegalizeRuleSet & scalarSameSizeAs(unsigned TypeIdx, unsigned SameSizeIdx)
Change the type TypeIdx to have the same scalar size as type SameSizeIdx.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & bitcastIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
The specified type index is coerced if predicate is true.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & minScalarOrElt(unsigned TypeIdx, const LLT Ty)
Ensure the scalar or element is at least as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & unsupportedFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & lowerFor(std::initializer_list< LLT > Types)
The instruction is lowered when type index 0 is any type in the given list.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1).
LegalizeRuleSet & unsupportedIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Widen the scalar to the one selected by the mutation if the predicate is true.
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & clampNumElements(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the number of elements for the given vectors to at least MinTy's number of elements and at most MaxTy's number of elements.
LegalizeRuleSet & maxScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Conditionally limit the maximum size of the scalar.
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalForCartesianProduct(std::initializer_list< LLT > Types)
The instruction is legal when type indexes 0 and 1 are both in the given list.
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
LLVM_ABI LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
LLVM_ABI void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
LLVM_ABI LegalizeResult lowerInsert(MachineInstr &MI)
LLVM_ABI LegalizeResult lowerExtract(MachineInstr &MI)
GISelValueTracking * getValueTracking() const
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
LLVM_ABI void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LLVM_ABI LegalizeResult lowerFMad(MachineInstr &MI)
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
LLVM_ABI void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
const LegacyLegalizerInfo & getLegacyLegalizerInfo() const
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do so.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is found.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified register class.
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
use_instr_nodbg_iterator use_instr_nodbg_begin(Register RegNo) const
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
LLVM_ABI Register createGenericVirtualRegister(LLT Ty, StringRef Name="")
Create and return a new generic virtual register with low-level type Ty.
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
constexpr bool isValid() const
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
bool hasWorkGroupIDZ() const
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void truncate(size_type N)
Like resize, but requires that N is less than size().
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save and restore.
unsigned getPointerSizeInBits(unsigned AS) const
A Use represents the edge between a Value definition and its users.
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isGFX11Plus(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
LLVM_ABI LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the given size.
LLVM_ABI LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LLVM_ABI LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LLVM_ABI LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LLVM_ABI LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than second type index.
LLVM_ABI LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than second type index.
LLVM_ABI LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LLVM_ABI LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LLVM_ABI LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the given size.
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LLVM_ABI LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LLVM_ABI LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LLVM_ABI LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LLVM_ABI LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LLVM_ABI LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
Invariant opcodes: All instruction sets have these as their low opcodes.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
LLVM_ABI Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by an single def instruction that is Opcode.
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Undef
Value of the register doesn't matter.
LLVM_ABI const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT fits in int64_t returns it.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
constexpr bool has_single_bit(T Value) noexcept
std::function< bool(const LegalityQuery &)> LegalityPredicate
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
To bit_cast(const From &from) noexcept
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
FunctionAddr VTableAddr Next
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned BitWidth
LLVM_ABI void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its APInt value and def register.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
unsigned Log2(Align A)
Returns the log2 of the alignment.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
@ CLUSTER_WORKGROUP_MAX_ID_X
@ CLUSTER_WORKGROUP_MAX_ID_Z
@ CLUSTER_WORKGROUP_MAX_FLAT_ID
@ CLUSTER_WORKGROUP_MAX_ID_Y
static constexpr uint64_t encode(Fields... Values)
MIMGBaseOpcode BaseOpcode
This struct is a compact representation of a valid (non-zero power of two) alignment.
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
The LegalityQuery object bundles together all the information that's needed to decide whether a given operation is legal or not.
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
This class contains a discriminated union of information about pointers in memory operands,...
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input or output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input or output denormals are flushed for most f32 instructions.