#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;
47 "amdgpu-global-isel-new-legality",
48 cl::desc(
"Use GlobalISel desired legality, rather than try to use"
49 "rules compatible with selection patterns"),
  const LLT Ty = Query.Types[TypeIdx];
  EltSize > 1 && EltSize < 32 &&
  const LLT Ty = Query.Types[TypeIdx];
  const LLT Ty = Query.Types[TypeIdx];
  const LLT Ty = Query.Types[TypeIdx];
  return std::pair(TypeIdx,
  const LLT Ty = Query.Types[TypeIdx];
  unsigned Pieces = (Size + 63) / 64;
  const LLT Ty = Query.Types[TypeIdx];
  const int NextMul32 = (Size + 31) / 32;
  const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
  const LLT Ty = Query.Types[TypeIdx];
  assert(EltSize == 32 || EltSize == 64);
  for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
  const LLT Ty = Query.Types[TypeIdx];
  const LLT Ty = Query.Types[TypeIdx];
  const LLT QueryTy = Query.Types[TypeIdx];
  const LLT QueryTy = Query.Types[TypeIdx];
  const LLT QueryTy = Query.Types[TypeIdx];
  return EltSize == 16 || EltSize % 32 == 0;
  return EltSize == 32 || EltSize == 64 ||
         EltSize == 128 || EltSize == 256;
  LLT Ty = Query.Types[TypeIdx];
  const LLT QueryTy = Query.Types[TypeIdx];
  const LLT Ty = Query.Types[TypeIdx];
  Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
                                    bool IsLoad, bool IsAtomic) {
  return ST.enableFlatScratch() ? 128 : 32;
  return ST.useDS128() ? 128 : 64;
  return IsLoad ? 512 : 128;
  return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
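// Reader's note (added): the returns above encode the widest single memory
// access per address space: private/scratch is 32 bits unless flat-scratch or
// multi-dword scratch addressing is available (then 128), LDS/GDS allows 64
// or 128 bits depending on useDS128(), and global/constant space allows
// 512-bit loads but only 128-bit stores.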
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
  unsigned AS = Query.Types[1].getAddressSpace();
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
                           AtomicOrdering::NotAtomic))
  if (!ST.hasDwordx3LoadStores())
  if (AlignBits < MemSize) {
                                 Align(AlignBits / 8)))
  return EltSize != 32 && EltSize != 64;
  if (Size != MemSizeInBits)
                                    uint64_t AlignInBits, unsigned AddrSpace,
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
  if (AlignInBits < RoundedSize)
                                      RoundedSize, AddrSpace,
                                      Align(AlignInBits / 8),
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
                           Query.Types[1].getAddressSpace(), Opcode);
  const unsigned NumParts = PointerTy.getSizeInBits() / 32;
  Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
  std::array<Register, 4> VectorElems;
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  for (unsigned I = 0; I < NumParts; ++I)
    VectorElems[I] =
        B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
  B.buildMergeValues(MO, VectorElems);

  Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
  B.buildIntToPtr(MO, Scalar);

  const unsigned NumParts = PointerTy.getSizeInBits() / 32;
  auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
  for (unsigned I = 0; I < NumParts; ++I)
    PointerParts.push_back(Unmerged.getReg(I));
  return B.buildBuildVector(VectorTy, PointerParts).getReg(0);

  Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
  return B.buildBitcast(VectorTy, Scalar).getReg(0);
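// Commentary (added): buffer fat pointers and 128-bit resource descriptors
// have no directly usable pointer representation in these paths, so the
// helpers above shuttle them through s32 pieces: a wide pointer becomes a
// <N x s32> build_vector (or a scalar bitcast) before legalization, and is
// reassembled with extract_vector_elt / merge plus inttoptr afterwards.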
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
  const LLT BufferStridedPtr =
  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
      GlobalPtr, ConstantPtr, FlatPtr
  const std::initializer_list<LLT> AddrSpaces32 = {
      LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};

  const std::initializer_list<LLT> FPTypesBase = {
  const std::initializer_list<LLT> FPTypes16 = {
  const std::initializer_list<LLT> FPTypesPK16 = {
721 .clampMaxNumElementsStrict(0,
S16, 2)
729 .clampMaxNumElementsStrict(0,
S16, 2)
739 .clampMaxNumElementsStrict(0,
S16, 2)
747 .clampMaxNumElementsStrict(0,
S16, 2)
757 .minScalarOrElt(0,
S16)
774 .widenScalarToNextMultipleOf(0, 32)
796 .widenScalarToNextMultipleOf(0, 32)
804 .widenScalarToNextMultipleOf(0, 32);
815 .minScalarOrElt(0,
S32)
834 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
846 .clampMaxNumElements(0,
S8, 2)
865 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
877 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
884 .clampScalar(0,
S16,
S64);
916 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
917 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
940 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
974 .legalFor(FPTypesPK16)
988 .clampScalar(0,
S16,
S64);
1013 .clampScalar(0,
S32,
S64);
1018 .clampScalar(0,
S32,
S64);
1024 .clampScalar(0,
S32,
S64)
1025 .clampScalar(1,
S32,
S32)
1032 .clampScalar(1,
S32,
S32)
1068 FMad.customFor({
S32,
S16});
1070 FMad.customFor({
S32});
1072 FMad.customFor({
S16});
1080 FRem.minScalar(0,
S32)
1089 .clampMaxNumElements(0,
S16, 2)
1100 .clampScalar(0,
S32,
S64)
1101 .widenScalarToNextPow2(1, 32);
1129 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1135 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1139 if (
ST.has16BitInsts()) {
1140 getActionDefinitionsBuilder(
1141 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1143 .clampScalar(0,
S16,
S64)
1146 getActionDefinitionsBuilder(
1147 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1149 .clampScalar(0,
S32,
S64)
1152 getActionDefinitionsBuilder(
1153 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1156 .clampScalar(0,
S32,
S64)
1160 getActionDefinitionsBuilder(G_PTR_ADD)
1161 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1164 .scalarSameSizeAs(1, 0);
1166 getActionDefinitionsBuilder(G_PTRMASK)
1168 .scalarSameSizeAs(1, 0)
1172 getActionDefinitionsBuilder(G_ICMP)
1183 .legalForCartesianProduct(
1184 {
S1}, {
S32,
S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1185 .legalForCartesianProduct(
1186 {
S32}, {
S32,
S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1187 if (
ST.has16BitInsts()) {
1188 CmpBuilder.legalFor({{
S1,
S16}});
1192 .widenScalarToNextPow2(1)
1193 .clampScalar(1,
S32,
S64)
1198 getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
1199 {
S1},
ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1201 if (
ST.hasSALUFloatInsts())
1202 FCmpBuilder.legalForCartesianProduct({
S32}, {
S16,
S32});
1205 .widenScalarToNextPow2(1)
1206 .clampScalar(1,
S32,
S64)
1210 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1211 if (
ST.has16BitInsts())
1212 ExpOps.customFor({{
S32}, {
S16}});
1214 ExpOps.customFor({
S32});
1215 ExpOps.clampScalar(0, MinScalarFPTy,
S32)
1218 getActionDefinitionsBuilder(G_FPOWI)
1219 .clampScalar(0, MinScalarFPTy,
S32)
1222 auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
1223 Log2Ops.customFor({
S32});
1224 if (
ST.has16BitInsts())
1225 Log2Ops.legalFor({
S16});
1227 Log2Ops.customFor({
S16});
1228 Log2Ops.scalarize(0)
1232 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1233 LogOps.customFor({
S32,
S16});
1234 LogOps.clampScalar(0, MinScalarFPTy,
S32)
1238 getActionDefinitionsBuilder(G_CTPOP)
1240 .clampScalar(0,
S32,
S32)
1241 .widenScalarToNextPow2(1, 32)
1242 .clampScalar(1,
S32,
S64)
1244 .widenScalarToNextPow2(0, 32);
1247 if (
ST.has16BitInsts())
1248 getActionDefinitionsBuilder(G_IS_FPCLASS)
1249 .legalForCartesianProduct({
S1}, FPTypes16)
1250 .widenScalarToNextPow2(1)
1254 getActionDefinitionsBuilder(G_IS_FPCLASS)
1255 .legalForCartesianProduct({
S1}, FPTypesBase)
1256 .lowerFor({
S1,
S16})
1257 .widenScalarToNextPow2(1)
1264 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1266 .clampScalar(0,
S32,
S32)
1267 .clampScalar(1,
S32,
S64)
1268 .widenScalarToNextPow2(0, 32)
1269 .widenScalarToNextPow2(1, 32)
1273 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
1275 .clampScalar(0,
S32,
S32)
1276 .clampScalar(1,
S32,
S64)
1278 .widenScalarToNextPow2(0, 32)
1279 .widenScalarToNextPow2(1, 32);
1283 getActionDefinitionsBuilder(G_BITREVERSE)
1285 .clampScalar(0,
S32,
S64)
1287 .widenScalarToNextPow2(0);
1289 if (
ST.has16BitInsts()) {
1290 getActionDefinitionsBuilder(G_BSWAP)
1292 .clampMaxNumElementsStrict(0,
S16, 2)
1295 .widenScalarToNextPow2(0)
1296 .clampScalar(0,
S16,
S32)
1299 if (
ST.hasVOP3PInsts()) {
1300 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1302 .clampMaxNumElements(0,
S16, 2)
1304 .widenScalarToNextPow2(0)
1308 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1310 .widenScalarToNextPow2(0)
1317 getActionDefinitionsBuilder(G_BSWAP)
1322 .widenScalarToNextPow2(0)
1327 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1330 .widenScalarToNextPow2(0)
1335 getActionDefinitionsBuilder(G_INTTOPTR)
1337 .legalForCartesianProduct(AddrSpaces64, {
S64})
1338 .legalForCartesianProduct(AddrSpaces32, {
S32})
1351 getActionDefinitionsBuilder(G_PTRTOINT)
1353 .legalForCartesianProduct(AddrSpaces64, {
S64})
1354 .legalForCartesianProduct(AddrSpaces32, {
S32})
  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

    unsigned NumRegs = (MemSize + 31) / 32;

    if (!ST.hasDwordx3LoadStores())

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);

    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},
                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});

    Actions.unsupportedIf(
        typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));

    Actions.customIf(typeIs(1, Constant32Ptr));
1492 return !Query.
Types[0].isVector() &&
1493 needToSplitMemOp(Query,
Op == G_LOAD);
1495 [=](
const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1500 unsigned MemSize = Query.
MMODescrs[0].MemoryTy.getSizeInBits();
1503 if (DstSize > MemSize)
1509 if (MemSize > MaxSize)
1517 return Query.
Types[0].isVector() &&
1518 needToSplitMemOp(Query,
Op == G_LOAD);
1520 [=](
const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1534 unsigned MemSize = Query.
MMODescrs[0].MemoryTy.getSizeInBits();
1535 if (MemSize > MaxSize) {
1539 if (MaxSize % EltSize == 0) {
1545 unsigned NumPieces = MemSize / MaxSize;
1549 if (NumPieces == 1 || NumPieces >= NumElts ||
1550 NumElts % NumPieces != 0)
1551 return std::pair(0, EltTy);
1559 return std::pair(0, EltTy);
1574 return std::pair(0, EltTy);
1578 .widenScalarToNextPow2(0)
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                                  {S32, GlobalPtr, S16, 2 * 8},
                                                  {S32, LocalPtr, S8, 8},
                                                  {S32, LocalPtr, S16, 16},
                                                  {S32, PrivatePtr, S8, 8},
                                                  {S32, PrivatePtr, S16, 16},
                                                  {S32, ConstantPtr, S8, 8},
                                                  {S32, ConstantPtr, S16, 2 * 8}})

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});

  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
  auto &Atomics = getActionDefinitionsBuilder(
      {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
       G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
       G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
       G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
      .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
                 {S64, GlobalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAdd()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasLdsAtomicAddF64())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasAtomicDsPkAdd16Insts())
      Atomic.legalFor({{V2S16, LocalPtr}});

  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});
  if (ST.hasFlatAtomicFaddF32Inst())
    Atomic.legalFor({{S32, FlatPtr}});

  if (ST.hasGFX90AInsts()) {

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
      .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                  {S32, FlatPtr}, {S64, FlatPtr}})
      .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});
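  // Commentary (added): G_ATOMIC_CMPXCHG on global/flat pointers is custom
  // because the selected instruction wants the new value and the compare
  // value packed into one wide operand; the custom lowering later in this
  // file builds that packed G_AMDGPU_ATOMIC_CMPXCHG form. LDS/GDS cmpxchg
  // maps directly onto DS instructions and can stay legal.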
  getActionDefinitionsBuilder(G_SELECT)
                 LocalPtr, FlatPtr, PrivatePtr,
      .clampScalar(0, S16, S64)
      .clampMaxNumElements(0, S32, 2)
      .clampMaxNumElements(0, LocalPtr, 2)
      .clampMaxNumElements(0, PrivatePtr, 2)
      .widenScalarToNextPow2(0)
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
            .clampMaxNumElements(0, S16, 2);
      Shifts.legalFor({{S16, S16}});

    Shifts.widenScalarIf(
          const LLT AmountTy = Query.Types[1];
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})

    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
  Shifts.scalarize(0);
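  // Commentary (added): the shift amount operand is always clamped to s32
  // regardless of the shifted type; 16-bit (and packed v2s16) shifts are only
  // kept legal when the subtarget has 16-bit ALU instructions, otherwise the
  // shifted value is widened to at least 32 bits before selection.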
1722 for (
unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1723 unsigned VecTypeIdx =
Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1724 unsigned EltTypeIdx =
Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1725 unsigned IdxTypeIdx = 2;
1727 getActionDefinitionsBuilder(
Op)
1729 const LLT EltTy = Query.
Types[EltTypeIdx];
1730 const LLT VecTy = Query.
Types[VecTypeIdx];
1731 const LLT IdxTy = Query.
Types[IdxTypeIdx];
1733 const bool isLegalVecType =
1743 return (EltSize == 32 || EltSize == 64) &&
1758 const LLT EltTy = Query.
Types[EltTypeIdx];
1759 const LLT VecTy = Query.
Types[VecTypeIdx];
1763 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1768 .clampScalar(EltTypeIdx,
S32,
S64)
1769 .clampScalar(VecTypeIdx,
S32,
S64)
1770 .clampScalar(IdxTypeIdx,
S32,
S32)
1771 .clampMaxNumElements(VecTypeIdx,
S32, 32)
1781 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1783 const LLT &EltTy = Query.
Types[1].getElementType();
1784 return Query.
Types[0] != EltTy;
1787 for (
unsigned Op : {G_EXTRACT, G_INSERT}) {
1788 unsigned BigTyIdx =
Op == G_EXTRACT ? 1 : 0;
1789 unsigned LitTyIdx =
Op == G_EXTRACT ? 0 : 1;
1792 getActionDefinitionsBuilder(
Op)
1798 const LLT BigTy = Query.
Types[BigTyIdx];
1803 const LLT BigTy = Query.
Types[BigTyIdx];
1804 const LLT LitTy = Query.
Types[LitTyIdx];
1810 const LLT BigTy = Query.
Types[BigTyIdx];
1816 const LLT LitTy = Query.
Types[LitTyIdx];
1821 .widenScalarToNextPow2(BigTyIdx, 32);
1825 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1835 if (
ST.hasScalarPackInsts()) {
1838 .minScalarOrElt(0,
S16)
1841 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1845 BuildVector.customFor({
V2S16,
S16});
1846 BuildVector.minScalarOrElt(0,
S32);
1848 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1856 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1858 .clampMaxNumElements(0,
S32, 32)
1859 .clampMaxNumElements(1,
S16, 2)
1860 .clampMaxNumElements(0,
S16, 64);
1862 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1865 for (
unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1866 unsigned BigTyIdx =
Op == G_MERGE_VALUES ? 0 : 1;
1867 unsigned LitTyIdx =
Op == G_MERGE_VALUES ? 1 : 0;
1869 auto notValidElt = [=](
const LegalityQuery &Query,
unsigned TypeIdx) {
1870 const LLT Ty = Query.
Types[TypeIdx];
1881 auto &Builder = getActionDefinitionsBuilder(
Op)
1885 const LLT BigTy = Query.
Types[BigTyIdx];
1891 .widenScalarToNextPow2(LitTyIdx, 16)
1899 .clampScalar(LitTyIdx,
S32,
S512)
1900 .widenScalarToNextPow2(LitTyIdx, 32)
1903 [=](
const LegalityQuery &Query) {
return notValidElt(Query, LitTyIdx); },
1906 [=](
const LegalityQuery &Query) {
return notValidElt(Query, BigTyIdx); },
1910 if (
Op == G_MERGE_VALUES) {
1911 Builder.widenScalarIf(
1914 const LLT Ty = Query.
Types[LitTyIdx];
1920 Builder.widenScalarIf(
1922 const LLT Ty = Query.
Types[BigTyIdx];
1928 const LLT &Ty = Query.
Types[BigTyIdx];
1930 if (NewSizeInBits >= 256) {
1932 if (RoundedTo < NewSizeInBits)
1933 NewSizeInBits = RoundedTo;
1935 return std::pair(BigTyIdx,
LLT::scalar(NewSizeInBits));
1944 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1945 .legalFor({{
S32}, {
S64}});
1947 if (
ST.hasVOP3PInsts()) {
1948 SextInReg.lowerFor({{
V2S16}})
1952 .clampMaxNumElementsStrict(0,
S16, 2);
1953 }
else if (
ST.has16BitInsts()) {
1954 SextInReg.lowerFor({{
S32}, {
S64}, {
S16}});
1958 SextInReg.lowerFor({{
S32}, {
S64}});
1963 .clampScalar(0,
S32,
S64)
1966 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1971 getActionDefinitionsBuilder(G_FSHR)
1974 .clampMaxNumElementsStrict(0,
S16, 2)
1978 if (
ST.hasVOP3PInsts()) {
1979 getActionDefinitionsBuilder(G_FSHL)
1981 .clampMaxNumElementsStrict(0,
S16, 2)
1985 getActionDefinitionsBuilder(G_FSHL)
  getActionDefinitionsBuilder(G_READCYCLECOUNTER)

  getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
      .clampScalar(1, S32, S32)
      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(0)

  getActionDefinitionsBuilder(
      G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
      G_READ_REGISTER, G_WRITE_REGISTER,

  if (ST.hasIEEEMinMax()) {
    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
        .legalFor(FPTypesPK16)
        .clampMaxNumElements(0, S16, 2)
    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})

  getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                               G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                               G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})

  getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();

  getLegacyLegalizerInfo().computeTables();
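  // Commentary (added): computeTables() finalizes the legacy per-opcode rule
  // tables, so it is expected to be the last step of this constructor; every
  // getActionDefinitionsBuilder() call must precede it.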
  switch (MI.getOpcode()) {
2053 case TargetOpcode::G_ADDRSPACE_CAST:
2055 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2057 case TargetOpcode::G_FCEIL:
2059 case TargetOpcode::G_FREM:
2061 case TargetOpcode::G_INTRINSIC_TRUNC:
2063 case TargetOpcode::G_SITOFP:
2065 case TargetOpcode::G_UITOFP:
2067 case TargetOpcode::G_FPTOSI:
2069 case TargetOpcode::G_FPTOUI:
2071 case TargetOpcode::G_FMINNUM:
2072 case TargetOpcode::G_FMAXNUM:
2073 case TargetOpcode::G_FMINNUM_IEEE:
2074 case TargetOpcode::G_FMAXNUM_IEEE:
2076 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2078 case TargetOpcode::G_INSERT_VECTOR_ELT:
2080 case TargetOpcode::G_FSIN:
2081 case TargetOpcode::G_FCOS:
2083 case TargetOpcode::G_GLOBAL_VALUE:
2085 case TargetOpcode::G_LOAD:
2086 case TargetOpcode::G_SEXTLOAD:
2087 case TargetOpcode::G_ZEXTLOAD:
2089 case TargetOpcode::G_STORE:
2091 case TargetOpcode::G_FMAD:
2093 case TargetOpcode::G_FDIV:
2095 case TargetOpcode::G_FFREXP:
2097 case TargetOpcode::G_FSQRT:
2099 case TargetOpcode::G_UDIV:
2100 case TargetOpcode::G_UREM:
2101 case TargetOpcode::G_UDIVREM:
2103 case TargetOpcode::G_SDIV:
2104 case TargetOpcode::G_SREM:
2105 case TargetOpcode::G_SDIVREM:
2107 case TargetOpcode::G_ATOMIC_CMPXCHG:
2109 case TargetOpcode::G_FLOG2:
2111 case TargetOpcode::G_FLOG:
2112 case TargetOpcode::G_FLOG10:
2114 case TargetOpcode::G_FEXP2:
2116 case TargetOpcode::G_FEXP:
2117 case TargetOpcode::G_FEXP10:
2119 case TargetOpcode::G_FPOW:
2121 case TargetOpcode::G_FFLOOR:
2123 case TargetOpcode::G_BUILD_VECTOR:
2124 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2126 case TargetOpcode::G_MUL:
2128 case TargetOpcode::G_CTLZ:
2129 case TargetOpcode::G_CTTZ:
2131 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
2133 case TargetOpcode::G_STACKSAVE:
2135 case TargetOpcode::G_GET_FPENV:
2137 case TargetOpcode::G_SET_FPENV:
2139 case TargetOpcode::G_TRAP:
2141 case TargetOpcode::G_DEBUGTRAP:
  if (ST.hasApertureRegs()) {
                                           ? AMDGPU::SRC_SHARED_BASE
                                           : AMDGPU::SRC_PRIVATE_BASE;
    MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
    return B.buildUnmerge(S32, Dst).getReg(1);

  Register LoadAddr = MRI.createGenericVirtualRegister(
        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);

    Register KernargPtrReg = MRI.createGenericVirtualRegister(

    B.buildPtrAdd(LoadAddr, KernargPtrReg,

    return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);

  Register QueuePtr = MRI.createGenericVirtualRegister(

  B.buildPtrAdd(LoadAddr, QueuePtr,
                B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
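// Commentary (added): the aperture is the high 32 bits of the 64-bit base of
// the LDS or private window within the flat address space. On targets with
// aperture registers it is read directly from SRC_SHARED_BASE /
// SRC_PRIVATE_BASE; otherwise it is loaded either from the implicit kernel
// arguments or from the queue pointer at a fixed struct offset, as above.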
2241 switch (Def->getOpcode()) {
2242 case AMDGPU::G_FRAME_INDEX:
2243 case AMDGPU::G_GLOBAL_VALUE:
2244 case AMDGPU::G_BLOCK_ADDR:
2246 case AMDGPU::G_CONSTANT: {
2247 const ConstantInt *CI = Def->getOperand(1).getCImm();
2264 assert(
MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2265 (isa<GIntrinsic>(
MI) && cast<GIntrinsic>(
MI).getIntrinsicID() ==
2266 Intrinsic::amdgcn_addrspacecast_nonnull));
2270 Register Src = isa<GIntrinsic>(
MI) ?
MI.getOperand(2).getReg()
2271 :
MI.getOperand(1).getReg();
2272 LLT DstTy =
MRI.getType(Dst);
2273 LLT SrcTy =
MRI.getType(Src);
2284 if (
TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2285 MI.setDesc(
B.getTII().get(TargetOpcode::G_BITCAST));
2296 B.buildExtract(Dst, Src, 0);
2297 MI.eraseFromParent();
2301 unsigned NullVal =
TM.getNullPointerValue(DestAS);
2303 auto SegmentNull =
B.buildConstant(DstTy, NullVal);
2304 auto FlatNull =
B.buildConstant(SrcTy, 0);
2307 auto PtrLo32 =
B.buildExtract(DstTy, Src, 0);
2311 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2313 MI.eraseFromParent();
2325 Register SrcAsInt =
B.buildPtrToInt(
S32, Src).getReg(0);
2329 auto BuildPtr =
B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
2334 B.buildCopy(Dst, BuildPtr);
2335 MI.eraseFromParent();
2339 auto SegmentNull =
B.buildConstant(SrcTy,
TM.getNullPointerValue(SrcAS));
2340 auto FlatNull =
B.buildConstant(DstTy,
TM.getNullPointerValue(DestAS));
2343 SegmentNull.getReg(0));
2345 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2347 MI.eraseFromParent();
2354 B.buildExtract(Dst, Src, 0);
2355 MI.eraseFromParent();
2363 auto PtrLo =
B.buildPtrToInt(
S32, Src);
2364 auto HighAddr =
B.buildConstant(
S32, AddrHiVal);
2365 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2366 MI.eraseFromParent();
2371 MF.
getFunction(),
"invalid addrspacecast",
B.getDebugLoc());
2374 Ctx.
diagnose(InvalidAddrSpaceCast);
2376 MI.eraseFromParent();
  LLT Ty = MRI.getType(Src);

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
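// Commentary (added): the f64 G_FCEIL expansion below computes
//   ceil(x) = trunc(x) + ((x > 0 && x != trunc(x)) ? 1.0 : 0.0)
// i.e. truncate toward zero, then add one back whenever the value was
// positive and truncation dropped a fractional part.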
  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);

  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
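// Commentary (added): G_FREM is expanded below as
//   frem(x, y) = fma(-trunc(x / y), y, x)
// divide, truncate the quotient toward zero, then subtract quotient * y from
// x with a fused multiply-add.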
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
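// Commentary (added): f64 G_INTRINSIC_TRUNC has no single instruction on
// older targets, so the expansion below works on the bit pattern directly:
// extract the 11-bit exponent with amdgcn.ubfe, mask off the fraction bits
// that lie below the binary point, keep only the sign when the (unbiased)
// exponent is negative (|x| < 1), and return the source unchanged when the
// exponent exceeds the 52 fraction bits (the value is already integral).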
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
                     .addUse(Const0.getReg(0))
                     .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  const unsigned FractBits = 52;

  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
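// Commentary (added): the i64-to-FP conversion below splits the input into
// two 32-bit halves. For an f64 result, the high half is converted (signed or
// unsigned as required), scaled by 2^32 with ldexp, and the unsigned low half
// is added in. For an f32 result, the input is first normalized with a
// leading-sign/zero count so that at most 32 significant bits remain, then a
// single 32-bit convert plus ldexp restores the magnitude.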
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);

  if (MRI.getType(Dst) == S64) {
    auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                        : B.buildUITOFP(S64, Unmerge.getReg(1));

    auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
    auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);

    B.buildFAdd(Dst, LdExp, CvtLo);
    MI.eraseFromParent();

  auto One = B.buildConstant(S32, 1);

  auto ThirtyOne = B.buildConstant(S32, 31);
  auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
  auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
  auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
  auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
                .addUse(Unmerge.getReg(1));
  auto LS2 = B.buildSub(S32, LS, One);
  ShAmt = B.buildUMin(S32, LS2, MaxShAmt);

  ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
  B.buildFLdexp(Dst, FVal, Scale);
  MI.eraseFromParent();
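// Commentary (added): the FP-to-i64 path below uses the classic two-constant
// split: K0 = 2^-32 and K1 = -2^32 (0x3df0000000000000 / 0xc1f0000000000000
// for f64, 0x2f800000 / 0xcf800000 for f32). The truncated source is scaled
// by K0 and floored to give the high word, the low word comes from
// fptoui(fma(floor, K1, trunc(x))), and for the signed variant the extracted
// sign is applied to the merged 64-bit result afterwards.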
2582 const LLT SrcLT =
MRI.getType(Src);
2585 unsigned Flags =
MI.getFlags();
2596 auto Trunc =
B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2604 Sign =
B.buildAShr(
S32, Src,
B.buildConstant(
S32, 31));
2605 Trunc =
B.buildFAbs(
S32, Trunc, Flags);
2609 K0 =
B.buildFConstant(
2610 S64, llvm::bit_cast<double>(UINT64_C( 0x3df0000000000000)));
2611 K1 =
B.buildFConstant(
2612 S64, llvm::bit_cast<double>(UINT64_C( 0xc1f0000000000000)));
2614 K0 =
B.buildFConstant(
2615 S32, llvm::bit_cast<float>(UINT32_C( 0x2f800000)));
2616 K1 =
B.buildFConstant(
2617 S32, llvm::bit_cast<float>(UINT32_C( 0xcf800000)));
2620 auto Mul =
B.buildFMul(SrcLT, Trunc, K0, Flags);
2621 auto FloorMul =
B.buildFFloor(SrcLT,
Mul, Flags);
2622 auto Fma =
B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2625 :
B.buildFPTOUI(
S32, FloorMul);
2626 auto Lo =
B.buildFPTOUI(
S32, Fma);
2630 Sign =
B.buildMergeLikeInstr(
S64, {Sign, Sign});
2632 B.buildSub(Dst,
B.buildXor(
S64,
B.buildMergeLikeInstr(
S64, {Lo, Hi}), Sign),
2635 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
2636 MI.eraseFromParent();
2646 const bool IsIEEEOp =
MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2647 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2670 LLT VecTy =
MRI.getType(Vec);
2683 auto IntVec =
B.buildPtrToInt(IntVecTy, Vec);
2684 auto IntElt =
B.buildExtractVectorElement(IntTy, IntVec,
MI.getOperand(2));
2685 B.buildIntToPtr(Dst, IntElt);
2687 MI.eraseFromParent();
2694 std::optional<ValueAndVReg> MaybeIdxVal =
2698 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2701 auto Unmerge =
B.buildUnmerge(EltTy, Vec);
2702 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2707 MI.eraseFromParent();
2722 LLT VecTy =
MRI.getType(Vec);
2736 auto IntVecSource =
B.buildPtrToInt(IntVecTy, Vec);
2737 auto IntIns =
B.buildPtrToInt(IntTy, Ins);
2738 auto IntVecDest =
B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2740 B.buildIntToPtr(Dst, IntVecDest);
2741 MI.eraseFromParent();
2748 std::optional<ValueAndVReg> MaybeIdxVal =
2753 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2756 if (IdxVal < NumElts) {
2758 for (
unsigned i = 0; i < NumElts; ++i)
2759 SrcRegs.
push_back(
MRI.createGenericVirtualRegister(EltTy));
2760 B.buildUnmerge(SrcRegs, Vec);
2762 SrcRegs[IdxVal] =
MI.getOperand(2).getReg();
2763 B.buildMergeLikeInstr(Dst, SrcRegs);
2768 MI.eraseFromParent();
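// Commentary (added): the hardware sin/cos instructions expect an argument
// already divided by 2*pi, so G_FSIN/G_FCOS are lowered below by multiplying
// the source by 1/(2*pi) (optionally passing it through amdgcn.fract on
// targets that need the explicit range reduction) and then calling
// amdgcn.sin or amdgcn.cos.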
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
  TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
                .addUse(MulVal.getReg(0))

  TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

                 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  MI.eraseFromParent();
2805 unsigned GAFlags)
const {
2806 assert(isInt<32>(
Offset + 4) &&
"32-bit offset is expected!");
2834 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2845 if (!
B.getMRI()->getRegClassOrNull(PCReg))
2846 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2849 B.buildExtract(DstReg, PCReg, 0);
2863 Register AddrLo = !RequiresHighHalf && !
MRI.getRegClassOrNull(DstReg)
2865 :
MRI.createGenericVirtualRegister(
S32);
2867 if (!
MRI.getRegClassOrNull(AddrLo))
2868 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
2871 B.buildInstr(AMDGPU::S_MOV_B32)
2876 if (RequiresHighHalf) {
2878 "Must provide a 64-bit pointer type!");
2881 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
2883 B.buildInstr(AMDGPU::S_MOV_B32)
2893 if (!
MRI.getRegClassOrNull(AddrDst))
2894 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
2896 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
2900 if (AddrDst != DstReg)
2901 B.buildCast(DstReg, AddrDst);
2902 }
else if (AddrLo != DstReg) {
2905 B.buildCast(DstReg, AddrLo);
2913 LLT Ty =
MRI.getType(DstReg);
2925 Fn,
"local memory global used by non-kernel function",
MI.getDebugLoc(),
2935 B.buildUndef(DstReg);
2936 MI.eraseFromParent();
2956 if (
B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2960 auto Sz =
B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {
S32});
2961 B.buildIntToPtr(DstReg, Sz);
2962 MI.eraseFromParent();
2968 *cast<GlobalVariable>(GV)));
2969 MI.eraseFromParent();
2975 MI.eraseFromParent();
2983 MI.eraseFromParent();
2989 MI.eraseFromParent();
2994 Register GOTAddr =
MRI.createGenericVirtualRegister(PtrTy);
3007 auto Load =
B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3008 B.buildExtract(DstReg, Load, 0);
3010 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3012 MI.eraseFromParent();
3030 LLT PtrTy =
MRI.getType(PtrReg);
3035 auto Cast =
B.buildAddrSpaceCast(ConstPtr, PtrReg);
3037 MI.getOperand(1).setReg(Cast.getReg(0));
3042 if (
MI.getOpcode() != AMDGPU::G_LOAD)
3046 LLT ValTy =
MRI.getType(ValReg);
3068 if (WideMemSize == ValSize) {
3074 MI.setMemRefs(MF, {WideMMO});
3080 if (ValSize > WideMemSize)
3087 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3088 B.buildTrunc(ValReg, WideLoad).getReg(0);
3095 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3096 B.buildExtract(ValReg, WideLoad, 0);
3100 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3101 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3105 MI.eraseFromParent();
3118 Register DataReg =
MI.getOperand(0).getReg();
3119 LLT DataTy =
MRI.getType(DataReg);
3133 LLT Ty =
MRI.getType(
MI.getOperand(0).getReg());
3162 "this should not have been custom lowered");
3164 LLT ValTy =
MRI.getType(CmpVal);
3167 Register PackedVal =
B.buildBuildVector(VecTy, { NewVal, CmpVal }).
getReg(0);
3169 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3173 .setMemRefs(
MI.memoperands());
3175 MI.eraseFromParent();
3184 case TargetOpcode::G_INTRINSIC: {
3186 case Intrinsic::amdgcn_frexp_mant:
3194 case TargetOpcode::G_FFREXP: {
3199 case TargetOpcode::G_FPEXT: {
3223std::pair<Register, Register>
3225 unsigned Flags)
const {
3230 auto SmallestNormal =
B.buildFConstant(
3232 auto IsLtSmallestNormal =
3235 auto Scale32 =
B.buildFConstant(F32, 0x1.0p+32);
3236 auto One =
B.buildFConstant(F32, 1.0);
3238 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3239 auto ScaledInput =
B.buildFMul(F32, Src, ScaleFactor, Flags);
3241 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3254 LLT Ty =
B.getMRI()->getType(Dst);
3255 unsigned Flags =
MI.getFlags();
3260 auto Ext =
B.buildFPExt(F32, Src, Flags);
3261 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3262 .addUse(Ext.getReg(0))
3264 B.buildFPTrunc(Dst,
Log2, Flags);
3265 MI.eraseFromParent();
3273 B.buildIntrinsic(Intrinsic::amdgcn_log, {
MI.getOperand(0)})
3276 MI.eraseFromParent();
3280 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3281 .addUse(ScaledInput)
3284 auto ThirtyTwo =
B.buildFConstant(Ty, 32.0);
3285 auto Zero =
B.buildFConstant(Ty, 0.0);
3287 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3288 B.buildFSub(Dst,
Log2, ResultOffset, Flags);
3290 MI.eraseFromParent();
3296 auto FMul =
B.buildFMul(Ty,
X,
Y, Flags);
3297 return B.buildFAdd(Ty,
FMul, Z, Flags).getReg(0);
3302 const bool IsLog10 =
MI.getOpcode() == TargetOpcode::G_FLOG10;
3303 assert(IsLog10 ||
MI.getOpcode() == TargetOpcode::G_FLOG);
3308 unsigned Flags =
MI.getFlags();
3309 const LLT Ty =
MRI.getType(
X);
3319 TM.Options.ApproxFuncFPMath ||
TM.Options.UnsafeFPMath) {
3321 Register LogVal =
MRI.createGenericVirtualRegister(F32);
3322 auto PromoteSrc =
B.buildFPExt(F32,
X);
3324 B.buildFPTrunc(Dst, LogVal);
3329 MI.eraseFromParent();
3338 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(
X).setMIFlags(Flags);
3343 const float c_log10 = 0x1.344134p-2f;
3344 const float cc_log10 = 0x1.09f79ep-26f;
3347 const float c_log = 0x1.62e42ep-1f;
3348 const float cc_log = 0x1.efa39ep-25f;
3350 auto C =
B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3351 auto CC =
B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3353 R =
B.buildFMul(Ty,
Y,
C, Flags).getReg(0);
3354 auto NegR =
B.buildFNeg(Ty, R, Flags);
3355 auto FMA0 =
B.buildFMA(Ty,
Y,
C, NegR, Flags);
3356 auto FMA1 =
B.buildFMA(Ty,
Y,
CC, FMA0, Flags);
3357 R =
B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
3360 const float ch_log10 = 0x1.344000p-2f;
3361 const float ct_log10 = 0x1.3509f6p-18f;
3364 const float ch_log = 0x1.62e000p-1f;
3365 const float ct_log = 0x1.0bfbe8p-15f;
3367 auto CH =
B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3368 auto CT =
B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3370 auto MaskConst =
B.buildConstant(Ty, 0xfffff000);
3371 auto YH =
B.buildAnd(Ty,
Y, MaskConst);
3372 auto YT =
B.buildFSub(Ty,
Y, YH, Flags);
3373 auto YTCT =
B.buildFMul(Ty, YT, CT, Flags);
3376 getMad(
B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
3378 R =
getMad(
B, Ty, YH.getReg(0),
CH.getReg(0), Mad1, Flags);
3381 const bool IsFiniteOnly =
3385 if (!IsFiniteOnly) {
3388 auto Fabs =
B.buildFAbs(Ty,
Y);
3391 R =
B.buildSelect(Ty, IsFinite, R,
Y, Flags).getReg(0);
3395 auto Zero =
B.buildFConstant(Ty, 0.0);
3397 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3398 auto Shift =
B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3399 B.buildFSub(Dst, R, Shift, Flags);
3401 B.buildCopy(Dst, R);
3404 MI.eraseFromParent();
3410 unsigned Flags)
const {
3411 const double Log2BaseInverted =
3414 LLT Ty =
B.getMRI()->getType(Dst);
3419 auto LogSrc =
B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3422 auto ScaledResultOffset =
B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3423 auto Zero =
B.buildFConstant(Ty, 0.0);
3425 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3426 auto Log2Inv =
B.buildFConstant(Ty, Log2BaseInverted);
3429 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3431 auto Mul =
B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3432 B.buildFAdd(Dst,
Mul, ResultOffset, Flags);
3440 ?
B.buildFLog2(Ty, Src, Flags)
3441 :
B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3444 auto Log2BaseInvertedOperand =
B.buildFConstant(Ty, Log2BaseInverted);
3445 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3456 unsigned Flags =
MI.getFlags();
3457 LLT Ty =
B.getMRI()->getType(Dst);
3463 auto Ext =
B.buildFPExt(F32, Src, Flags);
3464 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3465 .addUse(Ext.getReg(0))
3467 B.buildFPTrunc(Dst,
Log2, Flags);
3468 MI.eraseFromParent();
3478 MI.eraseFromParent();
3486 auto RangeCheckConst =
B.buildFConstant(Ty, -0x1.f80000p+6f);
3488 RangeCheckConst, Flags);
3490 auto SixtyFour =
B.buildFConstant(Ty, 0x1.0p+6f);
3491 auto Zero =
B.buildFConstant(Ty, 0.0);
3492 auto AddOffset =
B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3493 auto AddInput =
B.buildFAdd(F32, Src, AddOffset, Flags);
3495 auto Exp2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3496 .addUse(AddInput.getReg(0))
3499 auto TwoExpNeg64 =
B.buildFConstant(Ty, 0x1.0p-64f);
3500 auto One =
B.buildFConstant(Ty, 1.0);
3501 auto ResultScale =
B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3502 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3503 MI.eraseFromParent();
3509 LLT Ty =
B.getMRI()->getType(Dst);
3514 auto Mul =
B.buildFMul(Ty,
X, Log2E, Flags);
3518 .addUse(
Mul.getReg(0))
3521 B.buildFExp2(Dst,
Mul.getReg(0), Flags);
3527 auto Threshold =
B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3530 auto ScaleOffset =
B.buildFConstant(Ty, 0x1.0p+6f);
3531 auto ScaledX =
B.buildFAdd(Ty,
X, ScaleOffset, Flags);
3532 auto AdjustedX =
B.buildSelect(Ty, NeedsScaling, ScaledX,
X, Flags);
3535 auto ExpInput =
B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3537 auto Exp2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3538 .addUse(ExpInput.getReg(0))
3541 auto ResultScaleFactor =
B.buildFConstant(Ty, 0x1.969d48p-93f);
3542 auto AdjustedResult =
B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3543 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3551 const unsigned Flags =
MI.getFlags();
3554 LLT Ty =
MRI.getType(Dst);
3557 const bool IsExp10 =
MI.getOpcode() == TargetOpcode::G_FEXP10;
3564 MI.eraseFromParent();
3572 auto Ext =
B.buildFPExt(F32,
X, Flags);
3573 Register Lowered =
MRI.createGenericVirtualRegister(F32);
3575 B.buildFPTrunc(Dst, Lowered, Flags);
3576 MI.eraseFromParent();
3586 MI.eraseFromParent();
3614 const unsigned FlagsNoContract = Flags &
~MachineInstr::FmContract;
3619 const float cc_exp = 0x1.4ae0bep-26f;
3620 const float c_exp10 = 0x1.a934f0p+1f;
3621 const float cc_exp10 = 0x1.2f346ep-24f;
3623 auto C =
B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3624 PH =
B.buildFMul(Ty,
X,
C, Flags).getReg(0);
3625 auto NegPH =
B.buildFNeg(Ty, PH, Flags);
3626 auto FMA0 =
B.buildFMA(Ty,
X,
C, NegPH, Flags);
3628 auto CC =
B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3629 PL =
B.buildFMA(Ty,
X,
CC, FMA0, Flags).getReg(0);
3631 const float ch_exp = 0x1.714000p+0f;
3632 const float cl_exp = 0x1.47652ap-12f;
3634 const float ch_exp10 = 0x1.a92000p+1f;
3635 const float cl_exp10 = 0x1.4f0978p-11f;
3637 auto MaskConst =
B.buildConstant(Ty, 0xfffff000);
3638 auto XH =
B.buildAnd(Ty,
X, MaskConst);
3639 auto XL =
B.buildFSub(Ty,
X, XH, Flags);
3641 auto CH =
B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3642 PH =
B.buildFMul(Ty, XH,
CH, Flags).getReg(0);
3644 auto CL =
B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3645 auto XLCL =
B.buildFMul(Ty, XL, CL, Flags);
3648 getMad(
B, Ty, XL.getReg(0),
CH.getReg(0), XLCL.getReg(0), Flags);
3649 PL =
getMad(
B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3652 auto E =
B.buildIntrinsicRoundeven(Ty, PH, Flags);
3655 auto PHSubE =
B.buildFSub(Ty, PH, E, FlagsNoContract);
3656 auto A =
B.buildFAdd(Ty, PHSubE, PL, Flags);
3659 auto Exp2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3660 .addUse(
A.getReg(0))
3662 auto R =
B.buildFLdexp(Ty, Exp2, IntE, Flags);
3664 auto UnderflowCheckConst =
3665 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3666 auto Zero =
B.buildFConstant(Ty, 0.0);
3670 R =
B.buildSelect(Ty, Underflow, Zero, R);
3675 auto OverflowCheckConst =
3676 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3681 R =
B.buildSelect(Ty, Overflow, Inf, R, Flags);
3684 B.buildCopy(Dst, R);
3685 MI.eraseFromParent();
3694 unsigned Flags =
MI.getFlags();
3695 LLT Ty =
B.getMRI()->getType(Dst);
3700 auto Log =
B.buildFLog2(F32, Src0, Flags);
3701 auto Mul =
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3702 .addUse(Log.getReg(0))
3705 B.buildFExp2(Dst,
Mul, Flags);
3706 }
else if (Ty == F16) {
3708 auto Log =
B.buildFLog2(F16, Src0, Flags);
3709 auto Ext0 =
B.buildFPExt(F32, Log, Flags);
3710 auto Ext1 =
B.buildFPExt(F32, Src1, Flags);
3711 auto Mul =
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3712 .addUse(Ext0.getReg(0))
3713 .addUse(Ext1.getReg(0))
3715 B.buildFExp2(Dst,
B.buildFPTrunc(F16,
Mul), Flags);
3719 MI.eraseFromParent();
3727 ModSrc = SrcFNeg->getOperand(1).getReg();
3729 ModSrc = SrcFAbs->getOperand(1).getReg();
3731 ModSrc = SrcFAbs->getOperand(1).getReg();
3742 Register OrigSrc =
MI.getOperand(1).getReg();
3743 unsigned Flags =
MI.getFlags();
3745 "this should not have been custom lowered");
3755 auto Fract =
B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
3767 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
3769 Register Min =
MRI.createGenericVirtualRegister(F64);
3775 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
3777 B.buildFMinNum(Min, Fract, Const, Flags);
3782 CorrectedFract =
B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
3785 auto NegFract =
B.buildFNeg(F64, CorrectedFract, Flags);
3786 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
3788 MI.eraseFromParent();
  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
    Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
    Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);

  auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
  B.buildBitcast(Dst, Merge);

  MI.eraseFromParent();
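// Commentary (added): the wide G_MUL lowering that follows performs
// schoolbook long multiplication over 32-bit limbs. Partial products for each
// result limb are accumulated either with plain 32-bit mul/add carry chains
// or, when profitable, with the G_AMDGPU_MAD_U64_U32 form that yields a
// 64-bit product plus carry in one step; odd- and even-aligned product
// columns can be accumulated separately to shorten the carry chains.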
3830 bool UsePartialMad64_32,
3831 bool SeparateOddAlignedProducts)
const {
3846 auto getZero32 = [&]() ->
Register {
3848 Zero32 =
B.buildConstant(
S32, 0).getReg(0);
3851 auto getZero64 = [&]() ->
Register {
3853 Zero64 =
B.buildConstant(
S64, 0).getReg(0);
3858 for (
unsigned i = 0; i < Src0.
size(); ++i) {
3869 if (CarryIn.empty())
3872 bool HaveCarryOut =
true;
3874 if (CarryIn.size() == 1) {
3876 LocalAccum =
B.buildZExt(
S32, CarryIn[0]).getReg(0);
3880 CarryAccum = getZero32();
3882 CarryAccum =
B.buildZExt(
S32, CarryIn[0]).getReg(0);
3883 for (
unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3885 B.buildUAdde(
S32,
S1, CarryAccum, getZero32(), CarryIn[i])
3890 LocalAccum = getZero32();
3891 HaveCarryOut =
false;
3896 B.buildUAdde(
S32,
S1, CarryAccum, LocalAccum, CarryIn.back());
3897 LocalAccum =
Add.getReg(0);
3911 auto buildMadChain =
3914 assert((DstIndex + 1 < Accum.
size() && LocalAccum.size() == 2) ||
3915 (DstIndex + 1 >= Accum.
size() && LocalAccum.size() == 1));
3922 if (LocalAccum.size() == 1 &&
3923 (!UsePartialMad64_32 || !CarryIn.empty())) {
3926 unsigned j1 = DstIndex - j0;
3927 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3931 auto Mul =
B.buildMul(
S32, Src0[j0], Src1[j1]);
3933 LocalAccum[0] =
Mul.getReg(0);
3935 if (CarryIn.empty()) {
3936 LocalAccum[0] =
B.buildAdd(
S32, LocalAccum[0],
Mul).getReg(0);
3939 B.buildUAdde(
S32,
S1, LocalAccum[0],
Mul, CarryIn.back())
3945 }
while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3949 if (j0 <= DstIndex) {
3950 bool HaveSmallAccum =
false;
3953 if (LocalAccum[0]) {
3954 if (LocalAccum.size() == 1) {
3955 Tmp =
B.buildAnyExt(
S64, LocalAccum[0]).getReg(0);
3956 HaveSmallAccum =
true;
3957 }
else if (LocalAccum[1]) {
3958 Tmp =
B.buildMergeLikeInstr(
S64, LocalAccum).getReg(0);
3959 HaveSmallAccum =
false;
3961 Tmp =
B.buildZExt(
S64, LocalAccum[0]).getReg(0);
3962 HaveSmallAccum =
true;
3965 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
3967 HaveSmallAccum =
true;
3971 unsigned j1 = DstIndex - j0;
3972 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3976 auto Mad =
B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {
S64,
S1},
3977 {Src0[j0], Src1[j1], Tmp});
3978 Tmp = Mad.getReg(0);
3979 if (!HaveSmallAccum)
3980 CarryOut.push_back(Mad.getReg(1));
3981 HaveSmallAccum =
false;
3984 }
while (j0 <= DstIndex);
3986 auto Unmerge =
B.buildUnmerge(
S32, Tmp);
3987 LocalAccum[0] = Unmerge.getReg(0);
3988 if (LocalAccum.size() > 1)
3989 LocalAccum[1] = Unmerge.getReg(1);
4016 for (
unsigned i = 0; i <= Accum.
size() / 2; ++i) {
4017 Carry OddCarryIn = std::move(OddCarry);
4018 Carry EvenCarryIn = std::move(EvenCarry);
4023 if (2 * i < Accum.
size()) {
4024 auto LocalAccum = Accum.
drop_front(2 * i).take_front(2);
4025 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4030 if (!SeparateOddAlignedProducts) {
4031 auto LocalAccum = Accum.
drop_front(2 * i - 1).take_front(2);
4032 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4034 bool IsHighest = 2 * i >= Accum.
size();
4038 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4044 Lo =
B.buildUAddo(
S32,
S1, Accum[2 * i - 1], SeparateOddOut[0]);
4046 Lo =
B.buildAdd(
S32, Accum[2 * i - 1], SeparateOddOut[0]);
4048 Lo =
B.buildUAdde(
S32,
S1, Accum[2 * i - 1], SeparateOddOut[0],
4051 Accum[2 * i - 1] =
Lo->getOperand(0).getReg();
4054 auto Hi =
B.buildUAdde(
S32,
S1, Accum[2 * i], SeparateOddOut[1],
4055 Lo->getOperand(1).getReg());
4056 Accum[2 * i] =
Hi.getReg(0);
4057 SeparateOddCarry =
Hi.getReg(1);
4064 if (
Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4065 EvenCarryIn.push_back(CarryOut);
4067 if (2 * i < Accum.
size()) {
4068 if (
Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4069 OddCarry.push_back(CarryOut);
4082 assert(
MI.getOpcode() == TargetOpcode::G_MUL);
4091 LLT Ty =
MRI.getType(DstReg);
4095 unsigned NumParts =
Size / 32;
4111 for (
unsigned i = 0; i < NumParts; ++i) {
4115 B.buildUnmerge(Src0Parts, Src0);
4116 B.buildUnmerge(Src1Parts, Src1);
4119 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4120 SeparateOddAlignedProducts);
4122 B.buildMergeLikeInstr(DstReg, AccumRegs);
4123 MI.eraseFromParent();
4135 LLT DstTy =
MRI.getType(Dst);
4136 LLT SrcTy =
MRI.getType(Src);
4138 unsigned NewOpc =
MI.getOpcode() == AMDGPU::G_CTLZ
4139 ? AMDGPU::G_AMDGPU_FFBH_U32
4140 : AMDGPU::G_AMDGPU_FFBL_B32;
4141 auto Tmp =
B.buildInstr(NewOpc, {DstTy}, {Src});
4144 MI.eraseFromParent();
4150 if (
MI.getOpcode() != TargetOpcode::G_XOR)
4153 return ConstVal && *ConstVal == -1;
4160 Register CondDef =
MI.getOperand(0).getReg();
4161 if (!
MRI.hasOneNonDBGUse(CondDef))
4169 if (!
MRI.hasOneNonDBGUse(NegatedCond))
4175 UseMI = &*
MRI.use_instr_nodbg_begin(NegatedCond);
4184 if (Next == Parent->
end()) {
4188 UncondBrTarget = &*NextMBB;
4190 if (Next->getOpcode() != AMDGPU::G_BR)
4208 *ArgRC,
B.getDebugLoc(), ArgTy);
4212 const unsigned Mask = Arg->
getMask();
4213 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4220 auto ShiftAmt =
B.buildConstant(
S32, Shift);
4221 AndMaskSrc =
B.buildLShr(
S32, LiveIn, ShiftAmt).getReg(0);
4224 B.buildAnd(DstReg, AndMaskSrc,
B.buildConstant(
S32, Mask >> Shift));
4226 B.buildCopy(DstReg, LiveIn);
4254 Arg = &WorkGroupIDX;
4255 ArgRC = &AMDGPU::SReg_32RegClass;
4259 Arg = &WorkGroupIDY;
4260 ArgRC = &AMDGPU::SReg_32RegClass;
4264 Arg = &WorkGroupIDZ;
4265 ArgRC = &AMDGPU::SReg_32RegClass;
4280 B.buildConstant(DstReg, 0);
4286 B.buildUndef(DstReg);
4290 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4301 MI.eraseFromParent();
4307 B.buildConstant(
MI.getOperand(0).getReg(),
C);
4308 MI.eraseFromParent();
4329 B.buildUndef(DstReg);
4330 MI.eraseFromParent();
4334 if (Arg->isMasked()) {
4348 MI.eraseFromParent();
4355 Register KernArgReg =
B.getMRI()->createGenericVirtualRegister(PtrTy);
4365 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
4373 Align Alignment)
const {
4377 "unexpected kernarg parameter type");
4381 B.buildLoad(DstReg,
Ptr, PtrInfo,
Align(4),
4384 MI.eraseFromParent();
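// Commentary (added): 32-bit unsigned division below follows the
// float-reciprocal scheme also used by the DAG path: form an approximate 1/y
// with G_AMDGPU_RCP_IFLAG, scale it by 0x4f7ffffe (just under 2^32) and
// convert to an integer reciprocal Z, refine once with Z += mulhi(Z, -y*Z),
// take Q = mulhi(x, Z), and finally correct quotient and remainder with at
// most two conditional adjustment steps.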
  LLT DstTy = MRI.getType(Dst);

  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  auto One = B.buildConstant(S32, 1);

  Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);

    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);

    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
4466 auto Unmerge =
B.buildUnmerge(
S32, Val);
4468 auto CvtLo =
B.buildUITOFP(
S32, Unmerge.getReg(0));
4469 auto CvtHi =
B.buildUITOFP(
S32, Unmerge.getReg(1));
4471 auto Mad =
B.buildFMAD(
4473 B.buildFConstant(
S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
4475 auto Rcp =
B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {
S32}, {Mad});
4476 auto Mul1 =
B.buildFMul(
4477 S32, Rcp,
B.buildFConstant(
S32, llvm::bit_cast<float>(0x5f7ffffc)));
4480 auto Mul2 =
B.buildFMul(
4481 S32, Mul1,
B.buildFConstant(
S32, llvm::bit_cast<float>(0x2f800000)));
4482 auto Trunc =
B.buildIntrinsicTrunc(
S32, Mul2);
4485 auto Mad2 =
B.buildFMAD(
4486 S32, Trunc,
B.buildFConstant(
S32, llvm::bit_cast<float>(0xcf800000)),
4489 auto ResultLo =
B.buildFPTOUI(
S32, Mad2);
4490 auto ResultHi =
B.buildFPTOUI(
S32, Trunc);
4492 return {ResultLo.getReg(0), ResultHi.getReg(0)};
4507 auto Rcp =
B.buildMergeLikeInstr(
S64, {RcpLo, RcpHi});
4509 auto Zero64 =
B.buildConstant(
S64, 0);
4510 auto NegDenom =
B.buildSub(
S64, Zero64, Denom);
4512 auto MulLo1 =
B.buildMul(
S64, NegDenom, Rcp);
4513 auto MulHi1 =
B.buildUMulH(
S64, Rcp, MulLo1);
4515 auto UnmergeMulHi1 =
B.buildUnmerge(
S32, MulHi1);
4516 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
4517 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
4519 auto Add1_Lo =
B.buildUAddo(
S32,
S1, RcpLo, MulHi1_Lo);
4520 auto Add1_Hi =
B.buildUAdde(
S32,
S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
4521 auto Add1 =
B.buildMergeLikeInstr(
S64, {Add1_Lo, Add1_Hi});
4523 auto MulLo2 =
B.buildMul(
S64, NegDenom, Add1);
4524 auto MulHi2 =
B.buildUMulH(
S64, Add1, MulLo2);
4525 auto UnmergeMulHi2 =
B.buildUnmerge(
S32, MulHi2);
4526 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
4527 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
4529 auto Zero32 =
B.buildConstant(
S32, 0);
4530 auto Add2_Lo =
B.buildUAddo(
S32,
S1, Add1_Lo, MulHi2_Lo);
4531 auto Add2_Hi =
B.buildUAdde(
S32,
S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
4532 auto Add2 =
B.buildMergeLikeInstr(
S64, {Add2_Lo, Add2_Hi});
4534 auto UnmergeNumer =
B.buildUnmerge(
S32, Numer);
4535 Register NumerLo = UnmergeNumer.getReg(0);
4536 Register NumerHi = UnmergeNumer.getReg(1);
4538 auto MulHi3 =
B.buildUMulH(
S64, Numer, Add2);
4539 auto Mul3 =
B.buildMul(
S64, Denom, MulHi3);
4540 auto UnmergeMul3 =
B.buildUnmerge(
S32, Mul3);
4541 Register Mul3_Lo = UnmergeMul3.getReg(0);
4542 Register Mul3_Hi = UnmergeMul3.getReg(1);
4543 auto Sub1_Lo =
B.buildUSubo(
S32,
S1, NumerLo, Mul3_Lo);
4544 auto Sub1_Hi =
B.buildUSube(
S32,
S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
4545 auto Sub1_Mi =
B.buildSub(
S32, NumerHi, Mul3_Hi);
4546 auto Sub1 =
B.buildMergeLikeInstr(
S64, {Sub1_Lo, Sub1_Hi});
4548 auto UnmergeDenom =
B.buildUnmerge(
S32, Denom);
4549 Register DenomLo = UnmergeDenom.getReg(0);
4550 Register DenomHi = UnmergeDenom.getReg(1);
4553 auto C1 =
B.buildSExt(
S32, CmpHi);
4556 auto C2 =
B.buildSExt(
S32, CmpLo);
4559 auto C3 =
B.buildSelect(
S32, CmpEq, C2, C1);
4566 auto Sub2_Lo =
B.buildUSubo(
S32,
S1, Sub1_Lo, DenomLo);
4567 auto Sub2_Mi =
B.buildUSube(
S32,
S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
4568 auto Sub2_Hi =
B.buildUSube(
S32,
S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
4569 auto Sub2 =
B.buildMergeLikeInstr(
S64, {Sub2_Lo, Sub2_Hi});
4571 auto One64 =
B.buildConstant(
S64, 1);
4572 auto Add3 =
B.buildAdd(
S64, MulHi3, One64);
4578 auto C6 =
B.buildSelect(
4582 auto Add4 =
B.buildAdd(
S64, Add3, One64);
4583 auto Sub3_Lo =
B.buildUSubo(
S32,
S1, Sub2_Lo, DenomLo);
4585 auto Sub3_Mi =
B.buildUSube(
S32,
S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
4586 auto Sub3_Hi =
B.buildUSube(
S32,
S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
4587 auto Sub3 =
B.buildMergeLikeInstr(
S64, {Sub3_Lo, Sub3_Hi});
4593 auto Sel1 =
B.buildSelect(
4600 auto Sel2 =
B.buildSelect(
  switch (MI.getOpcode()) {
  case AMDGPU::G_UDIV: {
    DstDivReg = MI.getOperand(0).getReg();
  case AMDGPU::G_UREM: {
    DstRemReg = MI.getOperand(0).getReg();
  case AMDGPU::G_UDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
  Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

  MI.eraseFromParent();
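// Commentary (added): signed division below is reduced to the unsigned
// routine. Each operand is made non-negative with the shift trick
//   sign = x >> (bits - 1);  |x| = (x + sign) ^ sign;
// the unsigned quotient/remainder is computed on the absolute values, and the
// correct signs are restored at the end (quotient sign = sign(lhs) ^
// sign(rhs), remainder sign = sign(lhs)) using the same xor/subtract pattern.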
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty != S32 && Ty != S64)

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();

  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);

  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
  switch (MI.getOpcode()) {
  case AMDGPU::G_SDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
  case AMDGPU::G_SREM: {
    DstRemReg = MI.getOperand(0).getReg();
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
  case AMDGPU::G_SDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);

    auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
    auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
    B.buildSub(DstDivReg, SignXor, Sign);

    auto Sign = LHSign.getReg(0);
    auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
    B.buildSub(DstRemReg, SignXor, Sign);

  MI.eraseFromParent();
4722 LLT ResTy =
MRI.getType(Res);
4729 if (!AllowInaccurateRcp && ResTy !=
LLT::scalar(16))
4740 if (CLHS->isExactlyValue(1.0)) {
4741 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4745 MI.eraseFromParent();
4750 if (CLHS->isExactlyValue(-1.0)) {
4751 auto FNeg =
B.buildFNeg(ResTy,
RHS, Flags);
4752 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4753 .addUse(FNeg.getReg(0))
4756 MI.eraseFromParent();
4763 if (!AllowInaccurateRcp && (ResTy !=
LLT::scalar(16) ||
4768 auto RCP =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4771 B.buildFMul(Res,
LHS, RCP, Flags);
4773 MI.eraseFromParent();
  LLT ResTy = MRI.getType(Res);

  if (!AllowInaccurateRcp)

  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);

  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})

  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);

  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);

  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();
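// f16 FDIV is emulated in f32: extend both operands, take the f32 reciprocal
// of the denominator, multiply, truncate back to f16, and run div_fixup to
// patch up the special cases.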
  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(RHSExt.getReg(0))

  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(RDst.getReg(0))

  MI.eraseFromParent();
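// Helper that switches single-precision denormal flushing on or off around the
// f32 division sequence, using S_DENORM_MODE where the subtarget has it and a
// mode-register write (S_SETREG) otherwise.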
  unsigned SPDenormMode =

  if (ST.hasDenormModeInst()) {
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
        .addImm(NewDenormModeValue);

    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
        .addImm(SPDenormMode)
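// f32 FDIV lowering: scale numerator and denominator with div_scale, refine an
// rcp estimate of the scaled denominator through a chain of FMAs, combine with
// div_fmas, and finish with div_fixup. If the FP mode does not already keep
// f32 denormals, denormal flushing is toggled around the FMA chain (saving and
// restoring the mode register when the denormal mode is dynamic).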
  auto One = B.buildFConstant(S32, 1.0f);

  auto DenominatorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})

  auto NumeratorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                       .addUse(DenominatorScaled.getReg(0))

  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  const bool HasDynamicDenormals =

  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      B.buildInstr(AMDGPU::S_GETREG_B32)
          .addDef(SavedSPDenormMode)

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      assert(SavedSPDenormMode);
      B.buildInstr(AMDGPU::S_SETREG_B32)
          .addReg(SavedSPDenormMode)

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma1.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(NumeratorScaled.getReg(1))

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(Fmas.getReg(0))

  MI.eraseFromParent();
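// f64 FDIV follows the same div_scale / rcp / FMA-refinement / div_fmas /
// div_fixup recipe. On subtargets where the div_scale condition output is not
// usable, the scale flag is recomputed by comparing the high dwords of the
// scaled values against the original operands and xor-ing the results.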
  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
                 .addUse(DivScale0.getReg(0))

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  auto NumUnmerge = B.buildUnmerge(S32, LHS);
  auto DenUnmerge = B.buildUnmerge(S32, RHS);
  auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
  auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

                              Scale1Unmerge.getReg(1));
                              Scale0Unmerge.getReg(1));
    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);

    Scale = DivScale1.getReg(1);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(Mul.getReg(0))

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
      .addUse(Fmas.getReg(0))

  MI.eraseFromParent();
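// G_FFREXP is lowered to the frexp_mant / frexp_exp intrinsics; where the
// hardware does not handle non-finite inputs, the select below substitutes the
// original value and a zero exponent when |x| is not finite.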
  LLT Ty = MRI.getType(Res0);

  auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})

  auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})

  auto Fabs = B.buildFAbs(Ty, Val);

  auto Zero = B.buildConstant(InstrExpTy, 0);
  Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
  Mant = B.buildSelect(Ty, IsFinite, Mant, Val);

  B.buildCopy(Res0, Mant);
  B.buildSExtOrTrunc(Res1, Exp);

  MI.eraseFromParent();
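// amdgcn_fdiv_fast: when |denominator| exceeds the 0x1p+96f threshold it is
// pre-scaled by 0x1p-32f so the rcp does not overflow, and the quotient is
// multiplied by the same scale factor to compensate.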
  auto Abs = B.buildFAbs(S32, RHS, Flags);

  auto C0 = B.buildFConstant(S32, 0x1p+96f);
  auto C1 = B.buildFConstant(S32, 0x1p-32f);
  auto C2 = B.buildFConstant(S32, 1.0f);

  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(Mul0.getReg(0))

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
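// f16 square root is emulated by extending to f32, using the hardware
// amdgcn_sqrt, and truncating the result back to f16.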
  unsigned Flags = MI.getFlags();

  auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
                  .addUse(Ext.getReg(0))

  B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
  MI.eraseFromParent();
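// f32 square root expansion: inputs below the 0x1.0p-96f threshold are scaled
// up by 2^32 first. One path refines the hardware sqrt estimate by testing the
// neighbouring representable values (SqrtSNextDown/Up); the other seeds with
// rsq and applies an FMA-based correction. The result is scaled back down by
// 2^-16 where needed and zero/inf inputs are passed through unchanged.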
  const unsigned Flags = MI.getFlags();

  MI.eraseFromParent();

  auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);

  auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
  auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
  auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);

  Register SqrtS = MRI.createGenericVirtualRegister(F32);

        .addUse(SqrtX.getReg(0))

  auto NegOne = B.buildConstant(I32, -1);
  auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);

  auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
  auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);

  auto PosOne = B.buildConstant(I32, 1);
  auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);

  auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
  auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);

  auto Zero = B.buildFConstant(F32, 0.0f);

  B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);

  B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);

  B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
  B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);

  auto Half = B.buildFConstant(F32, 0.5f);
  auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
  auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
  auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
  SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
  SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
  auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
  auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
  SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);

  auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);

  auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);

  SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);

  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);

  MI.eraseFromParent();
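// f64 square root: very small inputs are pre-scaled with ldexp(x, 256), an rsq
// estimate is refined with several FMA iterations, the result is rescaled with
// ldexp(..., -128), and zero/inf inputs are passed through unchanged.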
  assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");

  unsigned Flags = MI.getFlags();

  auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);

  auto ZeroInt = B.buildConstant(S32, 0);

  auto ScaleUpFactor = B.buildConstant(S32, 256);
  auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
  auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));

  auto Half = B.buildFConstant(F64, 0.5);
  auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
  auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);

  auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
  auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);

  auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
  auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);

  auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
  auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);

  auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);

  auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
  auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);

  auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);

  auto ScaleDownFactor = B.buildConstant(S32, -128);
  auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
  SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);

  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);

  MI.eraseFromParent();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
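// amdgcn_rsq_clamp is expanded to rsq followed by a clamp to the largest (and
// negative largest) finite float, using the IEEE or non-IEEE min/max variants
// as appropriate for the function's FP mode.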
  auto Flags = MI.getFlags();

  LLT Ty = MRI.getType(Dst);

  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})

  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags)
                          : B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);

    B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);

    B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
  MI.eraseFromParent();
  case Intrinsic::amdgcn_ds_fadd:
    return AMDGPU::G_ATOMICRMW_FADD;
  case Intrinsic::amdgcn_ds_fmin:
    return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
  case Intrinsic::amdgcn_ds_fmax:
    return AMDGPU::G_AMDGPU_ATOMIC_FMAX;

  for (int I = 6; I > 3; --I)
    MI.removeOperand(I);

  MI.removeOperand(1);
  LLT DstTy = MRI.getType(DstReg);

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);

  B.buildPtrAdd(DstReg, KernargPtrReg,
                B.buildConstant(IdxTy, Offset).getReg(0));
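// amdgcn_make_buffer_rsrc: split the 64-bit base pointer, fold the 16-bit
// stride into the upper bits of the high half, and merge { base_lo,
// base_hi|stride, NumRecords, Flags } into the 128-bit buffer resource.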
  Register Pointer = MI.getOperand(2).getReg();

  Register NumRecords = MI.getOperand(4).getReg();

  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Unmerge = B.buildUnmerge(S32, Pointer);
  Register LowHalf = Unmerge.getReg(0);
  Register HighHalf = Unmerge.getReg(1);

  auto AndMask = B.buildConstant(S32, 0x0000ffff);
  auto Masked = B.buildAnd(S32, HighHalf, AndMask);

  std::optional<ValueAndVReg> StrideConst =

  if (!StrideConst || !StrideConst->Value.isZero()) {

      uint32_t StrideVal = StrideConst->Value.getZExtValue();
      uint32_t ShiftedStrideVal = StrideVal << 16;
      ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);

      auto ExtStride = B.buildAnyExt(S32, Stride);
      auto ShiftConst = B.buildConstant(S32, 16);
      ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);

    NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);

  B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
  MI.eraseFromParent();
  MI.eraseFromParent();

  std::optional<uint32_t> KnownSize =

  if (KnownSize.has_value())
    B.buildConstant(DstReg, *KnownSize);

  MI.eraseFromParent();

                                          unsigned AddrSpace) const {

  auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());

  MI.eraseFromParent();
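// splitBufferOffsets breaks a buffer offset into a base register plus an
// immediate that fits the instruction's offset field; any overflow is folded
// back into the register part.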
std::pair<Register, unsigned>

  std::tie(BaseReg, ImmOffset) =

  if (MRI.getType(BaseReg).isPointer())
    BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);

  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;

  if (Overflow != 0) {

      BaseReg = B.buildConstant(S32, Overflow).getReg(0);

      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);

    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::pair(BaseReg, ImmOffset);
                                       bool ImageStore) const {

  LLT StoreVT = MRI.getType(Reg);

  auto Unmerge = B.buildUnmerge(S16, Reg);

  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

    Reg = B.buildBitcast(S32, Reg).getReg(0);

    PackedRegs.resize(2, B.buildUndef(S32).getReg(0));

    auto Unmerge = B.buildUnmerge(S16, Reg);
    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)

    PackedRegs.resize(6, B.buildUndef(S16).getReg(0));

    auto Unmerge = B.buildUnmerge(S32, Reg);
    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)

    PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
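// Buffer store lowering: pick the G_AMDGPU_BUFFER_STORE* (or tbuffer/format)
// pseudo based on the element size and D16 packing, split the offset into
// voffset/soffset/immediate, and rebuild the operands in the layout the
// instruction selector expects.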
  LLT Ty = MRI->getType(VData);

                                              bool IsFormat) const {

  LLT Ty = MRI.getType(VData);

  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);

  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;

    VIndex = MI.getOperand(3).getReg();

    VIndex = B.buildConstant(S32, 0).getReg(0);

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

    Format = MI.getOperand(5 + OpOffset).getImm();

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16
                : AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16
                : AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;

      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;

      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;

      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;

  auto MIB = B.buildInstr(Opc)

  MIB.addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);

  MI.eraseFromParent();
                            unsigned ImmOffset, unsigned Format,

  auto MIB = B.buildInstr(Opc)

  MIB.addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);
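// Buffer load lowering mirrors the store path: select the load pseudo (TFE
// variants add a status dword), load into a wide temporary register, then
// unmerge / truncate / repack the value into the original result type.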
                                             bool IsTyped) const {

  assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
  bool IsTFE = MI.getNumExplicitDefs() == 2;

    StatusDst = MI.getOperand(1).getReg();

  Register RSrc = MI.getOperand(2 + OpOffset).getReg();

  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;

    VIndex = MI.getOperand(3 + OpOffset).getReg();

    VIndex = B.buildConstant(S32, 0).getReg(0);

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

    Format = MI.getOperand(5 + OpOffset).getImm();

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  LLT Ty = MRI.getType(Dst);

    Dst = MI.getOperand(0).getReg();

  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);

    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16
                : AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {

      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;

      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;

      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;

      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;

      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;

    unsigned NumLoadDWords = NumValueDWords + 1;

    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    if (NumValueDWords == 1) {
      B.buildUnmerge({Dst, StatusDst}, LoadDstReg);

      for (unsigned I = 0; I != NumValueDWords; ++I)
        LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));

      B.buildUnmerge(LoadElts, LoadDstReg);

      B.buildMergeLikeInstr(Dst, LoadElts);

    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(Dst, LoadDstReg);
  } else if (Unpacked && IsD16 && Ty.isVector()) {

    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    auto Unmerge = B.buildUnmerge(S32, LoadDstReg);

    for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
      Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
    B.buildMergeLikeInstr(Dst, Repack);

                    AuxiliaryData, MMO, IsTyped, HasVIndex, B);

  MI.eraseFromParent();
5905 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5906 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
5907 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5908 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
5909 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
5910 case Intrinsic::amdgcn_raw_buffer_atomic_add:
5911 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
5912 case Intrinsic::amdgcn_struct_buffer_atomic_add:
5913 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
5914 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
5915 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5916 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
5917 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5918 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
5919 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
5920 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5921 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
5922 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5923 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
5924 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
5925 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5926 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
5927 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5928 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
5929 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
5930 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5931 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
5932 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5933 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
5934 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
5935 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5936 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
5937 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5938 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
5939 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
5940 case Intrinsic::amdgcn_raw_buffer_atomic_and:
5941 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
5942 case Intrinsic::amdgcn_struct_buffer_atomic_and:
5943 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
5944 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
5945 case Intrinsic::amdgcn_raw_buffer_atomic_or:
5946 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
5947 case Intrinsic::amdgcn_struct_buffer_atomic_or:
5948 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
5949 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
5950 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5951 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
5952 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5953 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
5954 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
5955 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5956 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
5957 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5958 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
5959 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
5960 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5961 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
5962 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5963 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
5964 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
5965 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5966 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
5967 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5968 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
5969 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
5970 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5971 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
5972 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5973 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
5974 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
5975 case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
5976 case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
5977 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16;
5978 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5979 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
5980 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5981 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
5982 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
5983 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5984 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
5985 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5986 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
5987 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
5988 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
5989 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
5990 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
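// Buffer atomics are rewritten to their G_AMDGPU_BUFFER_ATOMIC_* pseudos using
// the mapping above; cmpswap carries an extra compare operand, and the usual
// vindex/voffset/soffset/immediate-offset operand split applies.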
  const bool IsCmpSwap =
      IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;

    CmpVal = MI.getOperand(3).getReg();

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;

    VIndex = MI.getOperand(4 + OpOffset).getReg();

    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

      .addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);

  MI.eraseFromParent();
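// For A16/G16 image instructions the 16-bit address, bias and gradient
// operands are packed pairwise into <2 x s16> dwords (padding odd counts with
// undef) before being emitted as vaddr components.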
6067 bool IsA16,
bool IsG16) {
6070 auto EndIdx =
Intr->VAddrEnd;
6072 for (
unsigned I =
Intr->VAddrStart;
I < EndIdx;
I++) {
6079 if ((I < Intr->GradientStart) ||
6080 (
I >=
Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6081 (
I >=
Intr->CoordStart && !IsA16)) {
6082 if ((I < Intr->GradientStart) && IsA16 &&
6083 (
B.getMRI()->getType(AddrReg) ==
S16)) {
6084 assert(
I ==
Intr->BiasIndex &&
"Got unexpected 16-bit extra argument");
6088 B.buildBuildVector(
V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6092 "Bias needs to be converted to 16 bit in A16 mode");
6094 AddrReg =
B.buildBitcast(
V2S16, AddrReg).getReg(0);
6100 if (((
I + 1) >= EndIdx) ||
6101 ((
Intr->NumGradients / 2) % 2 == 1 &&
6102 (
I ==
static_cast<unsigned>(
Intr->GradientStart +
6103 (
Intr->NumGradients / 2) - 1) ||
6104 I ==
static_cast<unsigned>(
Intr->GradientStart +
6105 Intr->NumGradients - 1))) ||
6107 !
MI.getOperand(ArgOffset +
I + 1).isReg()) {
6109 B.buildBuildVector(
V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6114 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
6125 int DimIdx,
int NumVAddrs) {
6129 for (
int I = 0;
I != NumVAddrs; ++
I) {
6131 if (
SrcOp.isReg()) {
6137 int NumAddrRegs = AddrRegs.
size();
6138 if (NumAddrRegs != 1) {
6141 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
6144 for (
int I = 1;
I != NumVAddrs; ++
I) {
6147 MI.getOperand(DimIdx +
I).setReg(AMDGPU::NoRegister);
6169 const unsigned NumDefs =
MI.getNumExplicitDefs();
6170 const unsigned ArgOffset = NumDefs + 1;
6171 bool IsTFE = NumDefs == 2;
6185 Register VData =
MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
6186 LLT Ty =
MRI->getType(VData);
6188 const bool IsAtomicPacked16Bit =
6189 (BaseOpcode->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6190 BaseOpcode->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6194 MRI->getType(
MI.getOperand(ArgOffset +
Intr->GradientStart).getReg());
6196 MRI->getType(
MI.getOperand(ArgOffset +
Intr->CoordStart).getReg());
6199 const bool IsA16 = AddrTy ==
S16;
6203 if (!BaseOpcode->
Atomic) {
6204 DMask =
MI.getOperand(ArgOffset +
Intr->DMaskIndex).getImm();
6207 }
else if (DMask != 0) {
6209 }
else if (!IsTFE && !BaseOpcode->
Store) {
6211 B.buildUndef(
MI.getOperand(0));
6212 MI.eraseFromParent();
6220 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6221 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6222 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6223 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6224 unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
6227 MI.setDesc(
B.getTII().get(NewOpcode));
6231 if (IsTFE && DMask == 0) {
6234 MI.getOperand(ArgOffset +
Intr->DMaskIndex).setImm(DMask);
6237 if (BaseOpcode->
Atomic) {
6239 LLT Ty =
MRI->getType(VData0);
6242 if (Ty.
isVector() && !IsAtomicPacked16Bit)
6249 auto Concat =
B.buildBuildVector(PackedTy, {VData0, VData1});
6250 MI.getOperand(2).setReg(
Concat.getReg(0));
6251 MI.getOperand(3).setReg(AMDGPU::NoRegister);
6255 unsigned CorrectedNumVAddrs =
Intr->NumVAddrs;
6264 if (IsA16 && !ST.
hasA16()) {
6272 if (IsA16 || IsG16) {
6282 (PackedRegs.
size() <= NSAMaxSize || HasPartialNSA);
6283 const bool UsePartialNSA =
6284 UseNSA && HasPartialNSA && PackedRegs.
size() > NSAMaxSize;
6286 if (UsePartialNSA) {
6290 auto Concat =
B.buildConcatVectors(
6291 PackedAddrTy,
ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
6292 PackedRegs[NSAMaxSize - 1] =
Concat.getReg(0);
6293 PackedRegs.
resize(NSAMaxSize);
6294 }
else if (!UseNSA && PackedRegs.
size() > 1) {
6296 auto Concat =
B.buildConcatVectors(PackedAddrTy, PackedRegs);
6297 PackedRegs[0] =
Concat.getReg(0);
6301 const unsigned NumPacked = PackedRegs.
size();
6302 for (
unsigned I =
Intr->VAddrStart; I < Intr->VAddrEnd;
I++) {
6304 if (!
SrcOp.isReg()) {
6311 if (
I -
Intr->VAddrStart < NumPacked)
6312 SrcOp.setReg(PackedRegs[
I -
Intr->VAddrStart]);
6314 SrcOp.setReg(AMDGPU::NoRegister);
6333 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
6334 const bool UsePartialNSA =
6335 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
6337 if (UsePartialNSA) {
6339 ArgOffset +
Intr->VAddrStart + NSAMaxSize - 1,
6340 Intr->NumVAddrs - NSAMaxSize + 1);
6341 }
else if (!UseNSA &&
Intr->NumVAddrs > 1) {
6354 if (BaseOpcode->
Store) {
6360 if (RepackedReg != VData) {
6361 MI.getOperand(1).setReg(RepackedReg);
6372 if (NumElts < DMaskLanes)
6375 if (NumElts > 4 || DMaskLanes > 4)
6385 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6386 const LLT AdjustedTy =
6409 unsigned RoundedElts = (AdjustedTy.
getSizeInBits() + 31) / 32;
6410 unsigned RoundedSize = 32 * RoundedElts;
6414 RegTy = !IsTFE && EltSize == 16 ?
V2S16 :
S32;
6419 if (!IsTFE && (RoundedTy == Ty || !Ty.
isVector()))
6425 B.setInsertPt(*
MI.getParent(), ++
MI.getIterator());
6429 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
6430 const int ResultNumRegs = LoadResultTy.
getSizeInBits() / 32;
6432 Register NewResultReg =
MRI->createGenericVirtualRegister(LoadResultTy);
6434 MI.getOperand(0).setReg(NewResultReg);
6442 Dst1Reg =
MI.getOperand(1).getReg();
6443 if (
MRI->getType(Dst1Reg) !=
S32)
6447 MI.removeOperand(1);
6451 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
6460 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
6462 if (ResultNumRegs == 1) {
6464 ResultRegs[0] = NewResultReg;
6467 for (
int I = 0;
I != NumDataRegs; ++
I)
6468 ResultRegs[
I] =
MRI->createGenericVirtualRegister(RegTy);
6469 B.buildUnmerge(ResultRegs, NewResultReg);
6474 ResultRegs.
resize(NumDataRegs);
6480 B.buildTrunc(DstReg, ResultRegs[0]);
6486 B.buildBitcast(DstReg, ResultRegs[0]);
6500 Reg =
B.buildBitcast(
V2S16, Reg).getReg(0);
6503 Reg =
B.buildTrunc(
S16, Reg).getReg(0);
6507 auto padWithUndef = [&](
LLT Ty,
int NumElts) {
6510 Register Undef =
B.buildUndef(Ty).getReg(0);
6511 for (
int I = 0;
I != NumElts; ++
I)
6516 LLT ResTy =
MRI->getType(ResultRegs[0]);
6518 padWithUndef(ResTy, NumElts - ResultRegs.
size());
6519 B.buildBuildVector(DstReg, ResultRegs);
6530 if (ResultRegs.
size() == 1) {
6531 NewResultReg = ResultRegs[0];
6532 }
else if (ResultRegs.
size() == 2) {
6534 NewResultReg =
B.buildConcatVectors(
V4S16, ResultRegs).getReg(0);
6540 if (
MRI->getType(DstReg).getNumElements() <
6541 MRI->getType(NewResultReg).getNumElements()) {
6542 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
6544 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
6549 padWithUndef(ResTy, RegsToCover - ResultRegs.
size());
6550 B.buildConcatVectors(DstReg, ResultRegs);
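// s_buffer_load: sub-dword results are loaded into a 32-bit temporary via the
// UBYTE/USHORT pseudos and truncated afterwards; the memory operand is rebuilt
// with the rounded size and a suitable alignment.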
  Register OrigDst = MI.getOperand(0).getReg();

  LLT Ty = B.getMRI()->getType(OrigDst);

    Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
                    : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;

    Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));

    Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;

  B.setInsertPt(B.getMBB(), MI);

  B.setInsertPt(B.getMBB(), MI);

  MI.setDesc(B.getTII().get(Opc));
  MI.removeOperand(1);

  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign(std::min(MemSize, 4u));

  MI.addMemOperand(MF, MMO);
  if (Dst != OrigDst) {
    MI.getOperand(0).setReg(Dst);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(OrigDst, Dst);

  MI.eraseFromParent();
  BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))

  BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))

  MI.eraseFromParent();

  Register SGPR01(AMDGPU::SGPR0_SGPR1);

  Register KernargPtrReg = MRI.createGenericVirtualRegister(

  Register LoadAddr = MRI.createGenericVirtualRegister(

  B.buildPtrAdd(LoadAddr, KernargPtrReg,

  Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
  B.buildCopy(SGPR01, Temp);
  B.buildInstr(AMDGPU::S_TRAP)

  MI.eraseFromParent();

  B.buildCopy(SGPR01, LiveIn);
  B.buildInstr(AMDGPU::S_TRAP)

  MI.eraseFromParent();

  B.buildInstr(AMDGPU::S_TRAP)

  MI.eraseFromParent();

      "debugtrap handler not supported",

  LLVMContext &Ctx = B.getMF().getFunction().getContext();

  B.buildInstr(AMDGPU::S_TRAP)

  MI.eraseFromParent();
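// amdgcn_image_bvh_intersect_ray: the node pointer, ray extent, origin,
// direction and inverse direction are flattened (and packed for A16) into the
// vaddr layout expected by the selected IMAGE_BVH*_INTERSECT_RAY opcode, using
// NSA encodings where the subtarget supports them.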
6766 Register NodePtr =
MI.getOperand(2).getReg();
6767 Register RayExtent =
MI.getOperand(3).getReg();
6768 Register RayOrigin =
MI.getOperand(4).getReg();
6770 Register RayInvDir =
MI.getOperand(6).getReg();
6775 "intrinsic not supported on subtarget",
6777 B.getMF().getFunction().getContext().diagnose(BadIntrin);
6784 const bool IsA16 =
MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
6785 const bool Is64 =
MRI.getType(NodePtr).getSizeInBits() == 64;
6786 const unsigned NumVDataDwords = 4;
6787 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
6788 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
6792 const unsigned BaseOpcodes[2][2] = {
6793 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
6794 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
6795 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
6799 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
6800 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
6801 : AMDGPU::MIMGEncGfx10NSA,
6802 NumVDataDwords, NumVAddrDwords);
6806 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
6807 : AMDGPU::MIMGEncGfx10Default,
6808 NumVDataDwords, NumVAddrDwords);
6813 if (UseNSA && IsGFX11Plus) {
6815 auto Unmerge =
B.buildUnmerge({
S32,
S32,
S32}, Src);
6816 auto Merged =
B.buildMergeLikeInstr(
6817 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
6823 packLanes(RayOrigin);
6826 auto UnmergeRayDir =
B.buildUnmerge({
S16,
S16,
S16}, RayDir);
6827 auto UnmergeRayInvDir =
B.buildUnmerge({
S16,
S16,
S16}, RayInvDir);
6828 auto MergedDir =
B.buildMergeLikeInstr(
6831 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(0),
6832 UnmergeRayDir.getReg(0)}))
6835 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(1),
6836 UnmergeRayDir.getReg(1)}))
6839 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(2),
6840 UnmergeRayDir.getReg(2)}))
6845 packLanes(RayInvDir);
6849 auto Unmerge =
B.buildUnmerge({
S32,
S32}, NodePtr);
6858 auto Unmerge =
B.buildUnmerge({
S32,
S32,
S32}, Src);
6864 packLanes(RayOrigin);
6866 auto UnmergeRayDir =
B.buildUnmerge({
S16,
S16,
S16}, RayDir);
6867 auto UnmergeRayInvDir =
B.buildUnmerge({
S16,
S16,
S16}, RayInvDir);
6871 B.buildMergeLikeInstr(R1,
6872 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
6873 B.buildMergeLikeInstr(
6874 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
6875 B.buildMergeLikeInstr(
6876 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
6882 packLanes(RayInvDir);
6889 Register MergedOps =
B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
6894 auto MIB =
B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
6903 .addImm(IsA16 ? 1 : 0)
6906 MI.eraseFromParent();
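// G_FPTRUNC_ROUND with an explicit rounding mode becomes the upward/downward
// pseudo; stacksave is wrapped in G_AMDGPU_WAVE_ADDRESS; and the wave id is
// read from TTMP8 bits [29:25] with a ubfx.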
  int RoundMode = MI.getOperand(2).getImm();

    Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;

    Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;

      .addDef(MI.getOperand(0).getReg())
      .addUse(MI.getOperand(1).getReg());

  MI.eraseFromParent();

  B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
  MI.eraseFromParent();

  auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
  auto LSB = B.buildConstant(S32, 25);
  auto Width = B.buildConstant(S32, 5);
  B.buildUbfx(DstReg, TTMP8, LSB, Width);
  MI.eraseFromParent();
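// get/set FPEnv on the 64-bit pseudo value is split into two s_getreg /
// s_setreg accesses of the FP mode and trap control fields, merged or unmerged
// around a 2 x s32 pair.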
  if (MRI.getType(Src) != S64)

  B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},

  B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},

  B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
  MI.eraseFromParent();

  if (MRI.getType(Src) != S64)

  auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));

      .addReg(Unmerge.getReg(0));

      .addReg(Unmerge.getReg(1));
  MI.eraseFromParent();
  auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();

  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {

    bool Negated = false;

      std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)

            .addMBB(UncondBrTarget);

        B.buildInstr(AMDGPU::SI_ELSE)

            .addMBB(UncondBrTarget);

      B.buildBr(*CondBrTarget);

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();

  case Intrinsic::amdgcn_loop: {

    bool Negated = false;

      std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)

          .addMBB(UncondBrTarget);

      B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
7092 case Intrinsic::amdgcn_addrspacecast_nonnull:
7094 case Intrinsic::amdgcn_make_buffer_rsrc:
7096 case Intrinsic::amdgcn_kernarg_segment_ptr:
7099 B.buildConstant(
MI.getOperand(0).getReg(), 0);
7100 MI.eraseFromParent();
7106 case Intrinsic::amdgcn_implicitarg_ptr:
7108 case Intrinsic::amdgcn_workitem_id_x:
7111 case Intrinsic::amdgcn_workitem_id_y:
7114 case Intrinsic::amdgcn_workitem_id_z:
7117 case Intrinsic::amdgcn_workgroup_id_x:
7120 case Intrinsic::amdgcn_workgroup_id_y:
7123 case Intrinsic::amdgcn_workgroup_id_z:
7126 case Intrinsic::amdgcn_wave_id:
7128 case Intrinsic::amdgcn_lds_kernel_id:
7131 case Intrinsic::amdgcn_dispatch_ptr:
7134 case Intrinsic::amdgcn_queue_ptr:
7137 case Intrinsic::amdgcn_implicit_buffer_ptr:
7140 case Intrinsic::amdgcn_dispatch_id:
7143 case Intrinsic::r600_read_ngroups_x:
7147 case Intrinsic::r600_read_ngroups_y:
7150 case Intrinsic::r600_read_ngroups_z:
7153 case Intrinsic::r600_read_local_size_x:
7156 case Intrinsic::r600_read_local_size_y:
7160 case Intrinsic::r600_read_local_size_z:
7162 case Intrinsic::r600_read_global_size_x:
7164 case Intrinsic::r600_read_global_size_y:
7166 case Intrinsic::r600_read_global_size_z:
7168 case Intrinsic::amdgcn_fdiv_fast:
7170 case Intrinsic::amdgcn_is_shared:
7172 case Intrinsic::amdgcn_is_private:
7174 case Intrinsic::amdgcn_wavefrontsize: {
7176 MI.eraseFromParent();
7179 case Intrinsic::amdgcn_s_buffer_load:
7181 case Intrinsic::amdgcn_raw_buffer_store:
7182 case Intrinsic::amdgcn_raw_ptr_buffer_store:
7183 case Intrinsic::amdgcn_struct_buffer_store:
7184 case Intrinsic::amdgcn_struct_ptr_buffer_store:
7186 case Intrinsic::amdgcn_raw_buffer_store_format:
7187 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
7188 case Intrinsic::amdgcn_struct_buffer_store_format:
7189 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
7191 case Intrinsic::amdgcn_raw_tbuffer_store:
7192 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
7193 case Intrinsic::amdgcn_struct_tbuffer_store:
7194 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
7196 case Intrinsic::amdgcn_raw_buffer_load:
7197 case Intrinsic::amdgcn_raw_ptr_buffer_load:
7198 case Intrinsic::amdgcn_struct_buffer_load:
7199 case Intrinsic::amdgcn_struct_ptr_buffer_load:
7201 case Intrinsic::amdgcn_raw_buffer_load_format:
7202 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
7203 case Intrinsic::amdgcn_struct_buffer_load_format:
7204 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
7206 case Intrinsic::amdgcn_raw_tbuffer_load:
7207 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
7208 case Intrinsic::amdgcn_struct_tbuffer_load:
7209 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
7211 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
7212 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
7213 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
7214 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
7215 case Intrinsic::amdgcn_raw_buffer_atomic_add:
7216 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
7217 case Intrinsic::amdgcn_struct_buffer_atomic_add:
7218 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
7219 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
7220 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
7221 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
7222 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
7223 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
7224 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
7225 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
7226 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
7227 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
7228 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
7229 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
7230 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
7231 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
7232 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
7233 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
7234 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
7235 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
7236 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
7237 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
7238 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
7239 case Intrinsic::amdgcn_raw_buffer_atomic_and:
7240 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
7241 case Intrinsic::amdgcn_struct_buffer_atomic_and:
7242 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
7243 case Intrinsic::amdgcn_raw_buffer_atomic_or:
7244 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
7245 case Intrinsic::amdgcn_struct_buffer_atomic_or:
7246 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
7247 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
7248 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
7249 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
7250 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
7251 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
7252 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
7253 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
7254 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
7255 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
7256 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
7257 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
7258 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
7259 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
7260 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
7261 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
7262 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
7263 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
7264 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
7265 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
7266 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
7267 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
7268 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
7269 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
7270 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
7271 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
7272 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
7273 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
7274 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
7275 case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
7276 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16:
7277 case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
7278 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16:
7280 case Intrinsic::amdgcn_rsq_clamp:
7282 case Intrinsic::amdgcn_ds_fadd:
7283 case Intrinsic::amdgcn_ds_fmin:
7284 case Intrinsic::amdgcn_ds_fmax:
7286 case Intrinsic::amdgcn_image_bvh_intersect_ray:
7288 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
7289 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
7290 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
7291 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
7292 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
7293 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
7294 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
7295 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
7299 MI.getOperand(5).setReg(
B.buildAnyExt(
S32,
Index).getReg(0));
7302 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
7303 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
7304 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
7308 MI.getOperand(7).setReg(
B.buildAnyExt(
S32,
Index).getReg(0));
7311 case Intrinsic::amdgcn_fmed3: {
7317 MI.setDesc(
B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
7318 MI.removeOperand(1);
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID)
static LLT getBufferRsrcScalarType(const LLT Ty)
static cl::opt< bool > EnableNewLegality("amdgpu-global-isel-new-legality", cl::desc("Use GlobalISel desired legality, rather than try to use" "rules compatible with selection patterns"), cl::init(false), cl::ReallyHidden)
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, unsigned Flags)
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)
static LegalityPredicate isSmallOddVector(unsigned TypeIdx)
static LegalizeMutation oneMoreElement(unsigned TypeIdx)
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size)
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags)
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, const LLT MemTy)
Return true if a load or store of the type should be lowered with a bitcast to a different type.
static constexpr unsigned FPEnvModeBitField
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size)
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
static bool isRegisterType(LLT Ty)
static bool isRegisterVectorElementType(LLT EltTy)
static bool isRegisterSize(unsigned Size)
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)
static LegalityPredicate isWideVec16(unsigned TypeIdx)
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx)
static std::initializer_list< LLT > AllS32Vectors
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B)
Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is the form in which the valu...
static std::initializer_list< LLT > AllS16Vectors
static std::pair< Register, Register > emitReciprocalU64(MachineIRBuilder &B, Register Val)
static LLT getBitcastRegisterType(const LLT Ty)
static LLT getBufferRsrcRegisterType(const LLT Ty)
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx)
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI)
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, MachineRegisterInfo &MRI, unsigned Idx)
Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial type of the operand idx an...
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C)
static constexpr unsigned SPDenormModeBitField
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, bool IsLoad, bool IsAtomic)
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
static bool isRegisterClassType(LLT Ty)
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
static constexpr unsigned FPEnvTrapBitField
static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx)
static constexpr unsigned MaxRegisterSize
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static const LLT MaxScalar
static bool hasBufferRsrcWorkaround(const LLT Ty)
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
static std::initializer_list< LLT > AllS64Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
static std::initializer_list< LLT > AllScalarTypes
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID)
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
static bool isRegisterVectorType(LLT Ty)
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
unsigned const TargetRegisterInfo * TRI
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
const char LLVMTargetMachineRef TM
const SmallVectorImpl< MachineOperand > & Cond
#define FP_DENORM_FLUSH_NONE
Interface definition for SIInstrInfo.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static constexpr int Concat[]
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool IsTyped, bool IsFormat) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, bool IsFormat) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool IsFormat, bool IsTyped) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
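The per-opcode helpers listed above are reached through legalizeCustom. As an abbreviated sketch of that dispatch pattern (the real override routes many more opcodes; only two illustrative cases are shown, and B/MRI are taken from the helper as assumed here):
  bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI,
                                           LostDebugLocObserver &LocObserver) const {
    MachineIRBuilder &B = Helper.MIRBuilder;   // builder already positioned at MI
    MachineRegisterInfo &MRI = *B.getMRI();    // virtual register information
    switch (MI.getOpcode()) {
    case TargetOpcode::G_ADDRSPACE_CAST:
      return legalizeAddrSpaceCast(MI, MRI, B);
    case TargetOpcode::G_FCEIL:
      return legalizeFceil(MI, MRI, B);
    default:
      return false;                            // not custom-legalized here
    }
  }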
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool isEntryFunction() const
bool isModuleEntryFunction() const
bool hasMadMacF32Insts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool has16BitInsts() const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasVOP3PInsts() const
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
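A brief illustration (not taken from this file) of how these APFloat factories produce the constants a legalizer materializes, e.g. for clamping:
  #include "llvm/ADT/APFloat.h"
  using namespace llvm;

  const fltSemantics &Sem = APFloat::IEEEsingle();
  APFloat SmallestNormal = APFloat::getSmallestNormalized(Sem); // ~1.18e-38 for f32
  APFloat MaxFinite      = APFloat::getLargest(Sem);            // ~3.40e+38 for f32
  APFloat NegInf         = APFloat::getInf(Sem, /*Negative=*/true);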
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
size_t size() const
size - Get the array size.
@ ICMP_SLT
signed less than
@ FCMP_OLT
0 1 0 0 True if ordered and less than
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
@ ICMP_UGE
unsigned greater or equal
@ ICMP_SGT
signed greater than
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
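A minimal sketch of how these predicates are consumed when building compares through MachineIRBuilder (B, LHS, and RHS are assumed to exist):
  LLT S1 = LLT::scalar(1);
  // Signed integer "less than"; the result is a 1-bit value.
  auto SLT = B.buildICmp(CmpInst::ICMP_SLT, S1, LHS, RHS);
  // Ordered floating-point "less than" (false if either input is a NaN).
  auto OLT = B.buildFCmp(CmpInst::FCMP_OLT, S1, LHS, RHS);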
This is the shared class of boolean and integer constants.
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the type of this constant.
This class represents an Operation in the Expression.
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
bool hasArchitectedSGPRs() const
const SIInstrInfo * getInstrInfo() const override
bool hasScalarSubwordLoads() const
bool supportsGetDoorbellID() const
TrapHandlerAbi getTrapHandlerAbi() const
bool hasGFX10_AEncoding() const
const SITargetLowering * getTargetLowering() const override
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
bool isTrapHandlerEnabled() const
unsigned getNSAThreshold(const MachineFunction &MF) const
bool hasScalarSMulU64() const
bool hasNSAEncoding() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasScalarDwordx3Loads() const
Generation getGeneration() const
bool hasScalarAddSub64() const
bool hasUnpackedD16VMem() const
bool hasAddNoCarry() const
bool hasPartialNSAEncoding() const
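These subtarget queries typically gate which legalization rules get installed. A simplified sketch in that style (illustrative only, not the file's actual G_ADD rules; ST is the GCNSubtarget):
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  auto &Builder = getActionDefinitionsBuilder(TargetOpcode::G_ADD);
  if (ST.has16BitInsts())
    Builder.legalFor({S32, S16});   // 16-bit ALU ops exist on this subtarget
  else
    Builder.legalFor({S32})
           .minScalar(0, S32);      // otherwise widen small scalars to 32 bits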
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
KnownBits getKnownBits(Register R)
Simple wrapper observer that takes several observers, and calls each one for each event.
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
static constexpr LLT float64()
Get a 64-bit IEEE double value.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
constexpr bool isPointerVector() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
constexpr ElementCount getElementCount() const
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
static constexpr LLT float16()
Get a 16-bit IEEE half value.
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr bool isPointerOrPointerVector() const
constexpr LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr LLT getScalarType() const
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
static constexpr LLT float32()
Get a 32-bit IEEE float value.
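A short, self-contained illustration of the LLT constructors and queries listed above (types chosen for illustration only):
  const LLT S32   = LLT::scalar(32);            // 32-bit "bag of bits"
  const LLT V2S16 = LLT::fixed_vector(2, 16);   // <2 x s16>
  const LLT P1    = LLT::pointer(1, 64);        // 64-bit pointer in addrspace(1)

  unsigned Elts = V2S16.getNumElements();       // 2
  LLT Widened   = V2S16.changeElementSize(32);  // <2 x s32>
  LLT EltTy     = V2S16.getElementType();       // s16
  bool IsPtr    = P1.isPointer();               // true
  unsigned AS   = P1.getAddressSpace();         // 1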
This is an important class for using LLVM in a threaded context.
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1).
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
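The rule-building methods above chain on a LegalizeRuleSet. An illustrative ruleset, not the file's actual rules, might look like:
  const LLT S16 = LLT::scalar(16), S32 = LLT::scalar(32), S64 = LLT::scalar(64);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  getActionDefinitionsBuilder(TargetOpcode::G_AND)
      .legalFor({S32, S64, V2S16})     // directly selectable types
      .clampScalar(0, S32, S64)        // keep scalar widths within [s32, s64]
      .widenScalarToNextPow2(0)        // round odd widths up to a power of two
      .clampMaxNumElements(0, S16, 2)  // no more than <2 x s16>
      .scalarize(0);                   // split any remaining vectors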
LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from CastTy.
LegalizeResult lowerFMad(MachineInstr &MI)
GISelKnownBits * getKnownBits() const
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
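Inside a custom legalization hook, the LegalizerHelper members above are used roughly as in this hypothetical lowering (lowerFNegToXor and the bitmask trick are illustrative and not taken from this file):
  static bool lowerFNegToXor(LegalizerHelper &Helper, MachineInstr &MI) {
    MachineIRBuilder &B = Helper.MIRBuilder;  // already positioned at MI
    Register Dst = MI.getOperand(0).getReg();
    Register Src = MI.getOperand(1).getReg();
    // fneg x  ==>  xor x, 0x80000000 (flip the IEEE sign bit of an s32 value).
    auto SignMask = B.buildConstant(LLT::scalar(32), 0x80000000u);
    B.buildXor(Dst, Src, SignMask);
    MI.eraseFromParent();
    return true;
  }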
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do so.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
MutableArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Wrapper class representing virtual and physical registers.
constexpr bool isValid() const
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
static constexpr bool isPhysicalRegister(unsigned Reg)
Return true if the specified register number is in the physical register namespace.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
bool hasWorkGroupIDZ() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void truncate(size_type N)
Like resize, but requires that N is less than size().
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
bool equals(StringRef RHS) const
equals - Check for string equality; this is more efficient than compare() when the relative ordering of inequal strings isn't needed.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save and restore.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command line.
The instances of the Type class are immutable: once they are created, they are never changed.
A Use represents the edge between a Value definition and its users.
StringRef getName() const
Return a constant reference to the value's name.
constexpr ScalarTy getFixedValue() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
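These address-space numbers are what the legalizer uses when forming pointer LLTs. A sketch in the style of the GetAddrSpacePtr lambda near the top of the file (the file itself queries the TargetMachine for the pointer width; the widths below follow the AMDGPU data layout):
  const LLT GlobalPtr   = LLT::pointer(AMDGPUAS::GLOBAL_ADDRESS, 64);   // 64-bit global pointer
  const LLT LocalPtr    = LLT::pointer(AMDGPUAS::LOCAL_ADDRESS, 32);    // 32-bit LDS pointer
  const LLT PrivatePtr  = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32);  // 32-bit scratch pointer
  const LLT ConstantPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); // 64-bit constant pointer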
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isGFX11Plus(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelKnownBits *KnownBits=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
@ C
The default llvm calling convention, compatible with C.
LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the given size.
LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than second type index.
LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than second type index.
LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the given size.
LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size)
True if the total bitwidth of the specified type index is Size bits.
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
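Predicates and mutations plug into the conditional rule methods; an illustrative pairing (not this file's actual rules) might read:
  getActionDefinitionsBuilder(TargetOpcode::G_FADD)
      // Pad <3 x s32> out to <4 x s32> so it matches a power-of-2 register size.
      .moreElementsIf(
          [](const LegalityQuery &Q) {
            const LLT Ty = Q.Types[0];
            return Ty.isVector() && Ty.getNumElements() == 3 &&
                   Ty.getScalarSizeInBits() == 32;
          },
          changeTo(0, LLT::fixed_vector(4, 32)))
      // Break unusually wide vectors back down to their elements.
      .fewerElementsIf(
          [](const LegalityQuery &Q) {
            const LLT Ty = Q.Types[0];
            return Ty.isVector() && Ty.getNumElements() > 4;
          },
          scalarize(0));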
@ Implicit
Not emitted register (e.g. carry, or temporary result).
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
int popcount(T Value) noexcept
Count the number of set bits in a value.
const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT that fits in int64_t, returns it.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
@ Mul
Product of integers.
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT, returns its APInt value and def register.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
unsigned Log2(Align A)
Returns the log2 of the alignment.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
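The arithmetic helpers above (from MathExtras.h) behave as in this small worked example:
  #include "llvm/Support/MathExtras.h"
  using namespace llvm;

  bool P     = isPowerOf2_32(96);   // false: 96 = 2^5 * 3
  uint64_t N = NextPowerOf2(96);    // 128, strictly greater than 96
  uint64_t C = divideCeil(96, 64);  // 2: number of 64-bit pieces covering 96 bits
  unsigned L = Log2_32_Ceil(96);    // 7, since 2^7 = 128 >= 96
  uint64_t F = PowerOf2Ceil(96);    // 128, the power of two >= 96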
std::function< bool(const LegalityQuery &)> LegalityPredicate
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
static constexpr uint64_t encode(Fields... Values)
MIMGBaseOpcode BaseOpcode
static const fltSemantics & IEEEsingle() LLVM_READNONE
static const fltSemantics & IEEEdouble() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environment.
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
This class contains a discriminated union of information about pointers in memory operands, relating them back to LLVM IR or to virtual locations (such as frame indices) that are exposed during codegen.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions.
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs per IEEE 754-2008.
DenormalMode FP32Denormals
If this is set, neither input or output denormals are flushed for most f32 instructions.