#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;
47 "amdgpu-global-isel-new-legality",
48 cl::desc(
"Use GlobalISel desired legality, rather than try to use"
49 "rules compatible with selection patterns"),
    const LLT Ty = Query.Types[TypeIdx];

    EltSize > 1 && EltSize < 32 &&

    const LLT Ty = Query.Types[TypeIdx];

    const LLT Ty = Query.Types[TypeIdx];

    const LLT Ty = Query.Types[TypeIdx];
    return std::pair(TypeIdx,

    const LLT Ty = Query.Types[TypeIdx];
    unsigned Pieces = (Size + 63) / 64;

    const LLT Ty = Query.Types[TypeIdx];
    const int NextMul32 = (Size + 31) / 32;
    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
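    // Worked example for the rounding above, assuming a <5 x s16> vector:
    // Size = 80, so NextMul32 = (80 + 31) / 32 = 3, and
    // NewNumElts = (32 * 3 + 16 - 1) / 16 = 6, i.e. the vector is padded to
    // <6 x s16> (96 bits, a whole number of 32-bit registers).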
    const LLT Ty = Query.Types[TypeIdx];
    assert(EltSize == 32 || EltSize == 64);
    for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {

    const LLT Ty = Query.Types[TypeIdx];

    const LLT Ty = Query.Types[TypeIdx];

    const LLT QueryTy = Query.Types[TypeIdx];

    const LLT QueryTy = Query.Types[TypeIdx];

    const LLT QueryTy = Query.Types[TypeIdx];

    return EltSize == 16 || EltSize % 32 == 0;

    return EltSize == 32 || EltSize == 64 ||
           EltSize == 128 || EltSize == 256;

    LLT Ty = Query.Types[TypeIdx];

    const LLT QueryTy = Query.Types[TypeIdx];

    const LLT Ty = Query.Types[TypeIdx];
    Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
                                     bool IsLoad, bool IsAtomic) {
    return ST.enableFlatScratch() ? 128 : 32;
    return ST.useDS128() ? 128 : 64;
    return IsLoad ? 512 : 128;
    return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
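// The early returns above give the widest single memory access, in bits, that
// is allowed for the queried address space (e.g. 128-bit DS accesses when
// useDS128() is set, and only 32-bit private accesses unless flat-scratch or
// multi-dword scratch addressing is available).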
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
  unsigned AS = Query.Types[1].getAddressSpace();

  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);

                        AtomicOrdering::NotAtomic))

  if (!ST.hasDwordx3LoadStores())

  if (AlignBits < MemSize) {
                                          Align(AlignBits / 8)))
  return EltSize != 32 && EltSize != 64;

  if (Size != MemSizeInBits)

                               uint64_t AlignInBits, unsigned AddrSpace,
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
  if (AlignInBits < RoundedSize)
                                  RoundedSize, AddrSpace, Align(AlignInBits / 8),

  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
                           Query.Types[1].getAddressSpace(), Opcode);
  const unsigned NumParts = PointerTy.getSizeInBits() / 32;

    Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
    std::array<Register, 4> VectorElems;
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    for (unsigned I = 0; I < NumParts; ++I)
          B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
    B.buildMergeValues(MO, VectorElems);

  Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
  B.buildIntToPtr(MO, Scalar);
  const unsigned NumParts = PointerTy.getSizeInBits() / 32;

    auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
    for (unsigned I = 0; I < NumParts; ++I)
    return B.buildBuildVector(VectorTy, PointerParts).getReg(0);

  Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
  return B.buildBitcast(VectorTy, Scalar).getReg(0);
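// The helpers above move buffer resource ("fat") pointers through an
// equivalent integer form (a vector of s32 parts when the pointer spans
// several dwords, otherwise a plain scalar bitcast) so that the surrounding
// memory operation can be legalized on ordinary integer types.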
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {

  const LLT BufferStridedPtr =
  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr

  const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};

  const std::initializer_list<LLT> FPTypesBase = {
  const std::initializer_list<LLT> FPTypes16 = {
  const std::initializer_list<LLT> FPTypesPK16 = {
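  // The initializer lists above group LLTs that share legalization rules:
  // address spaces with 64-bit vs. 32-bit pointers, the 128-bit rsrc pointer,
  // and the floating-point types that become available with 16-bit and packed
  // 16-bit instruction support.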
      .clampMaxNumElementsStrict(0, S16, 2)
      .clampMaxNumElementsStrict(0, S16, 2)
      .clampMaxNumElementsStrict(0, S16, 2)
      .clampMaxNumElementsStrict(0, S16, 2)
      .minScalarOrElt(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .widenScalarToNextMultipleOf(0, 32)
      .widenScalarToNextMultipleOf(0, 32);
      .minScalarOrElt(0, S32)

      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
      .clampMaxNumElements(0, S8, 2)

      {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})

       LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .clampScalar(0, S16, S64);

      {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
       G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})

       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
    .legalFor(FPTypesPK16)
    .clampScalar(0, S16, S64);

    .clampScalar(0, S32, S64);

    .clampScalar(0, S32, S64);

    .clampScalar(0, S32, S64)
    .clampScalar(1, S32, S32)

    .clampScalar(1, S32, S32)

    FMad.customFor({S32, S16});
    FMad.customFor({S32});
    FMad.customFor({S16});

  FRem.minScalar(0, S32)

    .clampMaxNumElements(0, S16, 2)

    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);
  getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
      .clampScalar(0, S16, S64)

  getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)

  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})

  getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
      .clampScalar(0, S16, S64)

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S16, S64)

    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S32, S64)

    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S32, S64)

  getActionDefinitionsBuilder(G_PTR_ADD)
      .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
      .scalarSameSizeAs(1, 0);
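  // scalarSameSizeAs(1, 0) pins the G_PTR_ADD offset operand to the pointer
  // width, so 32-bit address spaces take a 32-bit offset and 64-bit ones a
  // 64-bit offset; fat and strided buffer pointers are rejected outright
  // (unsupportedFor) rather than legalized here.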
  getActionDefinitionsBuilder(G_PTRMASK)
      .scalarSameSizeAs(1, 0)

  getActionDefinitionsBuilder(G_ICMP)

      .legalForCartesianProduct(
          {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
      .legalForCartesianProduct(
          {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});

      .widenScalarToNextPow2(1)
      .clampScalar(1, S32, S64)

  getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
      {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);

  if (ST.hasSALUFloatInsts())
    FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});

      .widenScalarToNextPow2(1)
      .clampScalar(1, S32, S64)
  auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)

  getActionDefinitionsBuilder(G_FPOWI)
      .clampScalar(0, MinScalarFPTy, S32)

  auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
  Log2Ops.customFor({S32});
  if (ST.has16BitInsts())
    Log2Ops.legalFor({S16});
  else
    Log2Ops.customFor({S16});
  Log2Ops.scalarize(0)

  auto &LogOps = getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
  LogOps.customFor({S32, S16});
  LogOps.clampScalar(0, MinScalarFPTy, S32)

  getActionDefinitionsBuilder(G_CTPOP)
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(1, 32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32);
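  // Note that the G_CTPOP result is pinned to 32 bits (clampScalar(0, S32, S32))
  // while the source operand may stay as wide as 64 bits.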
  if (ST.has16BitInsts())
    getActionDefinitionsBuilder(G_IS_FPCLASS)
        .legalForCartesianProduct({S1}, FPTypes16)
        .widenScalarToNextPow2(1)

    getActionDefinitionsBuilder(G_IS_FPCLASS)
        .legalForCartesianProduct({S1}, FPTypesBase)
        .lowerFor({S1, S16})
        .widenScalarToNextPow2(1)

  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
      .clampScalar(0, S32, S32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32)
      .widenScalarToNextPow2(1, 32)

  getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
      .clampScalar(0, S32, S32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32)
      .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
      .clampScalar(0, S32, S32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32)
      .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(0);
  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
        .clampMaxNumElementsStrict(0, S16, 2)
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
          .clampMaxNumElements(0, S16, 2)
          .widenScalarToNextPow2(0)

      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
          .widenScalarToNextPow2(0)

    getActionDefinitionsBuilder(G_BSWAP)
        .widenScalarToNextPow2(0)

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .widenScalarToNextPow2(0)

  getActionDefinitionsBuilder(G_INTTOPTR)
      .legalForCartesianProduct(AddrSpaces64, {S64})
      .legalForCartesianProduct(AddrSpaces32, {S32})

  getActionDefinitionsBuilder(G_PTRTOINT)
      .legalForCartesianProduct(AddrSpaces64, {S64})
      .legalForCartesianProduct(AddrSpaces32, {S32})
  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

    unsigned NumRegs = (MemSize + 31) / 32;

    if (!ST.hasDwordx3LoadStores())

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);

    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},
                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});

    Actions.unsupportedIf(
        typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));

    Actions.customIf(typeIs(1, Constant32Ptr));

          return !Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

          if (DstSize > MemSize)
          if (MemSize > MaxSize)

          return Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);

        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
          if (MemSize > MaxSize) {
            if (MaxSize % EltSize == 0) {

            unsigned NumPieces = MemSize / MaxSize;

            if (NumPieces == 1 || NumPieces >= NumElts ||
                NumElts % NumPieces != 0)
              return std::pair(0, EltTy);

            return std::pair(0, EltTy);

          return std::pair(0, EltTy);

        .widenScalarToNextPow2(0)
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                                  {S32, GlobalPtr, S16, 2 * 8},
                                                  {S32, LocalPtr, S8, 8},
                                                  {S32, LocalPtr, S16, 16},
                                                  {S32, PrivatePtr, S8, 8},
                                                  {S32, PrivatePtr, S16, 16},
                                                  {S32, ConstantPtr, S8, 8},
                                                  {S32, ConstantPtr, S16, 2 * 8}})

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});

  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)
  auto &Atomics = getActionDefinitionsBuilder(
      {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
       G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
       G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
       G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
      .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
                 {S64, GlobalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAddF32()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasLdsAtomicAddF64())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasAtomicDsPkAdd16Insts())
      Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});

  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});
  if (ST.hasFlatAtomicFaddF32Inst())
    Atomic.legalFor({{S32, FlatPtr}});

  if (ST.hasGFX90AInsts()) {

  if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
      ST.hasAtomicBufferGlobalPkAddF16Insts())
    Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
  if (ST.hasAtomicGlobalPkAddBF16Inst())
    Atomic.legalFor({{V2BF16, GlobalPtr}});
  if (ST.hasAtomicFlatPkAdd16Insts())
    Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
  auto &AtomicFMinFMax =
      getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
          .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});

  if (ST.hasAtomicFMinFMaxF32GlobalInsts())
    AtomicFMinFMax.legalFor({{F32, GlobalPtr}, {F32, BufferFatPtr}});
  if (ST.hasAtomicFMinFMaxF64GlobalInsts())
    AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
  if (ST.hasAtomicFMinFMaxF32FlatInsts())
    AtomicFMinFMax.legalFor({F32, FlatPtr});
  if (ST.hasAtomicFMinFMaxF64FlatInsts())
    AtomicFMinFMax.legalFor({F64, FlatPtr});

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
      .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                  {S32, FlatPtr}, {S64, FlatPtr}})
      .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});

  getActionDefinitionsBuilder(G_SELECT)
                 LocalPtr, FlatPtr, PrivatePtr,
      .clampScalar(0, S16, S64)
      .clampMaxNumElements(0, S32, 2)
      .clampMaxNumElements(0, LocalPtr, 2)
      .clampMaxNumElements(0, PrivatePtr, 2)
      .widenScalarToNextPow2(0)
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
          .clampMaxNumElements(0, S16, 2);

      Shifts.legalFor({{S16, S16}});

    Shifts.widenScalarIf(
          const LLT AmountTy = Query.Types[1];

    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})

    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})

  Shifts.scalarize(0);
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];

          const bool isLegalVecType =

          return (EltSize == 32 || EltSize == 64) &&

          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;

        .clampScalar(EltTypeIdx, S32, S64)
        .clampScalar(VecTypeIdx, S32, S64)
        .clampScalar(IdxTypeIdx, S32, S32)
        .clampMaxNumElements(VecTypeIdx, S32, 32)

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    getActionDefinitionsBuilder(Op)
          const LLT BigTy = Query.Types[BigTyIdx];

          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];

          const LLT BigTy = Query.Types[BigTyIdx];

          const LLT LitTy = Query.Types[LitTyIdx];

        .widenScalarToNextPow2(BigTyIdx, 32);
  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)

  if (ST.hasScalarPackInsts()) {
        .minScalarOrElt(0, S16)

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)

    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
      .clampMaxNumElements(0, S32, 32)
      .clampMaxNumElements(1, S16, 2)
      .clampMaxNumElements(0, S16, 64);

  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];

    auto &Builder = getActionDefinitionsBuilder(Op)
          const LLT BigTy = Query.Types[BigTyIdx];

        .widenScalarToNextPow2(LitTyIdx, 16)

        .clampScalar(LitTyIdx, S32, S512)
        .widenScalarToNextPow2(LitTyIdx, 32)

        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },

        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
            const LLT Ty = Query.Types[LitTyIdx];

    Builder.widenScalarIf(
          const LLT Ty = Query.Types[BigTyIdx];

          const LLT &Ty = Query.Types[BigTyIdx];

          if (NewSizeInBits >= 256) {
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;

          return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
                        .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
        .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});

    SextInReg.lowerFor({{S32}, {S64}});

      .clampScalar(0, S32, S64)

  getActionDefinitionsBuilder({G_ROTR, G_ROTL})

  getActionDefinitionsBuilder(G_FSHR)
      .clampMaxNumElementsStrict(0, S16, 2)

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
        .clampMaxNumElementsStrict(0, S16, 2)

    getActionDefinitionsBuilder(G_FSHL)

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)

  getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
      .clampScalar(1, S32, S32)
      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(0)

  getActionDefinitionsBuilder(
       G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
       G_READ_REGISTER, G_WRITE_REGISTER,

  if (ST.hasIEEEMinMax()) {
    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
        .legalFor(FPTypesPK16)
        .clampMaxNumElements(0, S16, 2)

    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})

  getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                               G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                               G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})

  getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();

  getLegacyLegalizerInfo().computeTables();
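  // Every rule must be registered before this point: computeTables() bakes
  // the legacy rule set into the per-opcode tables the legalizer queries at
  // run time.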
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP:
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
  case TargetOpcode::G_INSERT_VECTOR_ELT:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_GLOBAL_VALUE:
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_STORE:
  case TargetOpcode::G_FMAD:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FFREXP:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP10:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
  case TargetOpcode::G_STACKSAVE:
  case TargetOpcode::G_GET_FPENV:
  case TargetOpcode::G_SET_FPENV:
  case TargetOpcode::G_TRAP:
  case TargetOpcode::G_DEBUGTRAP:
  if (ST.hasApertureRegs()) {
                           ? AMDGPU::SRC_SHARED_BASE
                           : AMDGPU::SRC_PRIVATE_BASE;

    MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
    return B.buildUnmerge(S32, Dst).getReg(1);

  Register LoadAddr = MRI.createGenericVirtualRegister(

        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);

    Register KernargPtrReg = MRI.createGenericVirtualRegister(

    B.buildPtrAdd(LoadAddr, KernargPtrReg,
    return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);

  Register QueuePtr = MRI.createGenericVirtualRegister(

  B.buildPtrAdd(LoadAddr, QueuePtr,
                B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2292 switch (Def->getOpcode()) {
2293 case AMDGPU::G_FRAME_INDEX:
2294 case AMDGPU::G_GLOBAL_VALUE:
2295 case AMDGPU::G_BLOCK_ADDR:
2297 case AMDGPU::G_CONSTANT: {
2298 const ConstantInt *CI = Def->getOperand(1).getCImm();
2315 assert(
MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2316 (isa<GIntrinsic>(
MI) && cast<GIntrinsic>(
MI).getIntrinsicID() ==
2317 Intrinsic::amdgcn_addrspacecast_nonnull));
2321 Register Src = isa<GIntrinsic>(
MI) ?
MI.getOperand(2).getReg()
2322 :
MI.getOperand(1).getReg();
2323 LLT DstTy =
MRI.getType(Dst);
2324 LLT SrcTy =
MRI.getType(Src);
2335 if (
TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2336 MI.setDesc(
B.getTII().get(TargetOpcode::G_BITCAST));
2347 B.buildExtract(Dst, Src, 0);
2348 MI.eraseFromParent();
2352 unsigned NullVal =
TM.getNullPointerValue(DestAS);
2354 auto SegmentNull =
B.buildConstant(DstTy, NullVal);
2355 auto FlatNull =
B.buildConstant(SrcTy, 0);
2358 auto PtrLo32 =
B.buildExtract(DstTy, Src, 0);
2362 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2364 MI.eraseFromParent();
2376 Register SrcAsInt =
B.buildPtrToInt(
S32, Src).getReg(0);
2380 auto BuildPtr =
B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
2385 B.buildCopy(Dst, BuildPtr);
2386 MI.eraseFromParent();
2390 auto SegmentNull =
B.buildConstant(SrcTy,
TM.getNullPointerValue(SrcAS));
2391 auto FlatNull =
B.buildConstant(DstTy,
TM.getNullPointerValue(DestAS));
2394 SegmentNull.getReg(0));
2396 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2398 MI.eraseFromParent();
2405 B.buildExtract(Dst, Src, 0);
2406 MI.eraseFromParent();
2414 auto PtrLo =
B.buildPtrToInt(
S32, Src);
2415 auto HighAddr =
B.buildConstant(
S32, AddrHiVal);
2416 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2417 MI.eraseFromParent();
2422 MF.
getFunction(),
"invalid addrspacecast",
B.getDebugLoc());
2425 Ctx.
diagnose(InvalidAddrSpaceCast);
2427 MI.eraseFromParent();
2435 LLT Ty =
MRI.getType(Src);
2441 auto C1 =
B.buildFConstant(Ty, C1Val);
2442 auto CopySign =
B.buildFCopysign(Ty, C1, Src);
2445 auto Tmp1 =
B.buildFAdd(Ty, Src, CopySign);
2446 auto Tmp2 =
B.buildFSub(Ty, Tmp1, CopySign);
2448 auto C2 =
B.buildFConstant(Ty, C2Val);
2449 auto Fabs =
B.buildFAbs(Ty, Src);
2452 B.buildSelect(
MI.getOperand(0).getReg(),
Cond, Src, Tmp2);
2453 MI.eraseFromParent();
2471 auto Trunc =
B.buildIntrinsicTrunc(
S64, Src);
2473 const auto Zero =
B.buildFConstant(
S64, 0.0);
2474 const auto One =
B.buildFConstant(
S64, 1.0);
2477 auto And =
B.buildAnd(
S1, Lt0, NeTrunc);
2478 auto Add =
B.buildSelect(
S64,
And, One, Zero);
2481 B.buildFAdd(
MI.getOperand(0).getReg(), Trunc,
Add);
2482 MI.eraseFromParent();
2490 Register Src0Reg =
MI.getOperand(1).getReg();
2491 Register Src1Reg =
MI.getOperand(2).getReg();
2492 auto Flags =
MI.getFlags();
2493 LLT Ty =
MRI.getType(DstReg);
2495 auto Div =
B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2496 auto Trunc =
B.buildIntrinsicTrunc(Ty, Div, Flags);
2497 auto Neg =
B.buildFNeg(Ty, Trunc, Flags);
2498 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2499 MI.eraseFromParent();
2505 const unsigned FractBits = 52;
2506 const unsigned ExpBits = 11;
2509 auto Const0 =
B.buildConstant(
S32, FractBits - 32);
2510 auto Const1 =
B.buildConstant(
S32, ExpBits);
2512 auto ExpPart =
B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {
S32})
2514 .addUse(Const0.getReg(0))
2515 .addUse(Const1.getReg(0));
2517 return B.buildSub(
S32, ExpPart,
B.buildConstant(
S32, 1023));
2531 auto Unmerge =
B.buildUnmerge({
S32,
S32}, Src);
2538 const unsigned FractBits = 52;
2541 const auto SignBitMask =
B.buildConstant(
S32, UINT32_C(1) << 31);
2542 auto SignBit =
B.buildAnd(
S32,
Hi, SignBitMask);
2544 const auto FractMask =
B.buildConstant(
S64, (UINT64_C(1) << FractBits) - 1);
2546 const auto Zero32 =
B.buildConstant(
S32, 0);
2549 auto SignBit64 =
B.buildMergeLikeInstr(
S64, {Zero32, SignBit});
2551 auto Shr =
B.buildAShr(
S64, FractMask, Exp);
2552 auto Not =
B.buildNot(
S64, Shr);
2553 auto Tmp0 =
B.buildAnd(
S64, Src, Not);
2554 auto FiftyOne =
B.buildConstant(
S32, FractBits - 1);
2559 auto Tmp1 =
B.buildSelect(
S64, ExpLt0, SignBit64, Tmp0);
2560 B.buildSelect(
MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2561 MI.eraseFromParent();
2577 auto Unmerge =
B.buildUnmerge({
S32,
S32}, Src);
2578 auto ThirtyTwo =
B.buildConstant(
S32, 32);
2580 if (
MRI.getType(Dst) ==
S64) {
2581 auto CvtHi =
Signed ?
B.buildSITOFP(
S64, Unmerge.getReg(1))
2582 :
B.buildUITOFP(
S64, Unmerge.getReg(1));
2584 auto CvtLo =
B.buildUITOFP(
S64, Unmerge.getReg(0));
2585 auto LdExp =
B.buildFLdexp(
S64, CvtHi, ThirtyTwo);
2588 B.buildFAdd(Dst, LdExp, CvtLo);
2589 MI.eraseFromParent();
2595 auto One =
B.buildConstant(
S32, 1);
2599 auto ThirtyOne =
B.buildConstant(
S32, 31);
2600 auto X =
B.buildXor(
S32, Unmerge.getReg(0), Unmerge.getReg(1));
2601 auto OppositeSign =
B.buildAShr(
S32,
X, ThirtyOne);
2602 auto MaxShAmt =
B.buildAdd(
S32, ThirtyTwo, OppositeSign);
2603 auto LS =
B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {
S32})
2604 .addUse(Unmerge.getReg(1));
2605 auto LS2 =
B.buildSub(
S32, LS, One);
2606 ShAmt =
B.buildUMin(
S32, LS2, MaxShAmt);
2608 ShAmt =
B.buildCTLZ(
S32, Unmerge.getReg(1));
2609 auto Norm =
B.buildShl(
S64, Src, ShAmt);
2610 auto Unmerge2 =
B.buildUnmerge({
S32,
S32}, Norm);
2611 auto Adjust =
B.buildUMin(
S32, One, Unmerge2.getReg(0));
2612 auto Norm2 =
B.buildOr(
S32, Unmerge2.getReg(1), Adjust);
2613 auto FVal =
Signed ?
B.buildSITOFP(
S32, Norm2) :
B.buildUITOFP(
S32, Norm2);
2614 auto Scale =
B.buildSub(
S32, ThirtyTwo, ShAmt);
2615 B.buildFLdexp(Dst, FVal, Scale);
2616 MI.eraseFromParent();
2633 const LLT SrcLT =
MRI.getType(Src);
2636 unsigned Flags =
MI.getFlags();
2647 auto Trunc =
B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2655 Sign =
B.buildAShr(
S32, Src,
B.buildConstant(
S32, 31));
2656 Trunc =
B.buildFAbs(
S32, Trunc, Flags);
2660 K0 =
B.buildFConstant(
2661 S64, llvm::bit_cast<double>(UINT64_C( 0x3df0000000000000)));
2662 K1 =
B.buildFConstant(
2663 S64, llvm::bit_cast<double>(UINT64_C( 0xc1f0000000000000)));
2665 K0 =
B.buildFConstant(
2666 S32, llvm::bit_cast<float>(UINT32_C( 0x2f800000)));
2667 K1 =
B.buildFConstant(
2668 S32, llvm::bit_cast<float>(UINT32_C( 0xcf800000)));
2671 auto Mul =
B.buildFMul(SrcLT, Trunc, K0, Flags);
2672 auto FloorMul =
B.buildFFloor(SrcLT,
Mul, Flags);
2673 auto Fma =
B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2676 :
B.buildFPTOUI(
S32, FloorMul);
2677 auto Lo =
B.buildFPTOUI(
S32, Fma);
2681 Sign =
B.buildMergeLikeInstr(
S64, {Sign, Sign});
2683 B.buildSub(Dst,
B.buildXor(
S64,
B.buildMergeLikeInstr(
S64, {Lo, Hi}), Sign),
2686 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
2687 MI.eraseFromParent();
2697 const bool IsIEEEOp =
MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2698 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2721 LLT VecTy =
MRI.getType(Vec);
2734 auto IntVec =
B.buildPtrToInt(IntVecTy, Vec);
2735 auto IntElt =
B.buildExtractVectorElement(IntTy, IntVec,
MI.getOperand(2));
2736 B.buildIntToPtr(Dst, IntElt);
2738 MI.eraseFromParent();
2745 std::optional<ValueAndVReg> MaybeIdxVal =
2749 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2752 auto Unmerge =
B.buildUnmerge(EltTy, Vec);
2753 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2758 MI.eraseFromParent();
2773 LLT VecTy =
MRI.getType(Vec);
2787 auto IntVecSource =
B.buildPtrToInt(IntVecTy, Vec);
2788 auto IntIns =
B.buildPtrToInt(IntTy, Ins);
2789 auto IntVecDest =
B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2791 B.buildIntToPtr(Dst, IntVecDest);
2792 MI.eraseFromParent();
2799 std::optional<ValueAndVReg> MaybeIdxVal =
2804 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2807 if (IdxVal < NumElts) {
2809 for (
unsigned i = 0; i < NumElts; ++i)
2810 SrcRegs.
push_back(
MRI.createGenericVirtualRegister(EltTy));
2811 B.buildUnmerge(SrcRegs, Vec);
2813 SrcRegs[IdxVal] =
MI.getOperand(2).getReg();
2814 B.buildMergeLikeInstr(Dst, SrcRegs);
2819 MI.eraseFromParent();
2829 LLT Ty =
MRI.getType(DstReg);
2830 unsigned Flags =
MI.getFlags();
2835 auto MulVal =
B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2836 TrigVal =
B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
2837 .addUse(MulVal.getReg(0))
2841 TrigVal =
B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2844 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2848 MI.eraseFromParent();
2856 unsigned GAFlags)
const {
2857 assert(isInt<32>(
Offset + 4) &&
"32-bit offset is expected!");
2885 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2896 if (!
B.getMRI()->getRegClassOrNull(PCReg))
2897 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2900 B.buildExtract(DstReg, PCReg, 0);
2914 Register AddrLo = !RequiresHighHalf && !
MRI.getRegClassOrNull(DstReg)
2916 :
MRI.createGenericVirtualRegister(
S32);
2918 if (!
MRI.getRegClassOrNull(AddrLo))
2919 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
2922 B.buildInstr(AMDGPU::S_MOV_B32)
2927 if (RequiresHighHalf) {
2929 "Must provide a 64-bit pointer type!");
2932 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
2934 B.buildInstr(AMDGPU::S_MOV_B32)
2944 if (!
MRI.getRegClassOrNull(AddrDst))
2945 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
2947 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
2951 if (AddrDst != DstReg)
2952 B.buildCast(DstReg, AddrDst);
2953 }
else if (AddrLo != DstReg) {
2956 B.buildCast(DstReg, AddrLo);
2964 LLT Ty =
MRI.getType(DstReg);
2973 GV->
getName() !=
"llvm.amdgcn.module.lds") {
2976 Fn,
"local memory global used by non-kernel function",
MI.getDebugLoc(),
2986 B.buildUndef(DstReg);
2987 MI.eraseFromParent();
3007 if (
B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
3011 auto Sz =
B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {
S32});
3012 B.buildIntToPtr(DstReg, Sz);
3013 MI.eraseFromParent();
3019 *cast<GlobalVariable>(GV)));
3020 MI.eraseFromParent();
3026 MI.eraseFromParent();
3034 MI.eraseFromParent();
3040 MI.eraseFromParent();
3045 Register GOTAddr =
MRI.createGenericVirtualRegister(PtrTy);
3058 auto Load =
B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3059 B.buildExtract(DstReg, Load, 0);
3061 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3063 MI.eraseFromParent();
3081 LLT PtrTy =
MRI.getType(PtrReg);
3086 auto Cast =
B.buildAddrSpaceCast(ConstPtr, PtrReg);
3088 MI.getOperand(1).setReg(Cast.getReg(0));
3093 if (
MI.getOpcode() != AMDGPU::G_LOAD)
3097 LLT ValTy =
MRI.getType(ValReg);
3119 if (WideMemSize == ValSize) {
3125 MI.setMemRefs(MF, {WideMMO});
3131 if (ValSize > WideMemSize)
3138 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3139 B.buildTrunc(ValReg, WideLoad).getReg(0);
3146 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3147 B.buildExtract(ValReg, WideLoad, 0);
3151 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3152 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3156 MI.eraseFromParent();
3169 Register DataReg =
MI.getOperand(0).getReg();
3170 LLT DataTy =
MRI.getType(DataReg);
3184 LLT Ty =
MRI.getType(
MI.getOperand(0).getReg());
3213 "this should not have been custom lowered");
3215 LLT ValTy =
MRI.getType(CmpVal);
3218 Register PackedVal =
B.buildBuildVector(VecTy, { NewVal, CmpVal }).
getReg(0);
3220 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3224 .setMemRefs(
MI.memoperands());
3226 MI.eraseFromParent();
3235 case TargetOpcode::G_INTRINSIC: {
3237 case Intrinsic::amdgcn_frexp_mant:
3245 case TargetOpcode::G_FFREXP: {
3250 case TargetOpcode::G_FPEXT: {
3274std::pair<Register, Register>
3276 unsigned Flags)
const {
3281 auto SmallestNormal =
B.buildFConstant(
3283 auto IsLtSmallestNormal =
3286 auto Scale32 =
B.buildFConstant(
F32, 0x1.0p+32);
3287 auto One =
B.buildFConstant(
F32, 1.0);
3289 B.buildSelect(
F32, IsLtSmallestNormal, Scale32, One, Flags);
3290 auto ScaledInput =
B.buildFMul(
F32, Src, ScaleFactor, Flags);
3292 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3305 LLT Ty =
B.getMRI()->getType(Dst);
3306 unsigned Flags =
MI.getFlags();
3311 auto Ext =
B.buildFPExt(
F32, Src, Flags);
3312 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_log, {
F32})
3313 .addUse(Ext.getReg(0))
3315 B.buildFPTrunc(Dst,
Log2, Flags);
3316 MI.eraseFromParent();
3324 B.buildIntrinsic(Intrinsic::amdgcn_log, {
MI.getOperand(0)})
3327 MI.eraseFromParent();
3331 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3332 .addUse(ScaledInput)
3335 auto ThirtyTwo =
B.buildFConstant(Ty, 32.0);
3336 auto Zero =
B.buildFConstant(Ty, 0.0);
3338 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3339 B.buildFSub(Dst,
Log2, ResultOffset, Flags);
3341 MI.eraseFromParent();
3347 auto FMul =
B.buildFMul(Ty,
X,
Y, Flags);
3348 return B.buildFAdd(Ty,
FMul, Z, Flags).getReg(0);
3353 const bool IsLog10 =
MI.getOpcode() == TargetOpcode::G_FLOG10;
3354 assert(IsLog10 ||
MI.getOpcode() == TargetOpcode::G_FLOG);
3359 unsigned Flags =
MI.getFlags();
3360 const LLT Ty =
MRI.getType(
X);
3370 TM.Options.ApproxFuncFPMath ||
TM.Options.UnsafeFPMath) {
3373 auto PromoteSrc =
B.buildFPExt(
F32,
X);
3375 B.buildFPTrunc(Dst, LogVal);
3380 MI.eraseFromParent();
3389 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(
X).setMIFlags(Flags);
3394 const float c_log10 = 0x1.344134p-2f;
3395 const float cc_log10 = 0x1.09f79ep-26f;
3398 const float c_log = 0x1.62e42ep-1f;
3399 const float cc_log = 0x1.efa39ep-25f;
3401 auto C =
B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3402 auto CC =
B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3404 R =
B.buildFMul(Ty,
Y,
C, Flags).getReg(0);
3405 auto NegR =
B.buildFNeg(Ty, R, Flags);
3406 auto FMA0 =
B.buildFMA(Ty,
Y,
C, NegR, Flags);
3407 auto FMA1 =
B.buildFMA(Ty,
Y,
CC, FMA0, Flags);
3408 R =
B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
3411 const float ch_log10 = 0x1.344000p-2f;
3412 const float ct_log10 = 0x1.3509f6p-18f;
3415 const float ch_log = 0x1.62e000p-1f;
3416 const float ct_log = 0x1.0bfbe8p-15f;
3418 auto CH =
B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3419 auto CT =
B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3421 auto MaskConst =
B.buildConstant(Ty, 0xfffff000);
3422 auto YH =
B.buildAnd(Ty,
Y, MaskConst);
3423 auto YT =
B.buildFSub(Ty,
Y, YH, Flags);
3424 auto YTCT =
B.buildFMul(Ty, YT, CT, Flags);
3427 getMad(
B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
3429 R =
getMad(
B, Ty, YH.getReg(0),
CH.getReg(0), Mad1, Flags);
3432 const bool IsFiniteOnly =
3436 if (!IsFiniteOnly) {
3439 auto Fabs =
B.buildFAbs(Ty,
Y);
3442 R =
B.buildSelect(Ty, IsFinite, R,
Y, Flags).getReg(0);
3446 auto Zero =
B.buildFConstant(Ty, 0.0);
3448 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3449 auto Shift =
B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3450 B.buildFSub(Dst, R, Shift, Flags);
3452 B.buildCopy(Dst, R);
3455 MI.eraseFromParent();
3461 unsigned Flags)
const {
3462 const double Log2BaseInverted =
3465 LLT Ty =
B.getMRI()->getType(Dst);
3470 auto LogSrc =
B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3473 auto ScaledResultOffset =
B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3474 auto Zero =
B.buildFConstant(Ty, 0.0);
3476 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3477 auto Log2Inv =
B.buildFConstant(Ty, Log2BaseInverted);
3480 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3482 auto Mul =
B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3483 B.buildFAdd(Dst,
Mul, ResultOffset, Flags);
3491 ?
B.buildFLog2(Ty, Src, Flags)
3492 :
B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3495 auto Log2BaseInvertedOperand =
B.buildFConstant(Ty, Log2BaseInverted);
3496 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3507 unsigned Flags =
MI.getFlags();
3508 LLT Ty =
B.getMRI()->getType(Dst);
3514 auto Ext =
B.buildFPExt(
F32, Src, Flags);
3515 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {
F32})
3516 .addUse(Ext.getReg(0))
3518 B.buildFPTrunc(Dst,
Log2, Flags);
3519 MI.eraseFromParent();
3529 MI.eraseFromParent();
3537 auto RangeCheckConst =
B.buildFConstant(Ty, -0x1.f80000p+6f);
3539 RangeCheckConst, Flags);
3541 auto SixtyFour =
B.buildFConstant(Ty, 0x1.0p+6f);
3542 auto Zero =
B.buildFConstant(Ty, 0.0);
3543 auto AddOffset =
B.buildSelect(
F32, NeedsScaling, SixtyFour, Zero, Flags);
3544 auto AddInput =
B.buildFAdd(
F32, Src, AddOffset, Flags);
3546 auto Exp2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3547 .addUse(AddInput.getReg(0))
3550 auto TwoExpNeg64 =
B.buildFConstant(Ty, 0x1.0p-64f);
3551 auto One =
B.buildFConstant(Ty, 1.0);
3552 auto ResultScale =
B.buildSelect(
F32, NeedsScaling, TwoExpNeg64, One, Flags);
3553 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3554 MI.eraseFromParent();
3560 LLT Ty =
B.getMRI()->getType(Dst);
3565 auto Mul =
B.buildFMul(Ty,
X, Log2E, Flags);
3569 .addUse(
Mul.getReg(0))
3572 B.buildFExp2(Dst,
Mul.getReg(0), Flags);
3578 auto Threshold =
B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3581 auto ScaleOffset =
B.buildFConstant(Ty, 0x1.0p+6f);
3582 auto ScaledX =
B.buildFAdd(Ty,
X, ScaleOffset, Flags);
3583 auto AdjustedX =
B.buildSelect(Ty, NeedsScaling, ScaledX,
X, Flags);
3586 auto ExpInput =
B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3588 auto Exp2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3589 .addUse(ExpInput.getReg(0))
3592 auto ResultScaleFactor =
B.buildFConstant(Ty, 0x1.969d48p-93f);
3593 auto AdjustedResult =
B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3594 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3602 const unsigned Flags =
MI.getFlags();
3605 LLT Ty =
MRI.getType(Dst);
3608 const bool IsExp10 =
MI.getOpcode() == TargetOpcode::G_FEXP10;
3615 MI.eraseFromParent();
3623 auto Ext =
B.buildFPExt(
F32,
X, Flags);
3626 B.buildFPTrunc(Dst, Lowered, Flags);
3627 MI.eraseFromParent();
3637 MI.eraseFromParent();
3665 const unsigned FlagsNoContract = Flags &
~MachineInstr::FmContract;
3670 const float cc_exp = 0x1.4ae0bep-26f;
3671 const float c_exp10 = 0x1.a934f0p+1f;
3672 const float cc_exp10 = 0x1.2f346ep-24f;
3674 auto C =
B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3675 PH =
B.buildFMul(Ty,
X,
C, Flags).getReg(0);
3676 auto NegPH =
B.buildFNeg(Ty, PH, Flags);
3677 auto FMA0 =
B.buildFMA(Ty,
X,
C, NegPH, Flags);
3679 auto CC =
B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3680 PL =
B.buildFMA(Ty,
X,
CC, FMA0, Flags).getReg(0);
3682 const float ch_exp = 0x1.714000p+0f;
3683 const float cl_exp = 0x1.47652ap-12f;
3685 const float ch_exp10 = 0x1.a92000p+1f;
3686 const float cl_exp10 = 0x1.4f0978p-11f;
3688 auto MaskConst =
B.buildConstant(Ty, 0xfffff000);
3689 auto XH =
B.buildAnd(Ty,
X, MaskConst);
3690 auto XL =
B.buildFSub(Ty,
X, XH, Flags);
3692 auto CH =
B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3693 PH =
B.buildFMul(Ty, XH,
CH, Flags).getReg(0);
3695 auto CL =
B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3696 auto XLCL =
B.buildFMul(Ty, XL, CL, Flags);
3699 getMad(
B, Ty, XL.getReg(0),
CH.getReg(0), XLCL.getReg(0), Flags);
3700 PL =
getMad(
B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3703 auto E =
B.buildIntrinsicRoundeven(Ty, PH, Flags);
3706 auto PHSubE =
B.buildFSub(Ty, PH, E, FlagsNoContract);
3707 auto A =
B.buildFAdd(Ty, PHSubE, PL, Flags);
3710 auto Exp2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3711 .addUse(
A.getReg(0))
3713 auto R =
B.buildFLdexp(Ty, Exp2, IntE, Flags);
3715 auto UnderflowCheckConst =
3716 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3717 auto Zero =
B.buildFConstant(Ty, 0.0);
3721 R =
B.buildSelect(Ty, Underflow, Zero, R);
3726 auto OverflowCheckConst =
3727 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3732 R =
B.buildSelect(Ty, Overflow, Inf, R, Flags);
3735 B.buildCopy(Dst, R);
3736 MI.eraseFromParent();
3745 unsigned Flags =
MI.getFlags();
3746 LLT Ty =
B.getMRI()->getType(Dst);
3751 auto Log =
B.buildFLog2(
F32, Src0, Flags);
3752 auto Mul =
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {
F32})
3753 .addUse(Log.getReg(0))
3756 B.buildFExp2(Dst,
Mul, Flags);
3757 }
else if (Ty == F16) {
3759 auto Log =
B.buildFLog2(F16, Src0, Flags);
3760 auto Ext0 =
B.buildFPExt(
F32, Log, Flags);
3761 auto Ext1 =
B.buildFPExt(
F32, Src1, Flags);
3762 auto Mul =
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {
F32})
3763 .addUse(Ext0.getReg(0))
3764 .addUse(Ext1.getReg(0))
3766 B.buildFExp2(Dst,
B.buildFPTrunc(F16,
Mul), Flags);
3770 MI.eraseFromParent();
3778 ModSrc = SrcFNeg->getOperand(1).getReg();
3780 ModSrc = SrcFAbs->getOperand(1).getReg();
3782 ModSrc = SrcFAbs->getOperand(1).getReg();
3793 Register OrigSrc =
MI.getOperand(1).getReg();
3794 unsigned Flags =
MI.getFlags();
3796 "this should not have been custom lowered");
3806 auto Fract =
B.buildIntrinsic(Intrinsic::amdgcn_fract, {
F64})
3818 B.buildFConstant(
F64, llvm::bit_cast<double>(0x3fefffffffffffff));
3826 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
3828 B.buildFMinNum(Min, Fract, Const, Flags);
3833 CorrectedFract =
B.buildSelect(
F64, IsNan, ModSrc, Min, Flags).getReg(0);
3836 auto NegFract =
B.buildFNeg(
F64, CorrectedFract, Flags);
3837 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
3839 MI.eraseFromParent();
3855 if (
MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3857 Src0 =
B.buildTrunc(
S16,
MI.getOperand(1).getReg()).getReg(0);
3858 Src1 =
B.buildTrunc(
S16,
MI.getOperand(2).getReg()).getReg(0);
3861 auto Merge =
B.buildMergeLikeInstr(
S32, {Src0, Src1});
3862 B.buildBitcast(Dst,
Merge);
3864 MI.eraseFromParent();
3881 bool UsePartialMad64_32,
3882 bool SeparateOddAlignedProducts)
const {
3897 auto getZero32 = [&]() ->
Register {
3899 Zero32 =
B.buildConstant(
S32, 0).getReg(0);
3902 auto getZero64 = [&]() ->
Register {
3904 Zero64 =
B.buildConstant(
S64, 0).getReg(0);
3909 for (
unsigned i = 0; i < Src0.
size(); ++i) {
3920 if (CarryIn.empty())
3923 bool HaveCarryOut =
true;
3925 if (CarryIn.size() == 1) {
3927 LocalAccum =
B.buildZExt(
S32, CarryIn[0]).getReg(0);
3931 CarryAccum = getZero32();
3933 CarryAccum =
B.buildZExt(
S32, CarryIn[0]).getReg(0);
3934 for (
unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3936 B.buildUAdde(
S32,
S1, CarryAccum, getZero32(), CarryIn[i])
3941 LocalAccum = getZero32();
3942 HaveCarryOut =
false;
3947 B.buildUAdde(
S32,
S1, CarryAccum, LocalAccum, CarryIn.back());
3948 LocalAccum =
Add.getReg(0);
3962 auto buildMadChain =
3965 assert((DstIndex + 1 < Accum.
size() && LocalAccum.size() == 2) ||
3966 (DstIndex + 1 >= Accum.
size() && LocalAccum.size() == 1));
3973 if (LocalAccum.size() == 1 &&
3974 (!UsePartialMad64_32 || !CarryIn.empty())) {
3977 unsigned j1 = DstIndex - j0;
3978 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3982 auto Mul =
B.buildMul(
S32, Src0[j0], Src1[j1]);
3984 LocalAccum[0] =
Mul.getReg(0);
3986 if (CarryIn.empty()) {
3987 LocalAccum[0] =
B.buildAdd(
S32, LocalAccum[0],
Mul).getReg(0);
3990 B.buildUAdde(
S32,
S1, LocalAccum[0],
Mul, CarryIn.back())
3996 }
while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4000 if (j0 <= DstIndex) {
4001 bool HaveSmallAccum =
false;
4004 if (LocalAccum[0]) {
4005 if (LocalAccum.size() == 1) {
4006 Tmp =
B.buildAnyExt(
S64, LocalAccum[0]).getReg(0);
4007 HaveSmallAccum =
true;
4008 }
else if (LocalAccum[1]) {
4009 Tmp =
B.buildMergeLikeInstr(
S64, LocalAccum).getReg(0);
4010 HaveSmallAccum =
false;
4012 Tmp =
B.buildZExt(
S64, LocalAccum[0]).getReg(0);
4013 HaveSmallAccum =
true;
4016 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4018 HaveSmallAccum =
true;
4022 unsigned j1 = DstIndex - j0;
4023 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4027 auto Mad =
B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {
S64,
S1},
4028 {Src0[j0], Src1[j1], Tmp});
4029 Tmp = Mad.getReg(0);
4030 if (!HaveSmallAccum)
4031 CarryOut.push_back(Mad.getReg(1));
4032 HaveSmallAccum =
false;
4035 }
while (j0 <= DstIndex);
4037 auto Unmerge =
B.buildUnmerge(
S32, Tmp);
4038 LocalAccum[0] = Unmerge.getReg(0);
4039 if (LocalAccum.size() > 1)
4040 LocalAccum[1] = Unmerge.getReg(1);
4067 for (
unsigned i = 0; i <= Accum.
size() / 2; ++i) {
4068 Carry OddCarryIn = std::move(OddCarry);
4069 Carry EvenCarryIn = std::move(EvenCarry);
4074 if (2 * i < Accum.
size()) {
4075 auto LocalAccum = Accum.
drop_front(2 * i).take_front(2);
4076 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4081 if (!SeparateOddAlignedProducts) {
4082 auto LocalAccum = Accum.
drop_front(2 * i - 1).take_front(2);
4083 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4085 bool IsHighest = 2 * i >= Accum.
size();
4089 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4095 Lo =
B.buildUAddo(
S32,
S1, Accum[2 * i - 1], SeparateOddOut[0]);
4097 Lo =
B.buildAdd(
S32, Accum[2 * i - 1], SeparateOddOut[0]);
4099 Lo =
B.buildUAdde(
S32,
S1, Accum[2 * i - 1], SeparateOddOut[0],
4102 Accum[2 * i - 1] =
Lo->getOperand(0).getReg();
4105 auto Hi =
B.buildUAdde(
S32,
S1, Accum[2 * i], SeparateOddOut[1],
4106 Lo->getOperand(1).getReg());
4107 Accum[2 * i] =
Hi.getReg(0);
4108 SeparateOddCarry =
Hi.getReg(1);
4115 if (
Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4116 EvenCarryIn.push_back(CarryOut);
4118 if (2 * i < Accum.
size()) {
4119 if (
Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4120 OddCarry.push_back(CarryOut);
4133 assert(
MI.getOpcode() == TargetOpcode::G_MUL);
4142 LLT Ty =
MRI.getType(DstReg);
4146 unsigned NumParts =
Size / 32;
4162 for (
unsigned i = 0; i < NumParts; ++i) {
4166 B.buildUnmerge(Src0Parts, Src0);
4167 B.buildUnmerge(Src1Parts, Src1);
4170 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4171 SeparateOddAlignedProducts);
4173 B.buildMergeLikeInstr(DstReg, AccumRegs);
4174 MI.eraseFromParent();
4186 LLT DstTy =
MRI.getType(Dst);
4187 LLT SrcTy =
MRI.getType(Src);
4189 unsigned NewOpc =
MI.getOpcode() == AMDGPU::G_CTLZ
4190 ? AMDGPU::G_AMDGPU_FFBH_U32
4191 : AMDGPU::G_AMDGPU_FFBL_B32;
4192 auto Tmp =
B.buildInstr(NewOpc, {DstTy}, {Src});
4195 MI.eraseFromParent();
4204 LLT SrcTy =
MRI.getType(Src);
4209 auto ShiftAmt =
B.buildConstant(
S32, 32u - NumBits);
4210 auto Extend =
B.buildAnyExt(
S32, {Src}).
getReg(0u);
4211 auto Shift =
B.buildShl(
S32, Extend, ShiftAmt);
4212 auto Ctlz =
B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {
S32}, {Shift});
4213 B.buildTrunc(Dst, Ctlz);
4214 MI.eraseFromParent();
4220 if (
MI.getOpcode() != TargetOpcode::G_XOR)
4223 return ConstVal && *ConstVal == -1;
4230 Register CondDef =
MI.getOperand(0).getReg();
4231 if (!
MRI.hasOneNonDBGUse(CondDef))
4239 if (!
MRI.hasOneNonDBGUse(NegatedCond))
4245 UseMI = &*
MRI.use_instr_nodbg_begin(NegatedCond);
4254 if (Next == Parent->
end()) {
4258 UncondBrTarget = &*NextMBB;
4260 if (Next->getOpcode() != AMDGPU::G_BR)
4278 *ArgRC,
B.getDebugLoc(), ArgTy);
4282 const unsigned Mask = Arg->
getMask();
4283 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4290 auto ShiftAmt =
B.buildConstant(
S32, Shift);
4291 AndMaskSrc =
B.buildLShr(
S32, LiveIn, ShiftAmt).getReg(0);
4294 B.buildAnd(DstReg, AndMaskSrc,
B.buildConstant(
S32, Mask >> Shift));
4296 B.buildCopy(DstReg, LiveIn);
4325 Arg = &WorkGroupIDX;
4326 ArgRC = &AMDGPU::SReg_32RegClass;
4330 Arg = &WorkGroupIDY;
4331 ArgRC = &AMDGPU::SReg_32RegClass;
4335 Arg = &WorkGroupIDZ;
4336 ArgRC = &AMDGPU::SReg_32RegClass;
4351 B.buildConstant(DstReg, 0);
4357 B.buildUndef(DstReg);
4361 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4372 MI.eraseFromParent();
4378 B.buildConstant(
MI.getOperand(0).getReg(),
C);
4379 MI.eraseFromParent();
4400 B.buildUndef(DstReg);
4401 MI.eraseFromParent();
4405 if (Arg->isMasked()) {
4419 MI.eraseFromParent();
4426 Register KernArgReg =
B.getMRI()->createGenericVirtualRegister(PtrTy);
4436 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
4444 Align Alignment)
const {
4448 "unexpected kernarg parameter type");
4452 B.buildLoad(DstReg,
Ptr, PtrInfo,
Align(4),
4455 MI.eraseFromParent();
4463 LLT DstTy =
MRI.getType(Dst);
4490 auto FloatY =
B.buildUITOFP(
S32,
Y);
4491 auto RcpIFlag =
B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {
S32}, {FloatY});
4492 auto Scale =
B.buildFConstant(
S32, llvm::bit_cast<float>(0x4f7ffffe));
4493 auto ScaledY =
B.buildFMul(
S32, RcpIFlag, Scale);
4494 auto Z =
B.buildFPTOUI(
S32, ScaledY);
4497 auto NegY =
B.buildSub(
S32,
B.buildConstant(
S32, 0),
Y);
4498 auto NegYZ =
B.buildMul(
S32, NegY, Z);
4499 Z =
B.buildAdd(
S32, Z,
B.buildUMulH(
S32, Z, NegYZ));
4502 auto Q =
B.buildUMulH(
S32,
X, Z);
4503 auto R =
B.buildSub(
S32,
X,
B.buildMul(
S32, Q,
Y));
4506 auto One =
B.buildConstant(
S32, 1);
4509 Q =
B.buildSelect(
S32,
Cond,
B.buildAdd(
S32, Q, One), Q);
4515 B.buildSelect(DstDivReg,
Cond,
B.buildAdd(
S32, Q, One), Q);
4518 B.buildSelect(DstRemReg,
Cond,
B.buildSub(
S32, R,
Y), R);
4537 auto Unmerge =
B.buildUnmerge(
S32, Val);
4539 auto CvtLo =
B.buildUITOFP(
S32, Unmerge.getReg(0));
4540 auto CvtHi =
B.buildUITOFP(
S32, Unmerge.getReg(1));
4542 auto Mad =
B.buildFMAD(
4544 B.buildFConstant(
S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
4546 auto Rcp =
B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {
S32}, {Mad});
4547 auto Mul1 =
B.buildFMul(
4548 S32, Rcp,
B.buildFConstant(
S32, llvm::bit_cast<float>(0x5f7ffffc)));
4551 auto Mul2 =
B.buildFMul(
4552 S32, Mul1,
B.buildFConstant(
S32, llvm::bit_cast<float>(0x2f800000)));
4553 auto Trunc =
B.buildIntrinsicTrunc(
S32, Mul2);
4556 auto Mad2 =
B.buildFMAD(
4557 S32, Trunc,
B.buildFConstant(
S32, llvm::bit_cast<float>(0xcf800000)),
4560 auto ResultLo =
B.buildFPTOUI(
S32, Mad2);
4561 auto ResultHi =
B.buildFPTOUI(
S32, Trunc);
4563 return {ResultLo.getReg(0), ResultHi.getReg(0)};
4578 auto Rcp =
B.buildMergeLikeInstr(
S64, {RcpLo, RcpHi});
4580 auto Zero64 =
B.buildConstant(
S64, 0);
4581 auto NegDenom =
B.buildSub(
S64, Zero64, Denom);
4583 auto MulLo1 =
B.buildMul(
S64, NegDenom, Rcp);
4584 auto MulHi1 =
B.buildUMulH(
S64, Rcp, MulLo1);
4586 auto UnmergeMulHi1 =
B.buildUnmerge(
S32, MulHi1);
4587 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
4588 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
4590 auto Add1_Lo =
B.buildUAddo(
S32,
S1, RcpLo, MulHi1_Lo);
4591 auto Add1_Hi =
B.buildUAdde(
S32,
S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
4592 auto Add1 =
B.buildMergeLikeInstr(
S64, {Add1_Lo, Add1_Hi});
4594 auto MulLo2 =
B.buildMul(
S64, NegDenom, Add1);
4595 auto MulHi2 =
B.buildUMulH(
S64, Add1, MulLo2);
4596 auto UnmergeMulHi2 =
B.buildUnmerge(
S32, MulHi2);
4597 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
4598 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
4600 auto Zero32 =
B.buildConstant(
S32, 0);
4601 auto Add2_Lo =
B.buildUAddo(
S32,
S1, Add1_Lo, MulHi2_Lo);
4602 auto Add2_Hi =
B.buildUAdde(
S32,
S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
4603 auto Add2 =
B.buildMergeLikeInstr(
S64, {Add2_Lo, Add2_Hi});
4605 auto UnmergeNumer =
B.buildUnmerge(
S32, Numer);
4606 Register NumerLo = UnmergeNumer.getReg(0);
4607 Register NumerHi = UnmergeNumer.getReg(1);
4609 auto MulHi3 =
B.buildUMulH(
S64, Numer, Add2);
4610 auto Mul3 =
B.buildMul(
S64, Denom, MulHi3);
4611 auto UnmergeMul3 =
B.buildUnmerge(
S32, Mul3);
4612 Register Mul3_Lo = UnmergeMul3.getReg(0);
4613 Register Mul3_Hi = UnmergeMul3.getReg(1);
4614 auto Sub1_Lo =
B.buildUSubo(
S32,
S1, NumerLo, Mul3_Lo);
4615 auto Sub1_Hi =
B.buildUSube(
S32,
S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
4616 auto Sub1_Mi =
B.buildSub(
S32, NumerHi, Mul3_Hi);
4617 auto Sub1 =
B.buildMergeLikeInstr(
S64, {Sub1_Lo, Sub1_Hi});
4619 auto UnmergeDenom =
B.buildUnmerge(
S32, Denom);
4620 Register DenomLo = UnmergeDenom.getReg(0);
4621 Register DenomHi = UnmergeDenom.getReg(1);
4624 auto C1 =
B.buildSExt(
S32, CmpHi);
4627 auto C2 =
B.buildSExt(
S32, CmpLo);
4630 auto C3 =
B.buildSelect(
S32, CmpEq, C2, C1);
4637 auto Sub2_Lo =
B.buildUSubo(
S32,
S1, Sub1_Lo, DenomLo);
4638 auto Sub2_Mi =
B.buildUSube(
S32,
S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
4639 auto Sub2_Hi =
B.buildUSube(
S32,
S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
4640 auto Sub2 =
B.buildMergeLikeInstr(
S64, {Sub2_Lo, Sub2_Hi});
4642 auto One64 =
B.buildConstant(
S64, 1);
4643 auto Add3 =
B.buildAdd(
S64, MulHi3, One64);
4649 auto C6 =
B.buildSelect(
4653 auto Add4 =
B.buildAdd(
S64, Add3, One64);
4654 auto Sub3_Lo =
B.buildUSubo(
S32,
S1, Sub2_Lo, DenomLo);
4656 auto Sub3_Mi =
B.buildUSube(
S32,
S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
4657 auto Sub3_Hi =
B.buildUSube(
S32,
S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
4658 auto Sub3 =
B.buildMergeLikeInstr(
S64, {Sub3_Lo, Sub3_Hi});
4664 auto Sel1 =
B.buildSelect(
4671 auto Sel2 =
B.buildSelect(
4682 switch (
MI.getOpcode()) {
4685 case AMDGPU::G_UDIV: {
4686 DstDivReg =
MI.getOperand(0).getReg();
4689 case AMDGPU::G_UREM: {
4690 DstRemReg =
MI.getOperand(0).getReg();
4693 case AMDGPU::G_UDIVREM: {
4694 DstDivReg =
MI.getOperand(0).getReg();
4695 DstRemReg =
MI.getOperand(1).getReg();
4702 const unsigned FirstSrcOpIdx =
MI.getNumExplicitDefs();
4703 Register Num =
MI.getOperand(FirstSrcOpIdx).getReg();
4704 Register Den =
MI.getOperand(FirstSrcOpIdx + 1).getReg();
4705 LLT Ty =
MRI.getType(
MI.getOperand(0).getReg());
4714 MI.eraseFromParent();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty != S32 && Ty != S64)

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();

  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);

  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
  switch (MI.getOpcode()) {
  case AMDGPU::G_SDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
  case AMDGPU::G_SREM: {
    DstRemReg = MI.getOperand(0).getReg();
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
  case AMDGPU::G_SDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);

  auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
  auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
  B.buildSub(DstDivReg, SignXor, Sign);

  auto Sign = LHSign.getReg(0);
  auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
  B.buildSub(DstRemReg, SignXor, Sign);

  MI.eraseFromParent();
  LLT ResTy = MRI.getType(Res);

  if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))

  if (CLHS->isExactlyValue(1.0)) {
    B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)

    MI.eraseFromParent();

  if (CLHS->isExactlyValue(-1.0)) {
    auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
    B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
        .addUse(FNeg.getReg(0))

    MI.eraseFromParent();

  if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})

  B.buildFMul(Res, LHS, RCP, Flags);

  MI.eraseFromParent();
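// The fast (approximate) f64 division path below seeds with amdgcn_rcp and
// applies two Newton-Raphson style refinement steps, fma(-y, r, 1) followed
// by r = fma(e, r, r), before a final residual correction on x * r.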
  LLT ResTy = MRI.getType(Res);

  if (!AllowInaccurateRcp)

  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);

  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})

  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);

  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);

  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();
  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(RHSExt.getReg(0))

  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(RDst.getReg(0))

  MI.eraseFromParent();
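// toggleSPDenormMode: on subtargets with S_DENORM_MODE the new
// single-precision denormal setting is combined with the current
// double-precision default and emitted directly; older subtargets fall back
// to writing the MODE register with S_SETREG_IMM32_B32.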
  unsigned SPDenormMode =

  if (ST.hasDenormModeInst()) {
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
        .addImm(NewDenormModeValue);

    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
        .addImm(SPDenormMode)
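// f32 fdiv lowering: scale the denominator and numerator with
// amdgcn_div_scale, take an approximate reciprocal, refine it with the
// Fma0..Fma4 chain (with single-precision denormals temporarily enabled when
// the current mode would flush them, saving/restoring a dynamic denormal mode
// via S_GETREG_B32/S_SETREG_B32), then combine with amdgcn_div_fmas and
// finish with amdgcn_div_fixup.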
  auto One = B.buildFConstant(S32, 1.0f);

  auto DenominatorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})

  auto NumeratorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                       .addUse(DenominatorScaled.getReg(0))

  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  const bool HasDynamicDenormals =

  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      B.buildInstr(AMDGPU::S_GETREG_B32)
          .addDef(SavedSPDenormMode)

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      assert(SavedSPDenormMode);
      B.buildInstr(AMDGPU::S_SETREG_B32)
          .addReg(SavedSPDenormMode)

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma1.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(NumeratorScaled.getReg(1))

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(Fmas.getReg(0))

  MI.eraseFromParent();
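// f64 fdiv follows the same div_scale / rcp / FMA refinement / div_fmas /
// div_fixup shape. The extra unmerge-and-compare of the operands' high dwords
// recomputes the div_fmas scale bit on subtargets where the div_scale
// condition output is unreliable; otherwise DivScale1's second result is used
// directly.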
  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
                 .addUse(DivScale0.getReg(0))

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  auto NumUnmerge = B.buildUnmerge(S32, LHS);
  auto DenUnmerge = B.buildUnmerge(S32, RHS);
  auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
  auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

      Scale1Unmerge.getReg(1));
      Scale0Unmerge.getReg(1));
  Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);

  Scale = DivScale1.getReg(1);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(Mul.getReg(0))

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
      .addUse(Fmas.getReg(0))

  MI.eraseFromParent();
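// G_FFREXP is lowered with amdgcn_frexp_mant / amdgcn_frexp_exp. For
// non-finite inputs (detected via fabs) the results are replaced with the
// original value as the mantissa and a zero exponent.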
  LLT Ty = MRI.getType(Res0);

  auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})

  auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})

  auto Fabs = B.buildFAbs(Ty, Val);

  auto Zero = B.buildConstant(InstrExpTy, 0);
  Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
  Mant = B.buildSelect(Ty, IsFinite, Mant, Val);

  B.buildCopy(Res0, Mant);
  B.buildSExtOrTrunc(Res1, Exp);

  MI.eraseFromParent();
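// amdgcn_fdiv_fast: if |RHS| exceeds 0x1p+96 the denominator is pre-scaled by
// 0x1p-32 so its reciprocal stays in range, and the quotient is rescaled by
// the same factor after the multiply.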
  auto Abs = B.buildFAbs(S32, RHS, Flags);

  auto C0 = B.buildFConstant(S32, 0x1p+96f);
  auto C1 = B.buildFConstant(S32, 0x1p-32f);
  auto C2 = B.buildFConstant(S32, 1.0f);

  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(Mul0.getReg(0))

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  unsigned Flags = MI.getFlags();

  auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
                  .addUse(Ext.getReg(0))

  B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
  MI.eraseFromParent();
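// f32 sqrt: inputs below 0x1p-96 are scaled up by 0x1p+32 first. One path
// takes the hardware sqrt and nudges it by +/-1 ulp based on the sign of the
// FMA residuals (SqrtVP/SqrtVS); the other refines an rsq-based estimate with
// a Goldschmidt-style FMA sequence. The result is scaled back down by
// 0x1p-16 when the input was scaled, and zero/inf inputs are passed through.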
  const unsigned Flags = MI.getFlags();

  MI.eraseFromParent();

  auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);

  auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
  auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
  auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);

      .addUse(SqrtX.getReg(0))

  auto NegOne = B.buildConstant(I32, -1);
  auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);

  auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
  auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);

  auto PosOne = B.buildConstant(I32, 1);
  auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);

  auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
  auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);

  auto Zero = B.buildFConstant(F32, 0.0f);

      B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);

      B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);

      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
  B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);

  auto Half = B.buildFConstant(F32, 0.5f);
  auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
  auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
  auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
  SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
  SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
  auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
  auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
  SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);

  auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);

  auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);

  SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);

  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);

  MI.eraseFromParent();
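// f64 sqrt: very small inputs are scaled with ldexp(x, 256), then an rsq seed
// is refined with paired FMA iterations on the half-reciprocal (SqrtH*) and
// the square-root estimate (SqrtS*), plus two residual corrections. The
// result is rescaled with ldexp(ret, -128) (half the exponent adjustment) and
// zero/inf inputs are passed through unchanged.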
  assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");

  unsigned Flags = MI.getFlags();

  auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);

  auto ZeroInt = B.buildConstant(S32, 0);

  auto ScaleUpFactor = B.buildConstant(S32, 256);
  auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
  auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);

      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));

  auto Half = B.buildFConstant(F64, 0.5);
  auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
  auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);

  auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
  auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);

  auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
  auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);

  auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
  auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);

  auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);

  auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
  auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);

  auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);

  auto ScaleDownFactor = B.buildConstant(S32, -128);
  auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
  SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);

  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);

  MI.eraseFromParent();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

  auto Flags = MI.getFlags();

  LLT Ty = MRI.getType(Dst);

  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})

  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags)
                          : B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);

    B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);

    B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
  MI.eraseFromParent();
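// Lane intrinsics (readlane/readfirstlane/writelane/permlane*): createLaneOp
// emits the intrinsic with however many extra operands the particular
// intrinsic takes. Sub-dword sources are any-extended to 32 bits and the
// result truncated back; wider types are unmerged into 32-bit pieces, one
// lane op is emitted per piece, and the pieces are re-merged.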
  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;

    auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);

    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_permlane64:
      return LaneOp.getReg(0);
    case Intrinsic::amdgcn_readlane:
      return LaneOp.addUse(Src1).getReg(0);
    case Intrinsic::amdgcn_writelane:
      return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      return LaneOp.addUse(Src1)

  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
    Src1 = MI.getOperand(3).getReg();
    if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
      Src2 = MI.getOperand(4).getReg();

  LLT Ty = MRI.getType(DstReg);

    Src0 = B.buildAnyExt(S32, Src0).getReg(0);

    if (IID == Intrinsic::amdgcn_writelane)

    Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
    B.buildTrunc(DstReg, LaneOpDst);
    MI.eraseFromParent();

    PartialResTy = EltTy;

  unsigned NumParts = Size / 32;

    Src1Parts = B.buildUnmerge(PartialResTy, Src1);

  if (IID == Intrinsic::amdgcn_writelane)
    Src2Parts = B.buildUnmerge(PartialResTy, Src2);

  for (unsigned i = 0; i < NumParts; ++i) {
    Src0 = Src0Parts.getReg(i);

    Src1 = Src1Parts.getReg(i);

    if (IID == Intrinsic::amdgcn_writelane)
      Src2 = Src2Parts.getReg(i);

    PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));

  B.buildMergeLikeInstr(DstReg, PartialRes);
  MI.eraseFromParent();
5555 LLT DstTy =
MRI.getType(DstReg);
5558 Register KernargPtrReg =
MRI.createGenericVirtualRegister(DstTy);
5564 B.buildPtrAdd(DstReg, KernargPtrReg,
B.buildConstant(IdxTy,
Offset).getReg(0));
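// amdgcn_make_buffer_rsrc: the 64-bit base pointer is unmerged into two
// dwords, the upper 16 bits of the high dword are replaced with the stride
// (shifted into place, folded to a constant when the stride is known), and
// the descriptor is rebuilt as {low, high-with-stride, num_records, flags}.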
5575 Register Pointer =
MI.getOperand(2).getReg();
5577 Register NumRecords =
MI.getOperand(4).getReg();
5582 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
5583 auto Unmerge =
B.buildUnmerge(
S32, Pointer);
5584 Register LowHalf = Unmerge.getReg(0);
5585 Register HighHalf = Unmerge.getReg(1);
5587 auto AndMask =
B.buildConstant(
S32, 0x0000ffff);
5588 auto Masked =
B.buildAnd(
S32, HighHalf, AndMask);
5591 std::optional<ValueAndVReg> StrideConst =
5593 if (!StrideConst || !StrideConst->Value.isZero()) {
5596 uint32_t StrideVal = StrideConst->Value.getZExtValue();
5597 uint32_t ShiftedStrideVal = StrideVal << 16;
5598 ShiftedStride =
B.buildConstant(
S32, ShiftedStrideVal);
5600 auto ExtStride =
B.buildAnyExt(
S32, Stride);
5601 auto ShiftConst =
B.buildConstant(
S32, 16);
5602 ShiftedStride =
B.buildShl(
S32, ExtStride, ShiftConst);
5604 NewHighHalf =
B.buildOr(
S32,
Masked, ShiftedStride);
5607 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
5608 MI.eraseFromParent();
5625 MI.eraseFromParent();
5633 std::optional<uint32_t> KnownSize =
5635 if (KnownSize.has_value())
5636 B.buildConstant(DstReg, *KnownSize);
5654 MI.eraseFromParent();
5661 unsigned AddrSpace)
const {
5663 auto Unmerge =
B.buildUnmerge(
LLT::scalar(32),
MI.getOperand(2).getReg());
5667 MI.eraseFromParent();
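// splitBufferOffsets: split a buffer offset into a register base and an
// immediate that fits the instruction's offset field; any overflow of the
// immediate is folded back into the base register, materializing a constant
// or an add as needed.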
std::pair<Register, unsigned>

  std::tie(BaseReg, ImmOffset) =

  if (MRI.getType(BaseReg).isPointer())
    BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);

  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;

  if (Overflow != 0) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);

      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);

    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::pair(BaseReg, ImmOffset);
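// handleD16VData: repack 16-bit store/image data for the subtarget's D16
// layout. Unpacked subtargets any-extend each s16 element into its own dword;
// packed layouts bitcast or unmerge the value into dwords and pad the
// register list with undef up to the expected width.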
5727 bool ImageStore)
const {
5730 LLT StoreVT =
MRI.getType(Reg);
5734 auto Unmerge =
B.buildUnmerge(
S16, Reg);
5737 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
5738 WideRegs.
push_back(
B.buildAnyExt(
S32, Unmerge.getReg(
I)).getReg(0));
5749 Reg =
B.buildBitcast(
S32, Reg).getReg(0);
5751 PackedRegs.
resize(2,
B.buildUndef(
S32).getReg(0));
5758 auto Unmerge =
B.buildUnmerge(
S16, Reg);
5759 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
5761 PackedRegs.
resize(6,
B.buildUndef(
S16).getReg(0));
5769 auto Unmerge =
B.buildUnmerge(
S32, Reg);
5770 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
5772 PackedRegs.
resize(4,
B.buildUndef(
S32).getReg(0));
5790 LLT Ty =
MRI->getType(VData);
5818 bool IsFormat)
const {
5820 LLT Ty =
MRI.getType(VData);
5822 const bool IsD16 = IsFormat && (EltTy.
getSizeInBits() == 16);
5835 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5838 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps;
5842 VIndex =
MI.getOperand(3).getReg();
5845 VIndex =
B.buildConstant(
S32, 0).getReg(0);
5848 Register VOffset =
MI.getOperand(3 + OpOffset).getReg();
5849 Register SOffset =
MI.getOperand(4 + OpOffset).getReg();
5853 Format =
MI.getOperand(5 + OpOffset).getImm();
5857 unsigned AuxiliaryData =
MI.getOperand(5 + OpOffset).getImm();
5863 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
5864 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
5865 }
else if (IsFormat) {
5866 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
5867 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
5871 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
5874 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
5877 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
5882 auto MIB =
B.buildInstr(Opc)
5893 MIB.addImm(AuxiliaryData)
5894 .addImm(HasVIndex ? -1 : 0)
5895 .addMemOperand(MMO);
5897 MI.eraseFromParent();
5903 unsigned ImmOffset,
unsigned Format,
5906 auto MIB =
B.buildInstr(Opc)
5917 MIB.addImm(AuxiliaryData)
5918 .addImm(HasVIndex ? -1 : 0)
5919 .addMemOperand(MMO);
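// Buffer loads mirror the store path: the opcode is chosen from the element
// size and whether the access is typed/format/byte/short, TFE loads allocate
// one extra status dword that is split off with an unmerge, sub-dword results
// are loaded into a 32-bit temporary and truncated, and unpacked D16 vector
// results are truncated per-dword and re-merged.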
5926 bool IsTyped)
const {
5936 assert(
MI.getNumExplicitDefs() == 1 ||
MI.getNumExplicitDefs() == 2);
5937 bool IsTFE =
MI.getNumExplicitDefs() == 2;
5939 StatusDst =
MI.getOperand(1).getReg();
5944 Register RSrc =
MI.getOperand(2 + OpOffset).getReg();
5947 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5950 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps + OpOffset;
5953 VIndex =
MI.getOperand(3 + OpOffset).getReg();
5956 VIndex =
B.buildConstant(
S32, 0).getReg(0);
5959 Register VOffset =
MI.getOperand(3 + OpOffset).getReg();
5960 Register SOffset =
MI.getOperand(4 + OpOffset).getReg();
5964 Format =
MI.getOperand(5 + OpOffset).getImm();
5968 unsigned AuxiliaryData =
MI.getOperand(5 + OpOffset).getImm();
5971 LLT Ty =
MRI.getType(Dst);
5976 Dst =
MI.getOperand(0).getReg();
5979 const bool IsD16 = IsFormat && (EltTy.
getSizeInBits() == 16);
5990 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
5991 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
5992 }
else if (IsFormat) {
5996 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
5998 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
5999 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6004 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6005 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6008 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6009 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6012 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6013 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6020 unsigned NumLoadDWords = NumValueDWords + 1;
6022 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(LoadTy);
6023 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6024 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6026 Register ExtDst =
B.getMRI()->createGenericVirtualRegister(
S32);
6027 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6028 B.buildTrunc(Dst, ExtDst);
6029 }
else if (NumValueDWords == 1) {
6030 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6033 for (
unsigned I = 0;
I != NumValueDWords; ++
I)
6034 LoadElts.
push_back(
B.getMRI()->createGenericVirtualRegister(
S32));
6036 B.buildUnmerge(LoadElts, LoadDstReg);
6038 B.buildMergeLikeInstr(Dst, LoadElts);
6042 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(
S32);
6043 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6044 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6045 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6046 B.buildTrunc(Dst, LoadDstReg);
6047 }
else if (Unpacked && IsD16 && Ty.
isVector()) {
6049 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6050 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6051 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6052 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6054 auto Unmerge =
B.buildUnmerge(
S32, LoadDstReg);
6056 for (
unsigned I = 0,
N = Unmerge->getNumOperands() - 1;
I !=
N; ++
I)
6057 Repack.
push_back(
B.buildTrunc(EltTy, Unmerge.getReg(
I)).getReg(0));
6058 B.buildMergeLikeInstr(Dst, Repack);
6061 AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6064 MI.eraseFromParent();
6070 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6071 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6072 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6073 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6074 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6075 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6076 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6077 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6078 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6079 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6080 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6081 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6082 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6083 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6084 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6085 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6086 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6087 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6088 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6089 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6090 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6091 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6092 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6093 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6094 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6095 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6096 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6097 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6098 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6099 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6100 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6101 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6102 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6103 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6104 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6105 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6106 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6107 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6108 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6109 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6110 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6111 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6112 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6113 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6114 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6115 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6116 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6117 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6118 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6119 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6120 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6121 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6122 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6123 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6124 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6125 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6126 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6127 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6128 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6129 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6130 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6131 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6132 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6133 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6134 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6135 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6136 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6137 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6138 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6139 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6140 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6141 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6142 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6143 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6144 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6145 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6146 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6147 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6148 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6149 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6150 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6151 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6152 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6161 const bool IsCmpSwap =
6162 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6163 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6164 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6165 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6176 CmpVal =
MI.getOperand(3).getReg();
6181 Register RSrc =
MI.getOperand(3 + OpOffset).getReg();
6182 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6185 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps;
6188 VIndex =
MI.getOperand(4 + OpOffset).getReg();
6191 VIndex =
B.buildConstant(
LLT::scalar(32), 0).getReg(0);
6194 Register VOffset =
MI.getOperand(4 + OpOffset).getReg();
6195 Register SOffset =
MI.getOperand(5 + OpOffset).getReg();
6196 unsigned AuxiliaryData =
MI.getOperand(6 + OpOffset).getImm();
6215 .addImm(AuxiliaryData)
6216 .addImm(HasVIndex ? -1 : 0)
6217 .addMemOperand(MMO);
6219 MI.eraseFromParent();
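// packImage16bitOpsToDwords: walk the image intrinsic's vaddr operands and,
// for A16/G16 operands, pair adjacent s16 values into <2 x s16> dwords
// (padding with undef at the end of a group or when the next operand is not a
// register); 32-bit operands outside the packed ranges are kept as-is.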
                                      bool IsA16, bool IsG16) {

  auto EndIdx = Intr->VAddrEnd;

  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {

    if ((I < Intr->GradientStart) ||
        (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
        (I >= Intr->CoordStart && !IsA16)) {
      if ((I < Intr->GradientStart) && IsA16 &&
          (B.getMRI()->getType(AddrReg) == S16)) {
        assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");

            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})

               "Bias needs to be converted to 16 bit in A16 mode");

        AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);

      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          !MI.getOperand(ArgOffset + I + 1).isReg()) {

            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})

                V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})

                                     int DimIdx, int NumVAddrs) {

  for (int I = 0; I != NumVAddrs; ++I) {

    if (SrcOp.isReg()) {

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {

    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));

  for (int I = 1; I != NumVAddrs; ++I) {

    MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
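// legalizeImageIntrinsic rewrites an image intrinsic into the
// G_AMDGPU_INTRIN_IMAGE_{LOAD,STORE}[_D16] pseudos: it normalizes the dmask,
// packs 16-bit address/gradient operands, decides between NSA, partial NSA
// (concatenating the trailing address registers), and a single packed vaddr
// vector, widens the result for TFE, and repacks D16 results afterwards.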
6331 const unsigned NumDefs =
MI.getNumExplicitDefs();
6332 const unsigned ArgOffset = NumDefs + 1;
6333 bool IsTFE = NumDefs == 2;
6351 VData =
MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
6352 Ty =
MRI->getType(VData);
6355 const bool IsAtomicPacked16Bit =
6356 (BaseOpcode->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6357 BaseOpcode->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6361 MRI->getType(
MI.getOperand(ArgOffset +
Intr->GradientStart).getReg());
6363 MRI->getType(
MI.getOperand(ArgOffset +
Intr->CoordStart).getReg());
6366 const bool IsA16 = AddrTy ==
S16;
6370 if (!BaseOpcode->
Atomic) {
6371 DMask =
MI.getOperand(ArgOffset +
Intr->DMaskIndex).getImm();
6374 }
else if (DMask != 0) {
6376 }
else if (!IsTFE && !BaseOpcode->
Store) {
6378 B.buildUndef(
MI.getOperand(0));
6379 MI.eraseFromParent();
6387 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6388 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6389 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6390 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6391 unsigned NewOpcode = LoadOpcode;
6392 if (BaseOpcode->
Store)
6393 NewOpcode = StoreOpcode;
6395 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
6398 MI.setDesc(
B.getTII().get(NewOpcode));
6402 if (IsTFE && DMask == 0) {
6405 MI.getOperand(ArgOffset +
Intr->DMaskIndex).setImm(DMask);
6408 if (BaseOpcode->
Atomic) {
6410 LLT Ty =
MRI->getType(VData0);
6413 if (Ty.
isVector() && !IsAtomicPacked16Bit)
6420 auto Concat =
B.buildBuildVector(PackedTy, {VData0, VData1});
6421 MI.getOperand(2).setReg(
Concat.getReg(0));
6422 MI.getOperand(3).setReg(AMDGPU::NoRegister);
6426 unsigned CorrectedNumVAddrs =
Intr->NumVAddrs;
6435 if (IsA16 && !ST.
hasA16()) {
6443 if (IsA16 || IsG16) {
6453 (PackedRegs.
size() <= NSAMaxSize || HasPartialNSA);
6454 const bool UsePartialNSA =
6455 UseNSA && HasPartialNSA && PackedRegs.
size() > NSAMaxSize;
6457 if (UsePartialNSA) {
6461 auto Concat =
B.buildConcatVectors(
6462 PackedAddrTy,
ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
6463 PackedRegs[NSAMaxSize - 1] =
Concat.getReg(0);
6464 PackedRegs.
resize(NSAMaxSize);
6465 }
else if (!UseNSA && PackedRegs.
size() > 1) {
6467 auto Concat =
B.buildConcatVectors(PackedAddrTy, PackedRegs);
6468 PackedRegs[0] =
Concat.getReg(0);
6472 const unsigned NumPacked = PackedRegs.
size();
6473 for (
unsigned I =
Intr->VAddrStart; I < Intr->VAddrEnd;
I++) {
6475 if (!
SrcOp.isReg()) {
6482 if (
I -
Intr->VAddrStart < NumPacked)
6483 SrcOp.setReg(PackedRegs[
I -
Intr->VAddrStart]);
6485 SrcOp.setReg(AMDGPU::NoRegister);
6504 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
6505 const bool UsePartialNSA =
6506 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
6508 if (UsePartialNSA) {
6510 ArgOffset +
Intr->VAddrStart + NSAMaxSize - 1,
6511 Intr->NumVAddrs - NSAMaxSize + 1);
6512 }
else if (!UseNSA &&
Intr->NumVAddrs > 1) {
6531 if (RepackedReg != VData) {
6532 MI.getOperand(1).setReg(RepackedReg);
6543 if (NumElts < DMaskLanes)
6546 if (NumElts > 4 || DMaskLanes > 4)
6556 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6557 const LLT AdjustedTy =
6580 unsigned RoundedElts = (AdjustedTy.
getSizeInBits() + 31) / 32;
6581 unsigned RoundedSize = 32 * RoundedElts;
6585 RegTy = !IsTFE && EltSize == 16 ?
V2S16 :
S32;
6590 if (!IsTFE && (RoundedTy == Ty || !Ty.
isVector()))
6596 B.setInsertPt(*
MI.getParent(), ++
MI.getIterator());
6600 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
6601 const int ResultNumRegs = LoadResultTy.
getSizeInBits() / 32;
6603 Register NewResultReg =
MRI->createGenericVirtualRegister(LoadResultTy);
6605 MI.getOperand(0).setReg(NewResultReg);
6613 Dst1Reg =
MI.getOperand(1).getReg();
6614 if (
MRI->getType(Dst1Reg) !=
S32)
6618 MI.removeOperand(1);
6622 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
6631 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
6633 if (ResultNumRegs == 1) {
6635 ResultRegs[0] = NewResultReg;
6638 for (
int I = 0;
I != NumDataRegs; ++
I)
6639 ResultRegs[
I] =
MRI->createGenericVirtualRegister(RegTy);
6640 B.buildUnmerge(ResultRegs, NewResultReg);
6645 ResultRegs.
resize(NumDataRegs);
6651 B.buildTrunc(DstReg, ResultRegs[0]);
6657 B.buildBitcast(DstReg, ResultRegs[0]);
6671 Reg =
B.buildBitcast(
V2S16, Reg).getReg(0);
6674 Reg =
B.buildTrunc(
S16, Reg).getReg(0);
6678 auto padWithUndef = [&](
LLT Ty,
int NumElts) {
6681 Register Undef =
B.buildUndef(Ty).getReg(0);
6682 for (
int I = 0;
I != NumElts; ++
I)
6687 LLT ResTy =
MRI->getType(ResultRegs[0]);
6689 padWithUndef(ResTy, NumElts - ResultRegs.
size());
6690 B.buildBuildVector(DstReg, ResultRegs);
6701 if (ResultRegs.
size() == 1) {
6702 NewResultReg = ResultRegs[0];
6703 }
else if (ResultRegs.
size() == 2) {
6705 NewResultReg =
B.buildConcatVectors(
V4S16, ResultRegs).getReg(0);
6711 if (
MRI->getType(DstReg).getNumElements() <
6712 MRI->getType(NewResultReg).getNumElements()) {
6713 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
6715 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
6720 padWithUndef(ResTy, RegsToCover - ResultRegs.
size());
6721 B.buildConcatVectors(DstReg, ResultRegs);
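// s_buffer_load legalization: 8- and 16-bit results use the UBYTE/USHORT
// pseudos with a 32-bit temporary destination that is truncated back,
// everything else becomes G_AMDGPU_S_BUFFER_LOAD, and a memory operand of the
// rounded-up size is attached to the rewritten instruction.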
6730 Register OrigDst =
MI.getOperand(0).getReg();
6732 LLT Ty =
B.getMRI()->getType(OrigDst);
6738 Opc =
Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
6739 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
6742 Dst =
B.getMRI()->createGenericVirtualRegister(
LLT::scalar(32));
6744 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
6753 B.setInsertPt(
B.getMBB(),
MI);
6758 B.setInsertPt(
B.getMBB(),
MI);
6764 MI.setDesc(
B.getTII().get(Opc));
6765 MI.removeOperand(1);
6768 const unsigned MemSize = (
Size + 7) / 8;
6769 const Align MemAlign =
B.getDataLayout().getABITypeAlign(
6776 MI.addMemOperand(MF, MMO);
6777 if (Dst != OrigDst) {
6778 MI.getOperand(0).setReg(Dst);
6779 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6780 B.buildTrunc(OrigDst, Dst);
6818 MI.eraseFromParent();
6828 BuildMI(*TrapBB, TrapBB->
end(),
DL,
B.getTII().get(AMDGPU::S_ENDPGM))
6830 BuildMI(BB, &
MI,
DL,
B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
6834 MI.eraseFromParent();
6843 Register SGPR01(AMDGPU::SGPR0_SGPR1);
6852 Register KernargPtrReg =
MRI.createGenericVirtualRegister(
6868 Register LoadAddr =
MRI.createGenericVirtualRegister(
6870 B.buildPtrAdd(LoadAddr, KernargPtrReg,
6873 Register Temp =
B.buildLoad(
S64, LoadAddr, *MMO).getReg(0);
6874 B.buildCopy(SGPR01, Temp);
6875 B.buildInstr(AMDGPU::S_TRAP)
6878 MI.eraseFromParent();
6889 B.buildCopy(SGPR01, LiveIn);
6890 B.buildInstr(AMDGPU::S_TRAP)
6894 MI.eraseFromParent();
6906 MI.eraseFromParent();
6910 B.buildInstr(AMDGPU::S_TRAP)
6912 MI.eraseFromParent();
6924 "debugtrap handler not supported",
6926 LLVMContext &Ctx =
B.getMF().getFunction().getContext();
6930 B.buildInstr(AMDGPU::S_TRAP)
6934 MI.eraseFromParent();
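// amdgcn_image_bvh_intersect_ray: the node pointer, ray extent, origin,
// direction and inverse direction are repacked into the vaddr layout the
// selected IMAGE_BVH*_INTERSECT_RAY opcode expects (A16 variants merge
// direction/inverse-direction halves into dwords), choosing between the NSA
// and packed-vector encodings, and the call is rewritten to
// G_AMDGPU_INTRIN_BVH_INTERSECT_RAY.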
6947 Register NodePtr =
MI.getOperand(2).getReg();
6948 Register RayExtent =
MI.getOperand(3).getReg();
6949 Register RayOrigin =
MI.getOperand(4).getReg();
6951 Register RayInvDir =
MI.getOperand(6).getReg();
6956 "intrinsic not supported on subtarget",
6958 B.getMF().getFunction().getContext().diagnose(BadIntrin);
6965 const bool IsA16 =
MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
6966 const bool Is64 =
MRI.getType(NodePtr).getSizeInBits() == 64;
6967 const unsigned NumVDataDwords = 4;
6968 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
6969 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
6973 const unsigned BaseOpcodes[2][2] = {
6974 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
6975 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
6976 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
6980 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
6981 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
6982 : AMDGPU::MIMGEncGfx10NSA,
6983 NumVDataDwords, NumVAddrDwords);
6987 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
6988 : AMDGPU::MIMGEncGfx10Default,
6989 NumVDataDwords, NumVAddrDwords);
6994 if (UseNSA && IsGFX11Plus) {
6996 auto Unmerge =
B.buildUnmerge({
S32,
S32,
S32}, Src);
6997 auto Merged =
B.buildMergeLikeInstr(
6998 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7004 packLanes(RayOrigin);
7007 auto UnmergeRayDir =
B.buildUnmerge({
S16,
S16,
S16}, RayDir);
7008 auto UnmergeRayInvDir =
B.buildUnmerge({
S16,
S16,
S16}, RayInvDir);
7009 auto MergedDir =
B.buildMergeLikeInstr(
7012 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(0),
7013 UnmergeRayDir.getReg(0)}))
7016 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(1),
7017 UnmergeRayDir.getReg(1)}))
7020 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(2),
7021 UnmergeRayDir.getReg(2)}))
7026 packLanes(RayInvDir);
7030 auto Unmerge =
B.buildUnmerge({
S32,
S32}, NodePtr);
7039 auto Unmerge =
B.buildUnmerge({
S32,
S32,
S32}, Src);
7045 packLanes(RayOrigin);
7047 auto UnmergeRayDir =
B.buildUnmerge({
S16,
S16,
S16}, RayDir);
7048 auto UnmergeRayInvDir =
B.buildUnmerge({
S16,
S16,
S16}, RayInvDir);
7052 B.buildMergeLikeInstr(R1,
7053 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7054 B.buildMergeLikeInstr(
7055 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7056 B.buildMergeLikeInstr(
7057 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7063 packLanes(RayInvDir);
7070 Register MergedOps =
B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
7075 auto MIB =
B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
7084 .addImm(IsA16 ? 1 : 0)
7087 MI.eraseFromParent();
7094 int RoundMode =
MI.getOperand(2).getImm();
7097 Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
7099 Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
7104 .addDef(
MI.getOperand(0).getReg())
7105 .addUse(
MI.getOperand(1).getReg());
7107 MI.eraseFromParent();
7117 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7118 MI.eraseFromParent();
7129 auto TTMP8 =
B.buildCopy(
S32,
Register(AMDGPU::TTMP8));
7130 auto LSB =
B.buildConstant(
S32, 25);
7131 auto Width =
B.buildConstant(
S32, 5);
7132 B.buildUbfx(DstReg, TTMP8, LSB, Width);
7133 MI.eraseFromParent();
7147 if (
MRI.getType(Src) !=
S64)
7151 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {
S32},
7155 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {
S32},
7158 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
7159 MI.eraseFromParent();
7167 if (
MRI.getType(Src) !=
S64)
7170 auto Unmerge =
B.buildUnmerge({
S32,
S32},
MI.getOperand(0));
7174 .addReg(Unmerge.getReg(0));
7178 .addReg(Unmerge.getReg(1));
7179 MI.eraseFromParent();
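// Structurizer control-flow intrinsics: amdgcn_if/amdgcn_else are replaced by
// SI_IF/SI_ELSE and amdgcn_loop by SI_LOOP, inserted at the associated
// conditional branch (swapping branch targets if the condition was negated),
// with the mask registers constrained to the wave mask register class.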
7189 auto IntrID = cast<GIntrinsic>(
MI).getIntrinsicID();
7191 case Intrinsic::amdgcn_if:
7192 case Intrinsic::amdgcn_else: {
7195 bool Negated =
false;
7207 std::swap(CondBrTarget, UncondBrTarget);
7209 B.setInsertPt(
B.getMBB(), BrCond->getIterator());
7210 if (IntrID == Intrinsic::amdgcn_if) {
7211 B.buildInstr(AMDGPU::SI_IF)
7214 .addMBB(UncondBrTarget);
7216 B.buildInstr(AMDGPU::SI_ELSE)
7219 .addMBB(UncondBrTarget);
7228 B.buildBr(*CondBrTarget);
7231 MRI.setRegClass(Def,
TRI->getWaveMaskRegClass());
7232 MRI.setRegClass(
Use,
TRI->getWaveMaskRegClass());
7233 MI.eraseFromParent();
7234 BrCond->eraseFromParent();
7240 case Intrinsic::amdgcn_loop: {
7243 bool Negated =
false;
7253 std::swap(CondBrTarget, UncondBrTarget);
7255 B.setInsertPt(
B.getMBB(), BrCond->getIterator());
7256 B.buildInstr(AMDGPU::SI_LOOP)
7258 .addMBB(UncondBrTarget);
7263 B.buildBr(*CondBrTarget);
7265 MI.eraseFromParent();
7266 BrCond->eraseFromParent();
7267 MRI.setRegClass(Reg,
TRI->getWaveMaskRegClass());
7273 case Intrinsic::amdgcn_addrspacecast_nonnull:
7275 case Intrinsic::amdgcn_make_buffer_rsrc:
7277 case Intrinsic::amdgcn_kernarg_segment_ptr:
7280 B.buildConstant(
MI.getOperand(0).getReg(), 0);
7281 MI.eraseFromParent();
7287 case Intrinsic::amdgcn_implicitarg_ptr:
7289 case Intrinsic::amdgcn_workitem_id_x:
7292 case Intrinsic::amdgcn_workitem_id_y:
7295 case Intrinsic::amdgcn_workitem_id_z:
7298 case Intrinsic::amdgcn_workgroup_id_x:
7301 case Intrinsic::amdgcn_workgroup_id_y:
7304 case Intrinsic::amdgcn_workgroup_id_z:
7307 case Intrinsic::amdgcn_wave_id:
7309 case Intrinsic::amdgcn_lds_kernel_id:
7312 case Intrinsic::amdgcn_dispatch_ptr:
7315 case Intrinsic::amdgcn_queue_ptr:
7318 case Intrinsic::amdgcn_implicit_buffer_ptr:
7321 case Intrinsic::amdgcn_dispatch_id:
7324 case Intrinsic::r600_read_ngroups_x:
7328 case Intrinsic::r600_read_ngroups_y:
7331 case Intrinsic::r600_read_ngroups_z:
7334 case Intrinsic::r600_read_local_size_x:
7337 case Intrinsic::r600_read_local_size_y:
7341 case Intrinsic::r600_read_local_size_z:
7343 case Intrinsic::r600_read_global_size_x:
7345 case Intrinsic::r600_read_global_size_y:
7347 case Intrinsic::r600_read_global_size_z:
7349 case Intrinsic::amdgcn_fdiv_fast:
7351 case Intrinsic::amdgcn_is_shared:
7353 case Intrinsic::amdgcn_is_private:
7355 case Intrinsic::amdgcn_wavefrontsize: {
7357 MI.eraseFromParent();
7360 case Intrinsic::amdgcn_s_buffer_load:
7362 case Intrinsic::amdgcn_raw_buffer_store:
7363 case Intrinsic::amdgcn_raw_ptr_buffer_store:
7364 case Intrinsic::amdgcn_struct_buffer_store:
7365 case Intrinsic::amdgcn_struct_ptr_buffer_store:
7367 case Intrinsic::amdgcn_raw_buffer_store_format:
7368 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
7369 case Intrinsic::amdgcn_struct_buffer_store_format:
7370 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
7372 case Intrinsic::amdgcn_raw_tbuffer_store:
7373 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
7374 case Intrinsic::amdgcn_struct_tbuffer_store:
7375 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
7377 case Intrinsic::amdgcn_raw_buffer_load:
7378 case Intrinsic::amdgcn_raw_ptr_buffer_load:
7379 case Intrinsic::amdgcn_raw_atomic_buffer_load:
7380 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
7381 case Intrinsic::amdgcn_struct_buffer_load:
7382 case Intrinsic::amdgcn_struct_ptr_buffer_load:
7383 case Intrinsic::amdgcn_struct_atomic_buffer_load:
7384 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
7386 case Intrinsic::amdgcn_raw_buffer_load_format:
7387 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
7388 case Intrinsic::amdgcn_struct_buffer_load_format:
7389 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
7391 case Intrinsic::amdgcn_raw_tbuffer_load:
7392 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
7393 case Intrinsic::amdgcn_struct_tbuffer_load:
7394 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
7396 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
7397 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
7398 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
7399 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
7400 case Intrinsic::amdgcn_raw_buffer_atomic_add:
7401 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
7402 case Intrinsic::amdgcn_struct_buffer_atomic_add:
7403 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
7404 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
7405 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
7406 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
7407 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
7408 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
7409 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
7410 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
7411 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
7412 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
7413 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
7414 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
7415 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
7416 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
7417 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
7418 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
7419 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
7420 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
7421 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
7422 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
7423 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
7424 case Intrinsic::amdgcn_raw_buffer_atomic_and:
7425 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
7426 case Intrinsic::amdgcn_struct_buffer_atomic_and:
7427 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
7428 case Intrinsic::amdgcn_raw_buffer_atomic_or:
7429 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
7430 case Intrinsic::amdgcn_struct_buffer_atomic_or:
7431 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
7432 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
7433 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
7434 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
7435 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
7436 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
7437 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
7438 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
7439 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
7440 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
7441 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
7442 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
7443 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
7444 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
7445 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
7446 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
7447 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
7448 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
7449 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
7450 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
7451 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
7452 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
7453 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
7454 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
7455 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
7456 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
7457 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
7458 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
7459 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
7461 case Intrinsic::amdgcn_rsq_clamp:
7463 case Intrinsic::amdgcn_image_bvh_intersect_ray:
7465 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
7466 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
7467 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
7468 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
7469 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
7470 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
7471 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
7472 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
7476 MI.getOperand(5).setReg(
B.buildAnyExt(
S32,
Index).getReg(0));
7479 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
7480 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
7481 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
7485 MI.getOperand(7).setReg(
B.buildAnyExt(
S32,
Index).getReg(0));
7488 case Intrinsic::amdgcn_fmed3: {
7494 MI.setDesc(
B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
7495 MI.removeOperand(1);
7499 case Intrinsic::amdgcn_readlane:
7500 case Intrinsic::amdgcn_writelane:
7501 case Intrinsic::amdgcn_readfirstlane:
7502 case Intrinsic::amdgcn_permlane16:
7503 case Intrinsic::amdgcn_permlanex16:
7504 case Intrinsic::amdgcn_permlane64:
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool isEntryFunction() const
bool isModuleEntryFunction() const
bool hasMadMacF32Insts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool has16BitInsts() const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasVOP3PInsts() const
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
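Worked values for these APFloat factories, using the IEEEsingle() semantics listed later in this index; the values in the comments are the standard IEEE-754 single-precision limits:
#include "llvm/ADT/APFloat.h"
using namespace llvm;

const fltSemantics &Sem = APFloat::IEEEsingle();
APFloat PosInf = APFloat::getInf(Sem);                        // +infinity
APFloat NegInf = APFloat::getInf(Sem, /*Negative=*/true);     // -infinity
APFloat MinNormal = APFloat::getSmallestNormalized(Sem);      // 2^-126 ~= 1.1755e-38
APFloat MaxFinite = APFloat::getLargest(Sem);                 // ~3.4028e+38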
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
@ ICMP_SLT
signed less than
@ FCMP_OLT
0 1 0 0 True if ordered and less than
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
@ ICMP_UGE
unsigned greater or equal
@ ICMP_SGT
signed greater than
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
This is the shared class of boolean and integer constants.
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
This class represents an Operation in the Expression.
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
bool hasArchitectedSGPRs() const
bool hasPrivEnabledTrap2NopBug() const
const SIInstrInfo * getInstrInfo() const override
bool hasScalarSubwordLoads() const
bool supportsGetDoorbellID() const
TrapHandlerAbi getTrapHandlerAbi() const
bool hasGFX10_AEncoding() const
const SITargetLowering * getTargetLowering() const override
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
bool isTrapHandlerEnabled() const
unsigned getNSAThreshold(const MachineFunction &MF) const
bool hasScalarSMulU64() const
bool hasNSAEncoding() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasScalarDwordx3Loads() const
Generation getGeneration() const
bool hasScalarAddSub64() const
bool hasUnpackedD16VMem() const
bool hasAddNoCarry() const
bool hasPartialNSAEncoding() const
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
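A minimal sketch of the notification protocol these two callbacks define, assuming an in-scope GISelChangeObserver &Observer, a MachineInstr &MI being rewritten, and a hypothetical replacement register NewDstReg:
// Sketch only: bracket any in-place mutation of MI with changing/changed so
// every registered observer can update its bookkeeping.
Observer.changingInstr(MI);              // about to mutate MI
MI.getOperand(0).setReg(NewDstReg);      // the mutation itself (hypothetical)
Observer.changedInstr(MI);               // mutation finished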
KnownBits getKnownBits(Register R)
Simple wrapper observer that takes several observers, and calls each one for each event.
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
static constexpr LLT float64()
Get a 64-bit IEEE double value.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
constexpr bool isPointerVector() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
constexpr ElementCount getElementCount() const
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
static constexpr LLT float16()
Get a 16-bit IEEE half value.
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr bool isPointerOrPointerVector() const
constexpr LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr LLT getScalarType() const
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
static constexpr LLT float32()
Get a 32-bit IEEE float value.
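A short sketch of how these LLT constructors and accessors fit together; the header path follows recent LLVM and may differ in older releases, and the concrete types are arbitrary examples:
#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;

void lltSketch() {
  const LLT S32 = LLT::scalar(32);                        // plain 32-bit scalar
  const LLT V2S16 = LLT::fixed_vector(2, 16);             // <2 x s16>
  const LLT FlatPtr = LLT::pointer(/*AddressSpace=*/0, /*SizeInBits=*/64);

  (void)V2S16.isVector();               // true
  (void)V2S16.getNumElements();         // 2
  (void)V2S16.getScalarSizeInBits();    // 16
  (void)V2S16.changeElementSize(32);    // <2 x s32>
  (void)V2S16.getScalarType();          // s16
  (void)FlatPtr.isPointer();            // true
  (void)FlatPtr.getAddressSpace();      // 0
  (void)S32.getSizeInBits();            // TypeSize of 32 bits
}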
This is an important class for using LLVM in a threaded context.
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
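A sketch of how these LegalizeRuleSet builder methods typically compose inside a LegalizerInfo constructor. The opcode and types are invented for illustration rather than taken from the AMDGPU rules, and the closing computeTables() call follows the pattern in-tree targets use, so treat it as an assumption if your LLVM version differs:
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
using namespace llvm;

struct ExampleLegalizerInfo : LegalizerInfo {
  ExampleLegalizerInfo() {
    const LLT S32 = LLT::scalar(32);
    const LLT S64 = LLT::scalar(64);
    const LLT V2S16 = LLT::fixed_vector(2, 16);

    getActionDefinitionsBuilder(TargetOpcode::G_ADD)
        .legalFor({S32, S64, V2S16})   // these types need no rewriting
        .clampScalar(0, S32, S64)      // force scalars into the [s32, s64] range
        .widenScalarToNextPow2(0)      // round odd widths up to a power of two
        .scalarize(0);                 // break any remaining vectors into scalars

    getLegacyLegalizerInfo().computeTables();
  }
};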
LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LegalizeResult lowerFMad(MachineInstr &MI)
GISelKnownBits * getKnownBits() const
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
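A minimal sketch of a custom hook that defers to one of the generic LegalizerHelper lowerings listed above; the wrapper name is hypothetical:
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"

// Sketch only: forward G_FMINNUM/G_FMAXNUM to the generic compare+select
// lowering and report whether the MachineFunction actually changed.
static bool exampleLowerMinNumMaxNum(llvm::LegalizerHelper &Helper,
                                     llvm::MachineInstr &MI) {
  return Helper.lowerFMinNumMaxNum(MI) == llvm::LegalizerHelper::Legalized;
}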
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
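A sketch of one common call shape: building an invariant, dereferenceable 8-byte MMO for a GOT entry. MF stands in for an in-scope MachineFunction, and the memory type and alignment are placeholders:
// Sketch only: allocate an MMO describing an invariant 64-bit GOT load.
llvm::MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
    llvm::MachinePointerInfo::getGOT(MF),
    llvm::MachineMemOperand::MOLoad | llvm::MachineMemOperand::MOInvariant |
        llvm::MachineMemOperand::MODereferenceable,
    llvm::LLT::scalar(64), llvm::Align(8));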
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
MutableArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
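A tiny sketch of how take_front and drop_front split a MutableArrayRef into non-owning views without copying:
#include "llvm/ADT/ArrayRef.h"

int Storage[4] = {0, 1, 2, 3};
llvm::MutableArrayRef<int> Regs(Storage);
llvm::MutableArrayRef<int> Lo = Regs.take_front(2);  // views {0, 1}
llvm::MutableArrayRef<int> Hi = Regs.drop_front(2);  // views {2, 3}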
Wrapper class representing virtual and physical registers.
constexpr bool isValid() const
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
static constexpr bool isPhysicalRegister(unsigned Reg)
Return true if the specified register number is in the physical register namespace.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool hasWorkGroupIDZ() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void truncate(size_type N)
Like resize, but requires that N is less than size().
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
The instances of the Type class are immutable: once they are created, they are never changed.
A Use represents the edge between a Value definition and its users.
StringRef getName() const
Return a constant reference to the value's name.
constexpr ScalarTy getFixedValue() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
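Illustrative only: the pointer LLTs that correspond to several of these address spaces. The buffer widths come from the descriptions above; the 64-bit global and 32-bit local widths follow the usual AMDGPU data layout, and the header that declares AMDGPUAS varies by LLVM version:
using namespace llvm;

const LLT GlobalPtr  = LLT::pointer(AMDGPUAS::GLOBAL_ADDRESS, 64);
const LLT LocalPtr   = LLT::pointer(AMDGPUAS::LOCAL_ADDRESS, 32);
const LLT Const32Ptr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32);
const LLT RsrcPtr    = LLT::pointer(AMDGPUAS::BUFFER_RESOURCE, 128);
const LLT FatPtr     = LLT::pointer(AMDGPUAS::BUFFER_FAT_POINTER, 160);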
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isGFX11Plus(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelKnownBits *KnownBits=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the ...
LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than the second type index.
LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than the second type index.
LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the...
LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size)
True if the total bitwidth of the specified type index is Size bits.
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
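A sketch of how a LegalityPredicate pairs with a LegalizeMutation in a fewerElementsIf rule, written as it would appear inside a LegalizerInfo constructor; the opcode, sizes, and the TooManyElts lambda are invented for the example:
using namespace llvm;
using namespace llvm::LegalizeMutations;

// Sketch only: if type index 0 is a vector with more than four elements,
// shrink it back down to <4 x s32>.
LegalityPredicate TooManyElts = [](const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return Ty.isVector() && Ty.getNumElements() > 4;
};

getActionDefinitionsBuilder(TargetOpcode::G_FADD)
    .legalFor({LLT::scalar(32), LLT::fixed_vector(4, 32)})
    .fewerElementsIf(TooManyElts, changeTo(0, LLT::fixed_vector(4, 32)));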
@ Implicit
Not emitted register (e.g. carry, or temporary result).
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
int popcount(T Value) noexcept
Count the number of set bits in a value.
const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT that fits in int64_t, returns it.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ Mul
Product of integers.
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
unsigned Log2(Align A)
Returns the log2 of the alignment.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
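Worked values for the bit-math and alignment helpers listed above (from llvm/Support/MathExtras.h and llvm/Support/Alignment.h); the numbers are chosen only to make each result obvious:
#include <cassert>
#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
using namespace llvm;

void mathHelperSketch() {
  assert(divideCeil(70u, 32u) == 3);               // ceil(70 / 32)
  assert(isPowerOf2_32(64) && !isPowerOf2_32(96));
  assert(NextPowerOf2(96) == 128);                 // strictly greater than 96
  assert(PowerOf2Ceil(96) == 128);                 // greater than or equal to 96
  assert(Log2_32_Ceil(96) == 7);                   // ceil(log2(96))
  assert(Log2(Align(16)) == 4);                    // log2 of the alignment value
  assert(commonAlignment(Align(16), /*Offset=*/8) == Align(8));
}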
std::function< bool(const LegalityQuery &)> LegalityPredicate
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
static constexpr uint64_t encode(Fields... Values)
MIMGBaseOpcode BaseOpcode
static const fltSemantics & IEEEsingle() LLVM_READNONE
static const fltSemantics & IEEEdouble() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
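A sketch of the kind of check these mode fields enable; MFI stands in for an in-scope SIMachineFunctionInfo pointer and is an assumption of the example:
// Sketch only: is the current function flushing f32 denormals to a
// sign-preserving zero?
bool FlushF32Denorms =
    MFI->getMode().FP32Denormals == llvm::DenormalMode::getPreserveSign();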