#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

    "amdgpu-global-isel-new-legality",
    cl::desc("Use GlobalISel desired legality, rather than try to use "
             "rules compatible with selection patterns"),
    const LLT Ty = Query.Types[TypeIdx];

           EltSize > 1 && EltSize < 32 &&

    const LLT Ty = Query.Types[TypeIdx];

    const LLT Ty = Query.Types[TypeIdx];

    const LLT Ty = Query.Types[TypeIdx];
    return std::pair(TypeIdx,

    const LLT Ty = Query.Types[TypeIdx];
    unsigned Pieces = (Size + 63) / 64;

    const LLT Ty = Query.Types[TypeIdx];
    const int NextMul32 = (Size + 31) / 32;
    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;

    const LLT Ty = Query.Types[TypeIdx];
    assert(EltSize == 32 || EltSize == 64);
    for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {

    const LLT Ty = Query.Types[TypeIdx];

    const LLT Ty = Query.Types[TypeIdx];

    const LLT QueryTy = Query.Types[TypeIdx];

    const LLT QueryTy = Query.Types[TypeIdx];

    const LLT QueryTy = Query.Types[TypeIdx];

    return EltSize == 16 || EltSize % 32 == 0;

    return EltSize == 32 || EltSize == 64 ||
           EltSize == 128 || EltSize == 256;

    LLT Ty = Query.Types[TypeIdx];

    const LLT QueryTy = Query.Types[TypeIdx];

    const LLT Ty = Query.Types[TypeIdx];
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
                                 bool IsLoad, bool IsAtomic) {
    return ST.enableFlatScratch() ? 128 : 32;
    return ST.useDS128() ? 128 : 64;
    return IsLoad ? 512 : 128;
    return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
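// Illustrative sketch (not part of the original file): a per-address-space
// size limit like the one computed above is typically consumed by a
// narrow-scalar rule in the builder chains below.  The helper name
// maxSizeForAddrSpace() and its parameter order are assumptions based on the
// surrounding fragments.
//
//   Actions.narrowScalarIf(
//       [=](const LegalityQuery &Query) {
//         const LLT Ty = Query.Types[0];
//         const unsigned AS = Query.Types[1].getAddressSpace();
//         return Ty.isScalar() &&
//                Ty.getSizeInBits() >
//                    maxSizeForAddrSpace(ST, AS, /*IsLoad=*/true,
//                                        /*IsAtomic=*/false);
//       },
//       changeTo(0, LLT::scalar(32)));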
    const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

    unsigned AS = Query.Types[1].getAddressSpace();

    if (IsLoad && MemSize < Size)
      MemSize = std::max(MemSize, Align);

                           AtomicOrdering::NotAtomic))

    if (!ST.hasDwordx3LoadStores())

    if (AlignBits < MemSize) {
                                   Align(AlignBits / 8)))

  return EltSize != 32 && EltSize != 64;

  if (Size != MemSizeInBits)

                            uint64_t AlignInBits, unsigned AddrSpace,

  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())

  if (AlignInBits < RoundedSize)

      RoundedSize, AddrSpace, Align(AlignInBits / 8),

  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)

                              Query.Types[1].getAddressSpace(), Opcode);
  const unsigned NumParts = PointerTy.getSizeInBits() / 32;

    Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
    std::array<Register, 4> VectorElems;
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    for (unsigned I = 0; I < NumParts; ++I)
      VectorElems[I] =
          B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
    B.buildMergeValues(MO, VectorElems);

  Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
  B.buildIntToPtr(MO, Scalar);

  const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
    for (unsigned I = 0; I < NumParts; ++I)
      PointerParts.push_back(Unmerged.getReg(I));
    return B.buildBuildVector(VectorTy, PointerParts).getReg(0);

  Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
  return B.buildBitcast(VectorTy, Scalar).getReg(0);
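// Illustrative sketch (assumption, not from this file): the two cast helpers
// above are inverses of each other.  Going toward vector form, a 128-bit
// buffer resource pointer is unmerged into four s32 pieces and rebuilt as a
// <4 x s32> with G_BUILD_VECTOR; going back, each lane is read with
// G_EXTRACT_VECTOR_ELT and the pieces are merged into the pointer-typed
// operand MO.  At the MIR level the round trip looks roughly like
//
//   %p0:_(s32), %p1:_(s32), %p2:_(s32), %p3:_(s32) = G_UNMERGE_VALUES %rsrc
//   %vec:_(<4 x s32>) = G_BUILD_VECTOR %p0, %p1, %p2, %p3
//   %rsrc2 = G_MERGE_VALUES of the extracted lanes
//
// while scalar-sized pointers take the ptrtoint/bitcast (and the reverse
// bitcast/inttoptr) fast path at the end of each helper.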
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {

  const LLT BufferStridedPtr =

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};

  const std::initializer_list<LLT> FPTypesBase = {

  const std::initializer_list<LLT> FPTypes16 = {

  const std::initializer_list<LLT> FPTypesPK16 = {

      .clampMaxNumElementsStrict(0, S16, 2)
      .clampMaxNumElementsStrict(0, S16, 2)
      .clampMaxNumElementsStrict(0, S16, 2)
      .clampMaxNumElementsStrict(0, S16, 2)
      .minScalarOrElt(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .widenScalarToNextMultipleOf(0, 32)
      .widenScalarToNextMultipleOf(0, 32);
      .minScalarOrElt(0, S32)

      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
      .clampMaxNumElements(0, S8, 2)

      {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})

       LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
      .clampScalar(0, S16, S64);

      {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
       G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})

       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

      .legalFor(FPTypesPK16)
      .clampScalar(0, S16, S64);
      .clampScalar(0, S32, S64);

      .clampScalar(0, S32, S64);

      .clampScalar(0, S32, S64)
      .clampScalar(1, S32, S32)

      .clampScalar(1, S32, S32)

    FMad.customFor({S32, S16});
    FMad.customFor({S32});
    FMad.customFor({S16});

  FRem.minScalar(0, S32)

      .clampMaxNumElements(0, S16, 2)

      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
      .clampScalar(0, S16, S64)

  getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)

  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})

  getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
      .clampScalar(0, S16, S64)

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S16, S64)
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S32, S64)
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S32, S64)

  getActionDefinitionsBuilder(G_PTR_ADD)
      .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
      .scalarSameSizeAs(1, 0);
  getActionDefinitionsBuilder(G_PTRMASK)
      .scalarSameSizeAs(1, 0)

  getActionDefinitionsBuilder(G_ICMP)
      .legalForCartesianProduct(
          {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
      .legalForCartesianProduct(
          {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});

      .widenScalarToNextPow2(1)
      .clampScalar(1, S32, S64)

  getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
      {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);

  if (ST.hasSALUFloatInsts())
    FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});

      .widenScalarToNextPow2(1)
      .clampScalar(1, S32, S64)

  auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)

  getActionDefinitionsBuilder(G_FPOWI)
      .clampScalar(0, MinScalarFPTy, S32)

  auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
  Log2Ops.customFor({S32});
  if (ST.has16BitInsts())
    Log2Ops.legalFor({S16});
    Log2Ops.customFor({S16});
  Log2Ops.scalarize(0)

      getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
  LogOps.customFor({S32, S16});
  LogOps.clampScalar(0, MinScalarFPTy, S32)

  getActionDefinitionsBuilder(G_CTPOP)
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(1, 32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32);

  if (ST.has16BitInsts())
    getActionDefinitionsBuilder(G_IS_FPCLASS)
        .legalForCartesianProduct({S1}, FPTypes16)
        .widenScalarToNextPow2(1)
    getActionDefinitionsBuilder(G_IS_FPCLASS)
        .legalForCartesianProduct({S1}, FPTypesBase)
        .lowerFor({S1, S16})
        .widenScalarToNextPow2(1)

  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
      .clampScalar(0, S32, S32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32)
      .widenScalarToNextPow2(1, 32)

  getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
      .clampScalar(0, S32, S32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32)
      .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
      .clampScalar(0, S32, S32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32)
      .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(0);
  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
        .clampMaxNumElementsStrict(0, S16, 2)
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
          .clampMaxNumElements(0, S16, 2)
          .widenScalarToNextPow2(0)
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
          .widenScalarToNextPow2(0)

    getActionDefinitionsBuilder(G_BSWAP)
        .widenScalarToNextPow2(0)
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .widenScalarToNextPow2(0)

  getActionDefinitionsBuilder(G_INTTOPTR)
      .legalForCartesianProduct(AddrSpaces64, {S64})
      .legalForCartesianProduct(AddrSpaces32, {S32})

  getActionDefinitionsBuilder(G_PTRTOINT)
      .legalForCartesianProduct(AddrSpaces64, {S64})
      .legalForCartesianProduct(AddrSpaces32, {S32})

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

    unsigned NumRegs = (MemSize + 31) / 32;
      if (!ST.hasDwordx3LoadStores())

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},
                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});

    Actions.unsupportedIf(
        typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));

    Actions.customIf(typeIs(1, Constant32Ptr));

          return !Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
          if (DstSize > MemSize)
          if (MemSize > MaxSize)

          return Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
          if (MemSize > MaxSize) {
            if (MaxSize % EltSize == 0) {

            unsigned NumPieces = MemSize / MaxSize;
            if (NumPieces == 1 || NumPieces >= NumElts ||
                NumElts % NumPieces != 0)
              return std::pair(0, EltTy);

            return std::pair(0, EltTy);

          return std::pair(0, EltTy);

        .widenScalarToNextPow2(0)
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                                  {S32, GlobalPtr, S16, 2 * 8},
                                                  {S32, LocalPtr, S8, 8},
                                                  {S32, LocalPtr, S16, 16},
                                                  {S32, PrivatePtr, S8, 8},
                                                  {S32, PrivatePtr, S16, 16},
                                                  {S32, ConstantPtr, S8, 8},
                                                  {S32, ConstantPtr, S16, 2 * 8}})

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});

  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)

  auto &Atomics = getActionDefinitionsBuilder(
      {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
       G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
       G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
       G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
      .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
                 {S64, GlobalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAddF32()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasLdsAtomicAddF64())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasAtomicDsPkAdd16Insts())
      Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});

  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});
  if (ST.hasFlatAtomicFaddF32Inst())
    Atomic.legalFor({{S32, FlatPtr}});

  if (ST.hasGFX90AInsts()) {

  if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
      ST.hasAtomicBufferGlobalPkAddF16Insts())
    Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
  if (ST.hasAtomicGlobalPkAddBF16Inst())
    Atomic.legalFor({{V2BF16, GlobalPtr}});
  if (ST.hasAtomicFlatPkAdd16Insts())
    Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});

  auto &AtomicFMinFMax =
      getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
          .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});

  if (ST.hasAtomicFMinFMaxF32GlobalInsts())
    AtomicFMinFMax.legalFor({{F32, GlobalPtr}, {F32, BufferFatPtr}});
  if (ST.hasAtomicFMinFMaxF64GlobalInsts())
    AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
  if (ST.hasAtomicFMinFMaxF32FlatInsts())
    AtomicFMinFMax.legalFor({F32, FlatPtr});
  if (ST.hasAtomicFMinFMaxF64FlatInsts())
    AtomicFMinFMax.legalFor({F64, FlatPtr});

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
      .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                  {S32, FlatPtr}, {S64, FlatPtr}})
      .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});

  getActionDefinitionsBuilder(G_SELECT)
                 LocalPtr, FlatPtr, PrivatePtr,
      .clampScalar(0, S16, S64)
      .clampMaxNumElements(0, S32, 2)
      .clampMaxNumElements(0, LocalPtr, 2)
      .clampMaxNumElements(0, PrivatePtr, 2)
      .widenScalarToNextPow2(0)
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
          .clampMaxNumElements(0, S16, 2);
      Shifts.legalFor({{S16, S16}});

    Shifts.widenScalarIf(
          const LLT AmountTy = Query.Types[1];
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})

    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const bool isLegalVecType =
          return (EltSize == 32 || EltSize == 64) &&

          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
        .clampScalar(EltTypeIdx, S32, S64)
        .clampScalar(VecTypeIdx, S32, S64)
        .clampScalar(IdxTypeIdx, S32, S32)
        .clampMaxNumElements(VecTypeIdx, S32, 32)

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    getActionDefinitionsBuilder(Op)
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
        .widenScalarToNextPow2(BigTyIdx, 32);
  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)

  if (ST.hasScalarPackInsts()) {
        .minScalarOrElt(0, S16)
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
      .clampMaxNumElements(0, S32, 32)
      .clampMaxNumElements(1, S16, 2)
      .clampMaxNumElements(0, S16, 64);

  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];

    auto &Builder = getActionDefinitionsBuilder(Op)
              const LLT BigTy = Query.Types[BigTyIdx];
        .widenScalarToNextPow2(LitTyIdx, 16)
        .clampScalar(LitTyIdx, S32, S512)
        .widenScalarToNextPow2(LitTyIdx, 32)
            [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
            [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
            const LLT Ty = Query.Types[LitTyIdx];

    Builder.widenScalarIf(
          const LLT Ty = Query.Types[BigTyIdx];
          const LLT &Ty = Query.Types[BigTyIdx];
          if (NewSizeInBits >= 256) {
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
                        .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
        .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
    SextInReg.lowerFor({{S32}, {S64}});

      .clampScalar(0, S32, S64)

  getActionDefinitionsBuilder({G_ROTR, G_ROTL})

  getActionDefinitionsBuilder(G_FSHR)
      .clampMaxNumElementsStrict(0, S16, 2)

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
        .clampMaxNumElementsStrict(0, S16, 2)
    getActionDefinitionsBuilder(G_FSHL)

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)

  getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
      .clampScalar(1, S32, S32)
      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(0)

  getActionDefinitionsBuilder(
       G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
       G_READ_REGISTER, G_WRITE_REGISTER,

  if (ST.hasIEEEMinMax()) {
    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
        .legalFor(FPTypesPK16)
        .clampMaxNumElements(0, S16, 2)
    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})

  getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                               G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                               G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})

  getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();

  getLegacyLegalizerInfo().computeTables();
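// Illustrative sketch (assumption, not from this file): after computeTables()
// the rules registered above are answered through LegalizerInfo::getAction().
// For instance, a 16-bit integer add on a subtarget without 16-bit
// instructions is widened before selection:
//
//   const LegalityQuery Query(TargetOpcode::G_ADD, {LLT::scalar(16)});
//   LegalizeActionStep Step = getAction(Query);
//   // Expect Step.Action == WidenScalar with Step.NewType == s32 on such a
//   // subtarget; with 16-bit instructions the same query comes back Legal.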
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP:
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
  case TargetOpcode::G_INSERT_VECTOR_ELT:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_GLOBAL_VALUE:
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_STORE:
  case TargetOpcode::G_FMAD:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FFREXP:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP10:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
  case TargetOpcode::G_STACKSAVE:
  case TargetOpcode::G_GET_FPENV:
  case TargetOpcode::G_SET_FPENV:
  case TargetOpcode::G_TRAP:
  case TargetOpcode::G_DEBUGTRAP:
  if (ST.hasApertureRegs()) {
                               ? AMDGPU::SRC_SHARED_BASE
                               : AMDGPU::SRC_PRIVATE_BASE;
    MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
    return B.buildUnmerge(S32, Dst).getReg(1);

  Register LoadAddr = MRI.createGenericVirtualRegister(

        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);

    Register KernargPtrReg = MRI.createGenericVirtualRegister(

    B.buildPtrAdd(LoadAddr, KernargPtrReg,
    return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);

  Register QueuePtr = MRI.createGenericVirtualRegister(

  B.buildPtrAdd(LoadAddr, QueuePtr,
                B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);

  switch (Def->getOpcode()) {
  case AMDGPU::G_FRAME_INDEX:
  case AMDGPU::G_GLOBAL_VALUE:
  case AMDGPU::G_BLOCK_ADDR:
  case AMDGPU::G_CONSTANT: {
    const ConstantInt *CI = Def->getOperand(1).getCImm();
  assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
         (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
                                     Intrinsic::amdgcn_addrspacecast_nonnull));

  Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
                                     : MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));

      B.buildExtract(Dst, Src, 0);
      MI.eraseFromParent();

    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();

    Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

    auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});

      B.buildCopy(Dst, BuildPtr);
      MI.eraseFromParent();

    auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
    auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
                              SegmentNull.getReg(0));

    B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

    MI.eraseFromParent();

    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();

    auto PtrLo = B.buildPtrToInt(S32, Src);
    auto HighAddr = B.buildConstant(S32, AddrHiVal);
    B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
    MI.eraseFromParent();

      MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
  Ctx.diagnose(InvalidAddrSpaceCast);

  MI.eraseFromParent();
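// Illustrative sketch (assumption): for the flat-to-segment direction the
// sequence built above is, in effect,
//
//   %lo  = G_EXTRACT %flat_ptr, 0            ; low 32 bits of the flat pointer
//   %cmp = G_ICMP ne %flat_ptr, <flat null>
//   %dst = G_SELECT %cmp, %lo, <segment null>
//
// and for the segment-to-flat direction the 32-bit segment pointer supplies
// the low half while the aperture value produced earlier in this function
// supplies the high half of the 64-bit flat pointer, again guarded by a
// null-pointer select.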
  LLT Ty = MRI.getType(Src);

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);

  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
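// Minimal sketch of the G_FCEIL expansion built just above (s64 source, as in
// the fragments):
//
//   trunc   = intrinsic_trunc(x)
//   ceil(x) = trunc + ((x > 0.0 && x != trunc) ? 1.0 : 0.0)
//
// which is what the Zero/One constants, the G_AND of the two compares, the
// select, and the final fadd implement.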
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
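// Minimal sketch of the G_FREM expansion above: the remainder is computed as
//
//   frem(x, y) = fma(-trunc(x / y), y, x)    // i.e. x - trunc(x / y) * y
//
// with the division, truncation, negation and fused multiply-add matching the
// Div, Trunc, Neg and buildFMA calls directly above.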
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
                     .addUse(Const0.getReg(0))
                     .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
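// Sketch of the bit layout this helper relies on (IEEE-754 binary64):
//
//   bit 63      : sign
//   bits 62..52 : 11-bit biased exponent (bias 1023)
//   bits 51..0  : 52-bit fraction
//
// The amdgcn.ubfe call extracts ExpBits bits starting at bit FractBits - 32 of
// the high dword, and subtracting 1023 removes the bias to give the unbiased
// exponent.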
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  const unsigned FractBits = 52;

  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);

  if (MRI.getType(Dst) == S64) {
    auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                        : B.buildUITOFP(S64, Unmerge.getReg(1));

    auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
    auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);

    B.buildFAdd(Dst, LdExp, CvtLo);
    MI.eraseFromParent();

  auto One = B.buildConstant(S32, 1);

    auto ThirtyOne = B.buildConstant(S32, 31);
    auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
    auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
    auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
    auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
                  .addUse(Unmerge.getReg(1));
    auto LS2 = B.buildSub(S32, LS, One);
    ShAmt = B.buildUMin(S32, LS2, MaxShAmt);

    ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
  B.buildFLdexp(Dst, FVal, Scale);
  MI.eraseFromParent();
  const LLT SrcLT = MRI.getType(Src);

  unsigned Flags = MI.getFlags();

  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);

    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);

    K0 = B.buildFConstant(
        S64, llvm::bit_cast<double>(UINT64_C(0x3df0000000000000)));
    K1 = B.buildFConstant(
        S64, llvm::bit_cast<double>(UINT64_C(0xc1f0000000000000)));

    K0 = B.buildFConstant(
        S32, llvm::bit_cast<float>(UINT32_C(0x2f800000)));
    K1 = B.buildFConstant(
        S32, llvm::bit_cast<float>(UINT32_C(0xcf800000)));

  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);

                     : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

    Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});

    B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),

    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
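// Sketch of the 64-bit float-to-integer split performed above.  The constants
// are K0 = 2^-32 and K1 = -2^32 in the source type
// (0x3df0000000000000 / 0xc1f0000000000000 as doubles,
//  0x2f800000 / 0xcf800000 as floats), so
//
//   T   = intrinsic_trunc(x)          (|T| for the signed case)
//   HiF = floor(T * 2^-32)
//   Hi  = fptoui(HiF)                 // upper 32 result bits
//   Lo  = fptoui(fma(HiF, -2^32, T))  // remaining low 32 bits
//   result = merge(Lo, Hi)
//
// and for G_FPTOSI the sign computed from the source is applied at the end
// with the xor/sub pair.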
  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  LLT VecTy = MRI.getType(Vec);

    auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
    auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
    B.buildIntToPtr(Dst, IntElt);

    MI.eraseFromParent();

  std::optional<ValueAndVReg> MaybeIdxVal =

  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

    auto Unmerge = B.buildUnmerge(EltTy, Vec);
    B.buildCopy(Dst, Unmerge.getReg(IdxVal));

  MI.eraseFromParent();

  LLT VecTy = MRI.getType(Vec);

    auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
    auto IntIns = B.buildPtrToInt(IntTy, Ins);
    auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
    B.buildIntToPtr(Dst, IntVecDest);
    MI.eraseFromParent();

  std::optional<ValueAndVReg> MaybeIdxVal =

  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

  if (IdxVal < NumElts) {
    for (unsigned i = 0; i < NumElts; ++i)
      SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
    B.buildUnmerge(SrcRegs, Vec);

    SrcRegs[IdxVal] = MI.getOperand(2).getReg();
    B.buildMergeLikeInstr(Dst, SrcRegs);

  MI.eraseFromParent();

  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
                  .addUse(MulVal.getReg(0))

    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

      Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;

  MI.eraseFromParent();
                                   unsigned GAFlags) const {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");

      B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  if (!B.getMRI()->getRegClassOrNull(PCReg))
    B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

    B.buildExtract(DstReg, PCReg, 0);

  Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
                        ? DstReg
                        : MRI.createGenericVirtualRegister(S32);

  if (!MRI.getRegClassOrNull(AddrLo))
    MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);

  B.buildInstr(AMDGPU::S_MOV_B32)

  if (RequiresHighHalf) {
           "Must provide a 64-bit pointer type!");

      MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_MOV_B32)

    if (!MRI.getRegClassOrNull(AddrDst))
      MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);

    B.buildMergeValues(AddrDst, {AddrLo, AddrHi});

    if (AddrDst != DstReg)
      B.buildCast(DstReg, AddrDst);
  } else if (AddrLo != DstReg) {
    B.buildCast(DstReg, AddrLo);
  LLT Ty = MRI.getType(DstReg);

      GV->getName() != "llvm.amdgcn.module.lds") {

          Fn, "local memory global used by non-kernel function",
          MI.getDebugLoc(),

      B.buildUndef(DstReg);
      MI.eraseFromParent();

    if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {

      auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
      B.buildIntToPtr(DstReg, Sz);
      MI.eraseFromParent();

                                  *cast<GlobalVariable>(GV)));
    MI.eraseFromParent();

    MI.eraseFromParent();

    MI.eraseFromParent();

    MI.eraseFromParent();

  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);

    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  LLT PtrTy = MRI.getType(PtrReg);

    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    MI.getOperand(1).setReg(Cast.getReg(0));

  if (MI.getOpcode() != AMDGPU::G_LOAD)

  LLT ValTy = MRI.getType(ValReg);

  if (WideMemSize == ValSize) {
    MI.setMemRefs(MF, {WideMMO});

  if (ValSize > WideMemSize)

    WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
    B.buildTrunc(ValReg, WideLoad).getReg(0);

      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildExtract(ValReg, WideLoad, 0);

      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildDeleteTrailingVectorElements(ValReg, WideLoad);

  MI.eraseFromParent();

  Register DataReg = MI.getOperand(0).getReg();
  LLT DataTy = MRI.getType(DataReg);

  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);

  Register PackedVal = B.buildBuildVector(VecTy, {NewVal, CmpVal}).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
      .setMemRefs(MI.memoperands());

  MI.eraseFromParent();

  case TargetOpcode::G_INTRINSIC: {
    case Intrinsic::amdgcn_frexp_mant:
  case TargetOpcode::G_FFREXP: {
  case TargetOpcode::G_FPEXT: {
std::pair<Register, Register>
                                     unsigned Flags) const {

  auto SmallestNormal = B.buildFConstant(
  auto IsLtSmallestNormal =

  auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
  auto One = B.buildFConstant(F32, 1.0);
      B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
  auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);

  return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};

  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();

    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
                    .addUse(Ext.getReg(0))
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();

    B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
    MI.eraseFromParent();

  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
                  .addUse(ScaledInput)

  auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
  auto Zero = B.buildFConstant(Ty, 0.0);
      B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
  B.buildFSub(Dst, Log2, ResultOffset, Flags);

  MI.eraseFromParent();

  auto FMul = B.buildFMul(Ty, X, Y, Flags);
  return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
  const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
  assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);

  unsigned Flags = MI.getFlags();
  const LLT Ty = MRI.getType(X);

      TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {

      auto PromoteSrc = B.buildFPExt(F32, X);
      B.buildFPTrunc(Dst, LogVal);

    MI.eraseFromParent();

      B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);

    const float c_log10 = 0x1.344134p-2f;
    const float cc_log10 = 0x1.09f79ep-26f;

    const float c_log = 0x1.62e42ep-1f;
    const float cc_log = 0x1.efa39ep-25f;

    auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
    auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);

    R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
    auto NegR = B.buildFNeg(Ty, R, Flags);
    auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
    auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
    R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);

    const float ch_log10 = 0x1.344000p-2f;
    const float ct_log10 = 0x1.3509f6p-18f;

    const float ch_log = 0x1.62e000p-1f;
    const float ct_log = 0x1.0bfbe8p-15f;

    auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
    auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);

    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto YH = B.buildAnd(Ty, Y, MaskConst);
    auto YT = B.buildFSub(Ty, Y, YH, Flags);
    auto YTCT = B.buildFMul(Ty, YT, CT, Flags);

        getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
    R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);

  const bool IsFiniteOnly =

  if (!IsFiniteOnly) {

    auto Fabs = B.buildFAbs(Ty, Y);

    R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);

    auto Zero = B.buildFConstant(Ty, 0.0);
        B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
    auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
    B.buildFSub(Dst, R, Shift, Flags);

    B.buildCopy(Dst, R);

  MI.eraseFromParent();
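// Sketch of the extended-precision step above: the identity used is
//
//   log10(x) = log2(x) * log10(2)      (and log(x) = log2(x) * ln(2) for G_FLOG)
//
// where the conversion constant is split into a head and a tail (c/cc on the
// FMA path, ch/ct otherwise) so that the product log2(x) * constant is formed
// with more than single-precision accuracy before the pieces are added back
// together.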
                                        unsigned Flags) const {
  const double Log2BaseInverted =

  LLT Ty = B.getMRI()->getType(Dst);

    auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})

    auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
    auto Zero = B.buildFConstant(Ty, 0.0);
        B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
    auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);

      B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
      auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
      B.buildFAdd(Dst, Mul, ResultOffset, Flags);

          ? B.buildFLog2(Ty, Src, Flags)
          : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})

  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);

  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
                    .addUse(Ext.getReg(0))
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();

    MI.eraseFromParent();

  auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
                              RangeCheckConst, Flags);

  auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);
  auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
  auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(AddInput.getReg(0))

  auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
  auto One = B.buildFConstant(Ty, 1.0);
  auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
  B.buildFMul(Dst, Exp2, ResultScale, Flags);
  MI.eraseFromParent();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Mul = B.buildFMul(Ty, X, Log2E, Flags);

          .addUse(Mul.getReg(0))
      B.buildFExp2(Dst, Mul.getReg(0), Flags);

  auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);

  auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
  auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
  auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);

  auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(ExpInput.getReg(0))

  auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
  auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
  B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);

  const unsigned Flags = MI.getFlags();

  LLT Ty = MRI.getType(Dst);

  const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;

    MI.eraseFromParent();

    auto Ext = B.buildFPExt(F32, X, Flags);
    B.buildFPTrunc(Dst, Lowered, Flags);
    MI.eraseFromParent();

    MI.eraseFromParent();

  const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;

    const float cc_exp = 0x1.4ae0bep-26f;
    const float c_exp10 = 0x1.a934f0p+1f;
    const float cc_exp10 = 0x1.2f346ep-24f;

    auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
    PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
    auto NegPH = B.buildFNeg(Ty, PH, Flags);
    auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);

    auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
    PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);

    const float ch_exp = 0x1.714000p+0f;
    const float cl_exp = 0x1.47652ap-12f;

    const float ch_exp10 = 0x1.a92000p+1f;
    const float cl_exp10 = 0x1.4f0978p-11f;

    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto XH = B.buildAnd(Ty, X, MaskConst);
    auto XL = B.buildFSub(Ty, X, XH, Flags);

    auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
    PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);

    auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
    auto XLCL = B.buildFMul(Ty, XL, CL, Flags);

        getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
    PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);

  auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);

  auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
  auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(A.getReg(0))
  auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);

  auto UnderflowCheckConst =
      B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);

  R = B.buildSelect(Ty, Underflow, Zero, R);

  auto OverflowCheckConst =
      B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);

  R = B.buildSelect(Ty, Overflow, Inf, R, Flags);

  B.buildCopy(Dst, R);
  MI.eraseFromParent();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Log = B.buildFLog2(F32, Src0, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Log.getReg(0))
    B.buildFExp2(Dst, Mul, Flags);
  } else if (Ty == F16) {

    auto Log = B.buildFLog2(F16, Src0, Flags);
    auto Ext0 = B.buildFPExt(F32, Log, Flags);
    auto Ext1 = B.buildFPExt(F32, Src1, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Ext0.getReg(0))
                   .addUse(Ext1.getReg(0))
    B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);

  MI.eraseFromParent();

    ModSrc = SrcFNeg->getOperand(1).getReg();
      ModSrc = SrcFAbs->getOperand(1).getReg();
    ModSrc = SrcFAbs->getOperand(1).getReg();

  Register OrigSrc = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
         "this should not have been custom lowered");

  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})

      B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));

    B.buildFMinNumIEEE(Min, Fract, Const, Flags);
    B.buildFMinNum(Min, Fract, Const, Flags);

    CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);

  auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

  MI.eraseFromParent();
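// Sketch of the G_FFLOOR expansion above (f64 path, as asserted earlier):
//
//   floor(x) = x - min(fract(x), 0x1.fffffffffffffp-1)
//
// where amdgcn.fract returns the fractional part, the clamp to the largest
// double below 1.0 (the 0x3fefffffffffffff constant) guards against fract
// producing 1.0 for inputs just under an integer, and NaN inputs bypass the
// clamp through the IsNan select.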
  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {

    Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
    Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);

  auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
  B.buildBitcast(Dst, Merge);

  MI.eraseFromParent();
                                        bool UsePartialMad64_32,
                                        bool SeparateOddAlignedProducts) const {

  auto getZero32 = [&]() -> Register {
      Zero32 = B.buildConstant(S32, 0).getReg(0);
  auto getZero64 = [&]() -> Register {
      Zero64 = B.buildConstant(S64, 0).getReg(0);

  for (unsigned i = 0; i < Src0.size(); ++i) {

      if (CarryIn.empty())

      bool HaveCarryOut = true;
      if (CarryIn.size() == 1) {
          LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);

          CarryAccum = getZero32();
          CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
          for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
                B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])

          LocalAccum = getZero32();
          HaveCarryOut = false;

          B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
      LocalAccum = Add.getReg(0);

  auto buildMadChain =
    assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
           (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));

    if (LocalAccum.size() == 1 &&
        (!UsePartialMad64_32 || !CarryIn.empty())) {
        unsigned j1 = DstIndex - j0;
        if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
        auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
          LocalAccum[0] = Mul.getReg(0);
          if (CarryIn.empty()) {
            LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
                B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
      } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));

    if (j0 <= DstIndex) {
      bool HaveSmallAccum = false;

      if (LocalAccum[0]) {
        if (LocalAccum.size() == 1) {
          Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
          HaveSmallAccum = true;
        } else if (LocalAccum[1]) {
          Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
          HaveSmallAccum = false;
          Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
          HaveSmallAccum = true;

        assert(LocalAccum.size() == 1 || !LocalAccum[1]);
        HaveSmallAccum = true;

        unsigned j1 = DstIndex - j0;
        if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
        auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
                                {Src0[j0], Src1[j1], Tmp});
        Tmp = Mad.getReg(0);
        if (!HaveSmallAccum)
          CarryOut.push_back(Mad.getReg(1));
        HaveSmallAccum = false;
      } while (j0 <= DstIndex);

      auto Unmerge = B.buildUnmerge(S32, Tmp);
      LocalAccum[0] = Unmerge.getReg(0);
      if (LocalAccum.size() > 1)
        LocalAccum[1] = Unmerge.getReg(1);
    for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
      Carry OddCarryIn = std::move(OddCarry);
      Carry EvenCarryIn = std::move(EvenCarry);

      if (2 * i < Accum.size()) {
        auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
        EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);

        if (!SeparateOddAlignedProducts) {
          auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
          OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);

          bool IsHighest = 2 * i >= Accum.size();
          OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);

            Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
              Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
              Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
          Accum[2 * i - 1] = Lo->getOperand(0).getReg();

            auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
                                   Lo->getOperand(1).getReg());
            Accum[2 * i] = Hi.getReg(0);
            SeparateOddCarry = Hi.getReg(1);

        if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
          EvenCarryIn.push_back(CarryOut);

        if (2 * i < Accum.size()) {
          if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
            OddCarry.push_back(CarryOut);

  assert(MI.getOpcode() == TargetOpcode::G_MUL);

  LLT Ty = MRI.getType(DstReg);

  unsigned NumParts = Size / 32;

  for (unsigned i = 0; i < NumParts; ++i) {

  B.buildUnmerge(Src0Parts, Src0);
  B.buildUnmerge(Src1Parts, Src1);

  buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
                SeparateOddAlignedProducts);

  B.buildMergeLikeInstr(DstReg, AccumRegs);
  MI.eraseFromParent();
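// Worked sketch of the limb decomposition driven above: a wide multiply is
// split into 32-bit parts, and partial products whose index sum i + j is the
// same land in the same accumulator word.  For a 64 x 64 -> 64 bit multiply
// with parts a0,a1 and b0,b1:
//
//   Accum[0] = lo(a0*b0)
//   Accum[1] = hi(a0*b0) + lo(a0*b1) + lo(a1*b0)   // higher products drop out
//
// buildMadChain folds each "multiply and accumulate into a 64-bit value" step
// into G_AMDGPU_MAD_U64_U32 (v_mad_u64_u32), which is why the accumulator is
// temporarily widened to s64 inside the chain.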
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
                        ? AMDGPU::G_AMDGPU_FFBH_U32
                        : AMDGPU::G_AMDGPU_FFBL_B32;
  auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
  MI.eraseFromParent();

  LLT SrcTy = MRI.getType(Src);

  auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
  auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
  auto Shift = B.buildShl(S32, Extend, ShiftAmt);
  auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
  B.buildTrunc(Dst, Ctlz);
  MI.eraseFromParent();
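// Sketch of the narrow G_CTLZ_ZERO_UNDEF trick above: for an N-bit source with
// N < 32, the value is any-extended to 32 bits and shifted left by 32 - N so
// its most significant bit lines up with bit 31; the 32-bit FFBH then counts
// exactly the same number of leading zeros, e.g.
//
//   ctlz_zero_undef(s16 x) == ffbh_u32(anyext(x) << 16)
//
// The undefined high bits introduced by the any-extend are shifted out, and
// only zeros are shifted in at the bottom, so they cannot affect the count.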
  if (MI.getOpcode() != TargetOpcode::G_XOR)

  return ConstVal && *ConstVal == -1;

  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))

    if (!MRI.hasOneNonDBGUse(NegatedCond))

    UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);

  if (Next == Parent->end()) {
    UncondBrTarget = &*NextMBB;
    if (Next->getOpcode() != AMDGPU::G_BR)

                                     *ArgRC, B.getDebugLoc(), ArgTy);

    const unsigned Mask = Arg->getMask();
    const unsigned Shift = llvm::countr_zero<unsigned>(Mask);

      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
    B.buildCopy(DstReg, LiveIn);

    Arg = &WorkGroupIDX;
    ArgRC = &AMDGPU::SReg_32RegClass;
    Arg = &WorkGroupIDY;
    ArgRC = &AMDGPU::SReg_32RegClass;
    Arg = &WorkGroupIDZ;
    ArgRC = &AMDGPU::SReg_32RegClass;

      B.buildConstant(DstReg, 0);

      B.buildUndef(DstReg);

  if (!Arg->isRegister() || !Arg->getRegister().isValid())

  MI.eraseFromParent();

  B.buildConstant(MI.getOperand(0).getReg(), C);
  MI.eraseFromParent();

    B.buildUndef(DstReg);
    MI.eraseFromParent();

  if (Arg->isMasked()) {

  MI.eraseFromParent();

  Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);

  return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);

                                            Align Alignment) const {
         "unexpected kernarg parameter type");

  B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
  MI.eraseFromParent();