#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;
47 "amdgpu-global-isel-new-legality",
48 cl::desc(
"Use GlobalISel desired legality, rather than try to use"
49 "rules compatible with selection patterns"),
const LLT Ty = Query.Types[TypeIdx];
EltSize > 1 && EltSize < 32 &&
const LLT Ty = Query.Types[TypeIdx];
const LLT Ty = Query.Types[TypeIdx];
const LLT Ty = Query.Types[TypeIdx];
return std::pair(TypeIdx,
const LLT Ty = Query.Types[TypeIdx];
unsigned Pieces = (Size + 63) / 64;
const LLT Ty = Query.Types[TypeIdx];
const int NextMul32 = (Size + 31) / 32;
const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
const LLT Ty = Query.Types[TypeIdx];
assert(EltSize == 32 || EltSize == 64);
for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
const LLT Ty = Query.Types[TypeIdx];
const LLT Ty = Query.Types[TypeIdx];
const LLT QueryTy = Query.Types[TypeIdx];
const LLT QueryTy = Query.Types[TypeIdx];
const LLT QueryTy = Query.Types[TypeIdx];
return EltSize == 16 || EltSize % 32 == 0;
return EltSize == 32 || EltSize == 64 ||
       EltSize == 128 || EltSize == 256;
LLT Ty = Query.Types[TypeIdx];
const LLT QueryTy = Query.Types[TypeIdx];
const LLT Ty = Query.Types[TypeIdx];
Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
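// The returns below appear to cap the memory access width per address space:
// roughly 32 bits for scratch unless flat scratch is enabled, 64 bits for LDS
// unless ds128 is in use, and wider accesses (up to 512-bit loads versus
// 128-bit stores) for global-like address spaces.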
bool IsLoad, bool IsAtomic) {
return ST.enableFlatScratch() ? 128 : 32;
return ST.useDS128() ? 128 : 64;
return IsLoad ? 512 : 128;
return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;

const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
unsigned AS = Query.Types[1].getAddressSpace();
if (IsLoad && MemSize < Size)
  MemSize = std::max(MemSize, Align);
    AtomicOrdering::NotAtomic))
if (!ST.hasDwordx3LoadStores())
if (AlignBits < MemSize) {
    Align(AlignBits / 8)))
return EltSize != 32 && EltSize != 64;
if (Size != MemSizeInBits)
uint64_t AlignInBits, unsigned AddrSpace,
if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
if (AlignInBits < RoundedSize)
    RoundedSize, AddrSpace, Align(AlignInBits / 8),
if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    Query.Types[1].getAddressSpace(), Opcode);
const unsigned NumParts = PointerTy.getSizeInBits() / 32;
Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
std::array<Register, 4> VectorElems;
B.setInsertPt(B.getMBB(), ++B.getInsertPt());
for (unsigned I = 0; I < NumParts; ++I)
  B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
B.buildMergeValues(MO, VectorElems);

Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
B.setInsertPt(B.getMBB(), ++B.getInsertPt());
auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
B.buildIntToPtr(MO, Scalar);

const unsigned NumParts = PointerTy.getSizeInBits() / 32;
auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
for (unsigned I = 0; I < NumParts; ++I)
return B.buildBuildVector(VectorTy, PointerParts).getReg(0);

Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
return B.buildBitcast(VectorTy, Scalar).getReg(0);
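// The fragments above repack wide pointers (likely buffer fat pointers)
// between their pointer form and an equivalent vector-of-S32 or scalar form
// so generic operations can be reused. What follows is the rule table that
// maps each generic opcode to legal/widen/clamp/custom actions depending on
// subtarget features.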
using namespace TargetOpcode;

auto GetAddrSpacePtr = [&TM](unsigned AS) {

const LLT BufferStridedPtr =
const LLT CodePtr = FlatPtr;

const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};

const std::initializer_list<LLT> FPTypesBase = {
const std::initializer_list<LLT> FPTypes16 = {
const std::initializer_list<LLT> FPTypesPK16 = {

    .clampMaxNumElementsStrict(0, S16, 2)
    .clampMaxNumElementsStrict(0, S16, 2)
    .clampMaxNumElementsStrict(0, S16, 2)
    .clampMaxNumElementsStrict(0, S16, 2)
    .minScalarOrElt(0, S16)
    .widenScalarToNextMultipleOf(0, 32)
    .widenScalarToNextMultipleOf(0, 32)
    .widenScalarToNextMultipleOf(0, 32);
    .minScalarOrElt(0, S32)
    {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
    .clampMaxNumElements(0, S8, 2)
    {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    {LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .clampScalar(0, S16, S64);
    {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
     G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
    G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
    .legalFor(FPTypesPK16)
    .clampScalar(0, S16, S64);
    .clampScalar(0, S32, S64);
    .clampScalar(0, S32, S64);
    .clampScalar(0, S32, S64)
    .clampScalar(1, S32, S32)
    .clampScalar(1, S32, S32)

FMad.customFor({S32, S16});
FMad.customFor({S32});
FMad.customFor({S16});

FRem.minScalar(0, S32)
    .clampMaxNumElements(0, S16, 2)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
    .clampScalar(0, S16, S64)

getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)

getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})

getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
    .clampScalar(0, S16, S64)

if (ST.has16BitInsts()) {
  getActionDefinitionsBuilder(
      {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
      .clampScalar(0, S16, S64)
  getActionDefinitionsBuilder(
      {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
      .clampScalar(0, S32, S64)
  getActionDefinitionsBuilder(
      {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
      .clampScalar(0, S32, S64)

getActionDefinitionsBuilder(G_PTR_ADD)
    .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
    .scalarSameSizeAs(1, 0);

getActionDefinitionsBuilder(G_PTRMASK)
    .scalarSameSizeAs(1, 0)

getActionDefinitionsBuilder(G_ICMP)
    .legalForCartesianProduct(
        {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
        {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
if (ST.has16BitInsts()) {
  CmpBuilder.legalFor({{S1, S16}});
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)

getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
    {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
if (ST.hasSALUFloatInsts())
  FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)

auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
if (ST.has16BitInsts())
  ExpOps.customFor({{S32}, {S16}});
  ExpOps.customFor({S32});
ExpOps.clampScalar(0, MinScalarFPTy, S32)

getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)

auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
Log2Ops.customFor({S32});
if (ST.has16BitInsts())
  Log2Ops.legalFor({S16});
  Log2Ops.customFor({S16});
Log2Ops.scalarize(0)

getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
LogOps.customFor({S32, S16});
LogOps.clampScalar(0, MinScalarFPTy, S32)

getActionDefinitionsBuilder(G_CTPOP)
    .clampScalar(0, S32, S32)
    .widenScalarToNextPow2(1, 32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32);
if (ST.has16BitInsts())
  getActionDefinitionsBuilder(G_IS_FPCLASS)
      .legalForCartesianProduct({S1}, FPTypes16)
      .widenScalarToNextPow2(1)
  getActionDefinitionsBuilder(G_IS_FPCLASS)
      .legalForCartesianProduct({S1}, FPTypesBase)
      .lowerFor({S1, S16})
      .widenScalarToNextPow2(1)

getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)

getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

getActionDefinitionsBuilder(G_BITREVERSE)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

if (ST.has16BitInsts()) {
  getActionDefinitionsBuilder(G_BSWAP)
      .clampMaxNumElementsStrict(0, S16, 2)
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .clampMaxNumElements(0, S16, 2)
        .widenScalarToNextPow2(0)
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .widenScalarToNextPow2(0)

  getActionDefinitionsBuilder(G_BSWAP)
      .widenScalarToNextPow2(0)
  getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
      .widenScalarToNextPow2(0)

getActionDefinitionsBuilder(G_INTTOPTR)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})

getActionDefinitionsBuilder(G_PTRTOINT)
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})

getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                  bool IsLoad) -> bool {
  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  unsigned NumRegs = (MemSize + 31) / 32;
  if (!ST.hasDwordx3LoadStores())

unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

for (unsigned Op : {G_LOAD, G_STORE}) {
  const bool IsStore = Op == G_STORE;
  auto &Actions = getActionDefinitionsBuilder(Op);

  Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                    {S64, GlobalPtr, S64, GlobalAlign32},
                                    {S32, GlobalPtr, S8, GlobalAlign8},
                                    {S32, GlobalPtr, S16, GlobalAlign16},
                                    {S32, LocalPtr, S32, 32},
                                    {S64, LocalPtr, S64, 32},
                                    {S32, LocalPtr, S8, 8},
                                    {S32, LocalPtr, S16, 16},
                                    {S32, PrivatePtr, S32, 32},
                                    {S32, PrivatePtr, S8, 8},
                                    {S32, PrivatePtr, S16, 16},
                                    {S32, ConstantPtr, S32, GlobalAlign32},
                                    {S64, ConstantPtr, S64, GlobalAlign32},
                                    {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
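  // Each legalForTypesWithMemDesc entry above is {result type, pointer type,
  // memory type, minimum alignment in bits}; the GlobalAlign* values become 0
  // when unaligned buffer access is enabled, which appears to mean no
  // alignment restriction for that entry.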
  Actions.unsupportedIf(
      typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));

  Actions.customIf(typeIs(1, Constant32Ptr));

  return !Query.Types[0].isVector() &&
         needToSplitMemOp(Query, Op == G_LOAD);
  [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
    if (DstSize > MemSize)
    if (MemSize > MaxSize)

  return Query.Types[0].isVector() &&
         needToSplitMemOp(Query, Op == G_LOAD);
  [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
    if (MemSize > MaxSize) {
      if (MaxSize % EltSize == 0) {
      unsigned NumPieces = MemSize / MaxSize;
      if (NumPieces == 1 || NumPieces >= NumElts ||
          NumElts % NumPieces != 0)
        return std::pair(0, EltTy);
      return std::pair(0, EltTy);
    return std::pair(0, EltTy);
  .widenScalarToNextPow2(0)

auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                               {S32, GlobalPtr, S16, 2 * 8},
                               {S32, LocalPtr, S8, 8},
                               {S32, LocalPtr, S16, 16},
                               {S32, PrivatePtr, S8, 8},
                               {S32, PrivatePtr, S16, 16},
                               {S32, ConstantPtr, S8, 8},
                               {S32, ConstantPtr, S16, 2 * 8}})
if (ST.hasFlatAddressSpace()) {
  ExtLoads.legalForTypesWithMemDesc(
      {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});

ExtLoads.customIf(typeIs(1, Constant32Ptr));
ExtLoads.clampScalar(0, S32, S32)
    .widenScalarToNextPow2(0)

auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
if (ST.hasFlatAddressSpace()) {
  Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});

auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
if (ST.hasLDSFPAtomicAddF32()) {
  Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
  if (ST.hasLdsAtomicAddF64())
    Atomic.legalFor({{S64, LocalPtr}});
  if (ST.hasAtomicDsPkAdd16Insts())
    Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});

if (ST.hasAtomicFaddInsts())
  Atomic.legalFor({{S32, GlobalPtr}});
if (ST.hasFlatAtomicFaddF32Inst())
  Atomic.legalFor({{S32, FlatPtr}});

if (ST.hasGFX90AInsts()) {

if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
    ST.hasAtomicBufferGlobalPkAddF16Insts())
  Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
if (ST.hasAtomicGlobalPkAddBF16Inst())
  Atomic.legalFor({{V2BF16, GlobalPtr}});
if (ST.hasAtomicFlatPkAdd16Insts())
  Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});

auto &AtomicFMinFMax =
    getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
        .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});

if (ST.hasAtomicFMinFMaxF32GlobalInsts())
  AtomicFMinFMax.legalFor({{F32, GlobalPtr}, {F32, BufferFatPtr}});
if (ST.hasAtomicFMinFMaxF64GlobalInsts())
  AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
if (ST.hasAtomicFMinFMaxF32FlatInsts())
  AtomicFMinFMax.legalFor({F32, FlatPtr});
if (ST.hasAtomicFMinFMaxF64FlatInsts())
  AtomicFMinFMax.legalFor({F64, FlatPtr});

getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
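// G_ATOMIC_CMPXCHG is legal only for LDS/region addresses here; the
// global/flat cases are marked custom and later rebuilt as
// G_AMDGPU_ATOMIC_CMPXCHG (see the expansion further down, which packs the
// new and compare values into a vector operand).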
getActionDefinitionsBuilder(G_SELECT)
    LocalPtr, FlatPtr, PrivatePtr,
    .clampScalar(0, S16, S64)
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .widenScalarToNextPow2(0)

auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
if (ST.has16BitInsts()) {
  if (ST.hasVOP3PInsts()) {
        .clampMaxNumElements(0, S16, 2);
    Shifts.legalFor({{S16, S16}});

  Shifts.widenScalarIf(
    const LLT AmountTy = Query.Types[1];
  Shifts.clampScalar(1, S32, S32);
  Shifts.widenScalarToNextPow2(0, 16);
  Shifts.clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})

  Shifts.clampScalar(1, S32, S32);
  Shifts.widenScalarToNextPow2(0, 32);
  Shifts.clampScalar(0, S32, S64);

  getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
Shifts.scalarize(0);
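// Shift legalization: with 16-bit instructions a 16-bit shift keeps a 16-bit
// amount operand; otherwise the amount is clamped to S32 and the shifted
// value to the S32..S64 range before vectors are scalarized.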
for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
  unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
  unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
  unsigned IdxTypeIdx = 2;

  getActionDefinitionsBuilder(Op)
    const LLT EltTy = Query.Types[EltTypeIdx];
    const LLT VecTy = Query.Types[VecTypeIdx];
    const LLT IdxTy = Query.Types[IdxTypeIdx];
    const bool isLegalVecType =
    return (EltSize == 32 || EltSize == 64) &&
    const LLT EltTy = Query.Types[EltTypeIdx];
    const LLT VecTy = Query.Types[VecTypeIdx];
    const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)

getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
  const LLT &EltTy = Query.Types[1].getElementType();
  return Query.Types[0] != EltTy;

for (unsigned Op : {G_EXTRACT, G_INSERT}) {
  unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
  unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

  getActionDefinitionsBuilder(Op)
    const LLT BigTy = Query.Types[BigTyIdx];
    const LLT BigTy = Query.Types[BigTyIdx];
    const LLT LitTy = Query.Types[LitTyIdx];
    const LLT BigTy = Query.Types[BigTyIdx];
    const LLT LitTy = Query.Types[LitTyIdx];
      .widenScalarToNextPow2(BigTyIdx, 32);

auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
if (ST.hasScalarPackInsts()) {
      .minScalarOrElt(0, S16)
  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
  BuildVector.customFor({V2S16, S16});
  BuildVector.minScalarOrElt(0, S32);
  getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)

getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2)
    .clampMaxNumElements(0, S16, 64);

getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
  unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
  unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

  auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
    const LLT Ty = Query.Types[TypeIdx];

  auto &Builder = getActionDefinitionsBuilder(Op)
    const LLT BigTy = Query.Types[BigTyIdx];
      .widenScalarToNextPow2(LitTyIdx, 16)
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, 32)
    [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
    [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },

  if (Op == G_MERGE_VALUES) {
    Builder.widenScalarIf(
      const LLT Ty = Query.Types[LitTyIdx];
    Builder.widenScalarIf(
      const LLT Ty = Query.Types[BigTyIdx];
      const LLT &Ty = Query.Types[BigTyIdx];
      if (NewSizeInBits >= 256) {
        if (RoundedTo < NewSizeInBits)
          NewSizeInBits = RoundedTo;
      return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

if (ST.hasVOP3PInsts()) {
  SextInReg.lowerFor({{V2S16}})
      .clampMaxNumElementsStrict(0, S16, 2);
} else if (ST.has16BitInsts()) {
  SextInReg.lowerFor({{S32}, {S64}, {S16}});
  SextInReg.lowerFor({{S32}, {S64}});
    .clampScalar(0, S32, S64)

getActionDefinitionsBuilder({G_ROTR, G_ROTL})

getActionDefinitionsBuilder(G_FSHR)
    .clampMaxNumElementsStrict(0, S16, 2)

if (ST.hasVOP3PInsts()) {
  getActionDefinitionsBuilder(G_FSHL)
      .clampMaxNumElementsStrict(0, S16, 2)
  getActionDefinitionsBuilder(G_FSHL)

getActionDefinitionsBuilder(G_READCYCLECOUNTER)

getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});

getActionDefinitionsBuilder(G_FENCE)

getActionDefinitionsBuilder({G_SMULO, G_UMULO})

getActionDefinitionsBuilder({G_SBFX, G_UBFX})
    .clampScalar(1, S32, S32)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)

getActionDefinitionsBuilder(
    G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
    G_READ_REGISTER, G_WRITE_REGISTER,

if (ST.hasIEEEMinMax()) {
  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
      .legalFor(FPTypesPK16)
      .clampMaxNumElements(0, S16, 2)
  getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})

getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();

getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                             G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                             G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})

getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();

getLegacyLegalizerInfo().computeTables();
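// computeTables() finalizes the legacy rule set; it has to run after every
// getActionDefinitionsBuilder() call above so the per-opcode tables reflect
// the complete set of rules.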
switch (MI.getOpcode()) {
case TargetOpcode::G_ADDRSPACE_CAST:
case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
case TargetOpcode::G_FCEIL:
case TargetOpcode::G_FREM:
case TargetOpcode::G_INTRINSIC_TRUNC:
case TargetOpcode::G_SITOFP:
case TargetOpcode::G_UITOFP:
case TargetOpcode::G_FPTOSI:
case TargetOpcode::G_FPTOUI:
case TargetOpcode::G_FMINNUM:
case TargetOpcode::G_FMAXNUM:
case TargetOpcode::G_FMINNUM_IEEE:
case TargetOpcode::G_FMAXNUM_IEEE:
case TargetOpcode::G_EXTRACT_VECTOR_ELT:
case TargetOpcode::G_INSERT_VECTOR_ELT:
case TargetOpcode::G_FSIN:
case TargetOpcode::G_FCOS:
case TargetOpcode::G_GLOBAL_VALUE:
case TargetOpcode::G_LOAD:
case TargetOpcode::G_SEXTLOAD:
case TargetOpcode::G_ZEXTLOAD:
case TargetOpcode::G_STORE:
case TargetOpcode::G_FMAD:
case TargetOpcode::G_FDIV:
case TargetOpcode::G_FFREXP:
case TargetOpcode::G_FSQRT:
case TargetOpcode::G_UDIV:
case TargetOpcode::G_UREM:
case TargetOpcode::G_UDIVREM:
case TargetOpcode::G_SDIV:
case TargetOpcode::G_SREM:
case TargetOpcode::G_SDIVREM:
case TargetOpcode::G_ATOMIC_CMPXCHG:
case TargetOpcode::G_FLOG2:
case TargetOpcode::G_FLOG:
case TargetOpcode::G_FLOG10:
case TargetOpcode::G_FEXP2:
case TargetOpcode::G_FEXP:
case TargetOpcode::G_FEXP10:
case TargetOpcode::G_FPOW:
case TargetOpcode::G_FFLOOR:
case TargetOpcode::G_BUILD_VECTOR:
case TargetOpcode::G_BUILD_VECTOR_TRUNC:
case TargetOpcode::G_MUL:
case TargetOpcode::G_CTLZ:
case TargetOpcode::G_CTTZ:
case TargetOpcode::G_CTLZ_ZERO_UNDEF:
case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
case TargetOpcode::G_STACKSAVE:
case TargetOpcode::G_GET_FPENV:
case TargetOpcode::G_SET_FPENV:
case TargetOpcode::G_TRAP:
case TargetOpcode::G_DEBUGTRAP:

if (ST.hasApertureRegs()) {
    ? AMDGPU::SRC_SHARED_BASE
    : AMDGPU::SRC_PRIVATE_BASE;
MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
return B.buildUnmerge(S32, Dst).getReg(1);
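// When the subtarget exposes aperture registers, the shared/private aperture
// base is read directly from SRC_SHARED_BASE / SRC_PRIVATE_BASE as a 64-bit
// SGPR value, and the high 32 bits (the aperture) are taken via the unmerge.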
Register LoadAddr = MRI.createGenericVirtualRegister(
ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
Register KernargPtrReg = MRI.createGenericVirtualRegister(
B.buildPtrAdd(LoadAddr, KernargPtrReg,
return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);

Register QueuePtr = MRI.createGenericVirtualRegister(
B.buildPtrAdd(LoadAddr, QueuePtr,
              B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);

switch (Def->getOpcode()) {
case AMDGPU::G_FRAME_INDEX:
case AMDGPU::G_GLOBAL_VALUE:
case AMDGPU::G_BLOCK_ADDR:
case AMDGPU::G_CONSTANT: {
  const ConstantInt *CI = Def->getOperand(1).getCImm();
  return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);

assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
       (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
            Intrinsic::amdgcn_addrspacecast_nonnull));

Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
                                   : MI.getOperand(1).getReg();
LLT DstTy = MRI.getType(Dst);
LLT SrcTy = MRI.getType(Src);

if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
  MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));

B.buildExtract(Dst, Src, 0);
MI.eraseFromParent();

unsigned NullVal = TM.getNullPointerValue(DestAS);
auto SegmentNull = B.buildConstant(DstTy, NullVal);
auto FlatNull = B.buildConstant(SrcTy, 0);
auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
MI.eraseFromParent();

auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
  return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);

castLocalOrPrivateToFlat(Dst);
MI.eraseFromParent();

Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
    SegmentNull.getReg(0));
B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
MI.eraseFromParent();

B.buildExtract(Dst, Src, 0);
MI.eraseFromParent();

auto PtrLo = B.buildPtrToInt(S32, Src);
auto HighAddr = B.buildConstant(S32, AddrHiVal);
B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
MI.eraseFromParent();

    MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
Ctx.diagnose(InvalidAddrSpaceCast);
MI.eraseFromParent();

LLT Ty = MRI.getType(Src);
auto C1 = B.buildFConstant(Ty, C1Val);
auto CopySign = B.buildFCopysign(Ty, C1, Src);
auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
auto C2 = B.buildFConstant(Ty, C2Val);
auto Fabs = B.buildFAbs(Ty, Src);
B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
MI.eraseFromParent();

auto Trunc = B.buildIntrinsicTrunc(S64, Src);
const auto Zero = B.buildFConstant(S64, 0.0);
const auto One = B.buildFConstant(S64, 1.0);
auto And = B.buildAnd(S1, Lt0, NeTrunc);
auto Add = B.buildSelect(S64, And, One, Zero);
B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
MI.eraseFromParent();

Register Src0Reg = MI.getOperand(1).getReg();
Register Src1Reg = MI.getOperand(2).getReg();
auto Flags = MI.getFlags();
LLT Ty = MRI.getType(DstReg);
auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
auto Neg = B.buildFNeg(Ty, Trunc, Flags);
B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
MI.eraseFromParent();
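// The block above lowers G_FREM as fma(-trunc(x/y), y, x), i.e.
// x - trunc(x / y) * y, carrying the instruction's FP flags through each
// intermediate operation.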
const unsigned FractBits = 52;
const unsigned ExpBits = 11;
auto Const0 = B.buildConstant(S32, FractBits - 32);
auto Const1 = B.buildConstant(S32, ExpBits);
auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
                   .addUse(Const0.getReg(0))
                   .addUse(Const1.getReg(0));
return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));

auto Unmerge = B.buildUnmerge({S32, S32}, Src);
const unsigned FractBits = 52;
const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
const auto Zero32 = B.buildConstant(S32, 0);
auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
auto Shr = B.buildAShr(S64, FractMask, Exp);
auto Not = B.buildNot(S64, Shr);
auto Tmp0 = B.buildAnd(S64, Src, Not);
auto FiftyOne = B.buildConstant(S32, FractBits - 1);
auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
MI.eraseFromParent();

auto Unmerge = B.buildUnmerge({S32, S32}, Src);
auto ThirtyTwo = B.buildConstant(S32, 32);
if (MRI.getType(Dst) == S64) {
  auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                      : B.buildUITOFP(S64, Unmerge.getReg(1));
  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
  auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();

auto One = B.buildConstant(S32, 1);
auto ThirtyOne = B.buildConstant(S32, 31);
auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
              .addUse(Unmerge.getReg(1));
auto LS2 = B.buildSub(S32, LS, One);
ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
auto Norm = B.buildShl(S64, Src, ShAmt);
auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
B.buildFLdexp(Dst, FVal, Scale);
MI.eraseFromParent();
const LLT SrcLT = MRI.getType(Src);
unsigned Flags = MI.getFlags();
auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
Trunc = B.buildFAbs(S32, Trunc, Flags);
K0 = B.buildFConstant(
    S64, llvm::bit_cast<double>(UINT64_C(0x3df0000000000000)));
K1 = B.buildFConstant(
    S64, llvm::bit_cast<double>(UINT64_C(0xc1f0000000000000)));
K0 = B.buildFConstant(
    S32, llvm::bit_cast<float>(UINT32_C(0x2f800000)));
K1 = B.buildFConstant(
    S32, llvm::bit_cast<float>(UINT32_C(0xcf800000)));
auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
    : B.buildFPTOUI(S32, FloorMul);
auto Lo = B.buildFPTOUI(S32, Fma);
Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
B.buildMergeLikeInstr(Dst, {Lo, Hi});
MI.eraseFromParent();
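// FP-to-64-bit-integer lowering: K0 and K1 are 2^-32 and -2^32 in the source
// type, so the code appears to form the high word from floor(trunc(x) * 2^-32)
// and the low word from fma(hi, -2^32, trunc(x)); the signed variant applies
// the broadcast sign word with the final xor/sub.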
const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                      MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

LLT VecTy = MRI.getType(Vec);
auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
B.buildIntToPtr(Dst, IntElt);
MI.eraseFromParent();

std::optional<ValueAndVReg> MaybeIdxVal =
const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
auto Unmerge = B.buildUnmerge(EltTy, Vec);
B.buildCopy(Dst, Unmerge.getReg(IdxVal));
MI.eraseFromParent();

LLT VecTy = MRI.getType(Vec);
auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
auto IntIns = B.buildPtrToInt(IntTy, Ins);
auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
B.buildIntToPtr(Dst, IntVecDest);
MI.eraseFromParent();

std::optional<ValueAndVReg> MaybeIdxVal =
const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
if (IdxVal < NumElts) {
  for (unsigned i = 0; i < NumElts; ++i)
    SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
  B.buildUnmerge(SrcRegs, Vec);
  SrcRegs[IdxVal] = MI.getOperand(2).getReg();
  B.buildMergeLikeInstr(Dst, SrcRegs);
MI.eraseFromParent();

LLT Ty = MRI.getType(DstReg);
unsigned Flags = MI.getFlags();
auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
              .addUse(MulVal.getReg(0))
TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
MI.eraseFromParent();

                         unsigned GAFlags) const {
assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
if (!B.getMRI()->getRegClassOrNull(PCReg))
  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
B.buildExtract(DstReg, PCReg, 0);

Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
    : MRI.createGenericVirtualRegister(S32);
if (!MRI.getRegClassOrNull(AddrLo))
  MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
B.buildInstr(AMDGPU::S_MOV_B32)

if (RequiresHighHalf) {
  "Must provide a 64-bit pointer type!");
  MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_MOV_B32)
  if (!MRI.getRegClassOrNull(AddrDst))
    MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
  B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
  if (AddrDst != DstReg)
    B.buildCast(DstReg, AddrDst);
} else if (AddrLo != DstReg) {
  B.buildCast(DstReg, AddrLo);

LLT Ty = MRI.getType(DstReg);
GV->getName() != "llvm.amdgcn.module.lds") {
    Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
B.buildUndef(DstReg);
MI.eraseFromParent();

if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
  auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
  B.buildIntToPtr(DstReg, Sz);
  MI.eraseFromParent();

    *cast<GlobalVariable>(GV)));
MI.eraseFromParent();
MI.eraseFromParent();
MI.eraseFromParent();
MI.eraseFromParent();

Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
B.buildExtract(DstReg, Load, 0);
B.buildLoad(DstReg, GOTAddr, *GOTMMO);
MI.eraseFromParent();

LLT PtrTy = MRI.getType(PtrReg);
auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
MI.getOperand(1).setReg(Cast.getReg(0));
if (MI.getOpcode() != AMDGPU::G_LOAD)
LLT ValTy = MRI.getType(ValReg);
if (WideMemSize == ValSize) {
  MI.setMemRefs(MF, {WideMMO});
if (ValSize > WideMemSize)
WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
B.buildTrunc(ValReg, WideLoad).getReg(0);
WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
B.buildExtract(ValReg, WideLoad, 0);
WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
MI.eraseFromParent();

Register DataReg = MI.getOperand(0).getReg();
LLT DataTy = MRI.getType(DataReg);
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
    "this should not have been custom lowered");
LLT ValTy = MRI.getType(CmpVal);
Register PackedVal = B.buildBuildVector(VecTy, {NewVal, CmpVal}).getReg(0);
B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .setMemRefs(MI.memoperands());
MI.eraseFromParent();
case TargetOpcode::G_INTRINSIC: {
case Intrinsic::amdgcn_frexp_mant:
case TargetOpcode::G_FFREXP: {
case TargetOpcode::G_FPEXT: {

std::pair<Register, Register>
                            unsigned Flags) const {
auto SmallestNormal = B.buildFConstant(
auto IsLtSmallestNormal =
auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
auto One = B.buildFConstant(F32, 1.0);
    B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};

LLT Ty = B.getMRI()->getType(Dst);
unsigned Flags = MI.getFlags();
auto Ext = B.buildFPExt(F32, Src, Flags);
auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
                .addUse(Ext.getReg(0))
B.buildFPTrunc(Dst, Log2, Flags);
MI.eraseFromParent();

B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
MI.eraseFromParent();

auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
                .addUse(ScaledInput)
auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
auto Zero = B.buildFConstant(Ty, 0.0);
    B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
B.buildFSub(Dst, Log2, ResultOffset, Flags);
MI.eraseFromParent();

auto FMul = B.buildFMul(Ty, X, Y, Flags);
return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);

const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
unsigned Flags = MI.getFlags();
const LLT Ty = MRI.getType(X);

    TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
  auto PromoteSrc = B.buildFPExt(F32, X);
  B.buildFPTrunc(Dst, LogVal);
  MI.eraseFromParent();

B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);

const float c_log10 = 0x1.344134p-2f;
const float cc_log10 = 0x1.09f79ep-26f;
const float c_log = 0x1.62e42ep-1f;
const float cc_log = 0x1.efa39ep-25f;

auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);

R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
auto NegR = B.buildFNeg(Ty, R, Flags);
auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);

const float ch_log10 = 0x1.344000p-2f;
const float ct_log10 = 0x1.3509f6p-18f;
const float ch_log = 0x1.62e000p-1f;
const float ct_log = 0x1.0bfbe8p-15f;

auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);

auto MaskConst = B.buildConstant(Ty, 0xfffff000);
auto YH = B.buildAnd(Ty, Y, MaskConst);
auto YT = B.buildFSub(Ty, Y, YH, Flags);
auto YTCT = B.buildFMul(Ty, YT, CT, Flags);

    getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);

const bool IsFiniteOnly =
if (!IsFiniteOnly) {
  auto Fabs = B.buildFAbs(Ty, Y);
  R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);

auto Zero = B.buildFConstant(Ty, 0.0);
    B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
B.buildFSub(Dst, R, Shift, Flags);
B.buildCopy(Dst, R);
MI.eraseFromParent();
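// The extended-precision log path above multiplies the hardware log2 result
// by ln(2) or log10(2) split into head/tail constants (c/cc and ch/ct) and
// folds the error terms back in with FMAs; inputs that were pre-scaled by
// 2^32 for denormals appear to get 32*ln(2) (or 32*log10(2)) subtracted at
// the end.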
                               unsigned Flags) const {
const double Log2BaseInverted =
LLT Ty = B.getMRI()->getType(Dst);

auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
auto Zero = B.buildFConstant(Ty, 0.0);
    B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
B.buildFAdd(Dst, Mul, ResultOffset, Flags);

    ? B.buildFLog2(Ty, Src, Flags)
    : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);

unsigned Flags = MI.getFlags();
LLT Ty = B.getMRI()->getType(Dst);
auto Ext = B.buildFPExt(F32, Src, Flags);
auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
                .addUse(Ext.getReg(0))
B.buildFPTrunc(Dst, Log2, Flags);
MI.eraseFromParent();
MI.eraseFromParent();

auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
    RangeCheckConst, Flags);
auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
auto Zero = B.buildFConstant(Ty, 0.0);
auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                .addUse(AddInput.getReg(0))
auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
auto One = B.buildFConstant(Ty, 1.0);
auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
B.buildFMul(Dst, Exp2, ResultScale, Flags);
MI.eraseFromParent();

LLT Ty = B.getMRI()->getType(Dst);
auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
    .addUse(Mul.getReg(0))
B.buildFExp2(Dst, Mul.getReg(0), Flags);

auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                .addUse(ExpInput.getReg(0))
auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);

const unsigned Flags = MI.getFlags();
LLT Ty = MRI.getType(Dst);
const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
MI.eraseFromParent();

auto Ext = B.buildFPExt(F32, X, Flags);
B.buildFPTrunc(Dst, Lowered, Flags);
MI.eraseFromParent();
MI.eraseFromParent();

const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;

const float cc_exp = 0x1.4ae0bep-26f;
const float c_exp10 = 0x1.a934f0p+1f;
const float cc_exp10 = 0x1.2f346ep-24f;

auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
auto NegPH = B.buildFNeg(Ty, PH, Flags);
auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);

const float ch_exp = 0x1.714000p+0f;
const float cl_exp = 0x1.47652ap-12f;
const float ch_exp10 = 0x1.a92000p+1f;
const float cl_exp10 = 0x1.4f0978p-11f;

auto MaskConst = B.buildConstant(Ty, 0xfffff000);
auto XH = B.buildAnd(Ty, X, MaskConst);
auto XL = B.buildFSub(Ty, X, XH, Flags);
auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
    getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);

auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                .addUse(A.getReg(0))
auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
auto UnderflowCheckConst =
    B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
auto Zero = B.buildFConstant(Ty, 0.0);
R = B.buildSelect(Ty, Underflow, Zero, R);
auto OverflowCheckConst =
    B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
B.buildCopy(Dst, R);
MI.eraseFromParent();
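// The exp/exp10 path above splits x*log2(e) (or x*log2(10)) into high/low
// parts, rounds the high part and reapplies it via ldexp around the hardware
// exp2, then clamps: results below the underflow constant become 0.0 and
// results above the overflow constant become +inf.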
unsigned Flags = MI.getFlags();
LLT Ty = B.getMRI()->getType(Dst);
auto Log = B.buildFLog2(F32, Src0, Flags);
auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
               .addUse(Log.getReg(0))
B.buildFExp2(Dst, Mul, Flags);
} else if (Ty == F16) {
  auto Log = B.buildFLog2(F16, Src0, Flags);
  auto Ext0 = B.buildFPExt(F32, Log, Flags);
  auto Ext1 = B.buildFPExt(F32, Src1, Flags);
  auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                 .addUse(Ext0.getReg(0))
                 .addUse(Ext1.getReg(0))
  B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
MI.eraseFromParent();

ModSrc = SrcFNeg->getOperand(1).getReg();
ModSrc = SrcFAbs->getOperand(1).getReg();
ModSrc = SrcFAbs->getOperand(1).getReg();

Register OrigSrc = MI.getOperand(1).getReg();
unsigned Flags = MI.getFlags();
    "this should not have been custom lowered");
auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
    B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
B.buildFMinNumIEEE(Min, Fract, Const, Flags);
B.buildFMinNum(Min, Fract, Const, Flags);
CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
MI.eraseFromParent();
if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
  Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
  Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
B.buildBitcast(Dst, Merge);
MI.eraseFromParent();

                                        bool UsePartialMad64_32,
                                        bool SeparateOddAlignedProducts) const {
auto getZero32 = [&]() -> Register {
  Zero32 = B.buildConstant(S32, 0).getReg(0);
auto getZero64 = [&]() -> Register {
  Zero64 = B.buildConstant(S64, 0).getReg(0);
for (unsigned i = 0; i < Src0.size(); ++i) {

if (CarryIn.empty())
bool HaveCarryOut = true;
if (CarryIn.size() == 1) {
  LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
CarryAccum = getZero32();
CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
  B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
LocalAccum = getZero32();
HaveCarryOut = false;
    B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
LocalAccum = Add.getReg(0);

auto buildMadChain =
assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
       (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));

if (LocalAccum.size() == 1 &&
    (!UsePartialMad64_32 || !CarryIn.empty())) {
  unsigned j1 = DstIndex - j0;
  if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
  auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
  LocalAccum[0] = Mul.getReg(0);
  if (CarryIn.empty()) {
    LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
    B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
} while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));

if (j0 <= DstIndex) {
  bool HaveSmallAccum = false;
  if (LocalAccum[0]) {
    if (LocalAccum.size() == 1) {
      Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
      HaveSmallAccum = true;
    } else if (LocalAccum[1]) {
      Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
      HaveSmallAccum = false;
      Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
      HaveSmallAccum = true;
    assert(LocalAccum.size() == 1 || !LocalAccum[1]);
    HaveSmallAccum = true;

  unsigned j1 = DstIndex - j0;
  if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
  auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
                          {Src0[j0], Src1[j1], Tmp});
  Tmp = Mad.getReg(0);
  if (!HaveSmallAccum)
    CarryOut.push_back(Mad.getReg(1));
  HaveSmallAccum = false;
} while (j0 <= DstIndex);

auto Unmerge = B.buildUnmerge(S32, Tmp);
LocalAccum[0] = Unmerge.getReg(0);
if (LocalAccum.size() > 1)
  LocalAccum[1] = Unmerge.getReg(1);

for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
  Carry OddCarryIn = std::move(OddCarry);
  Carry EvenCarryIn = std::move(EvenCarry);

  if (2 * i < Accum.size()) {
    auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
    EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);

  if (!SeparateOddAlignedProducts) {
    auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
    OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
    bool IsHighest = 2 * i >= Accum.size();
    OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
    Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
    Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
    Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
    Accum[2 * i - 1] = Lo->getOperand(0).getReg();
    auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
                           Lo->getOperand(1).getReg());
    Accum[2 * i] = Hi.getReg(0);
    SeparateOddCarry = Hi.getReg(1);

  if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
    EvenCarryIn.push_back(CarryOut);
  if (2 * i < Accum.size()) {
    if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
      OddCarry.push_back(CarryOut);

assert(MI.getOpcode() == TargetOpcode::G_MUL);
LLT Ty = MRI.getType(DstReg);
unsigned NumParts = Size / 32;
for (unsigned i = 0; i < NumParts; ++i) {
B.buildUnmerge(Src0Parts, Src0);
B.buildUnmerge(Src1Parts, Src1);
buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
              SeparateOddAlignedProducts);
B.buildMergeLikeInstr(DstReg, AccumRegs);
MI.eraseFromParent();
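// The wide-multiply lowering above decomposes G_MUL into 32-bit limbs
// (NumParts = Size / 32), accumulates partial products with a mix of 32-bit
// mul/add-with-carry and 64-bit G_AMDGPU_MAD_U64_U32 chains, and re-merges
// the accumulator registers into the destination.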
LLT DstTy = MRI.getType(Dst);
LLT SrcTy = MRI.getType(Src);
unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
                      ? AMDGPU::G_AMDGPU_FFBH_U32
                      : AMDGPU::G_AMDGPU_FFBL_B32;
auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
MI.eraseFromParent();

LLT SrcTy = MRI.getType(Src);
auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
auto Shift = B.buildShl(S32, Extend, ShiftAmt);
auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
B.buildTrunc(Dst, Ctlz);
MI.eraseFromParent();

if (MI.getOpcode() != TargetOpcode::G_XOR)
return ConstVal && *ConstVal == -1;

Register CondDef = MI.getOperand(0).getReg();
if (!MRI.hasOneNonDBGUse(CondDef))
if (!MRI.hasOneNonDBGUse(NegatedCond))
UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
if (Next == Parent->end()) {
  UncondBrTarget = &*NextMBB;
if (Next->getOpcode() != AMDGPU::G_BR)

    *ArgRC, B.getDebugLoc(), ArgTy);
const unsigned Mask = Arg->getMask();
const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
auto ShiftAmt = B.buildConstant(S32, Shift);
AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
B.buildCopy(DstReg, LiveIn);

Arg = &WorkGroupIDX;
ArgRC = &AMDGPU::SReg_32RegClass;
Arg = &WorkGroupIDY;
ArgRC = &AMDGPU::SReg_32RegClass;
Arg = &WorkGroupIDZ;
ArgRC = &AMDGPU::SReg_32RegClass;
B.buildConstant(DstReg, 0);
B.buildUndef(DstReg);
if (!Arg->isRegister() || !Arg->getRegister().isValid())
MI.eraseFromParent();

B.buildConstant(MI.getOperand(0).getReg(), C);
MI.eraseFromParent();
B.buildUndef(DstReg);
MI.eraseFromParent();
if (Arg->isMasked()) {
MI.eraseFromParent();

Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);

               Align Alignment) const {
    "unexpected kernarg parameter type");
B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
MI.eraseFromParent();
LLT DstTy = MRI.getType(Dst);