37#include "llvm/IR/IntrinsicsAMDGPU.h"
38#include "llvm/IR/IntrinsicsR600.h"
40#define DEBUG_TYPE "amdgpu-legalinfo"
50 "amdgpu-global-isel-new-legality",
51 cl::desc(
"Use GlobalISel desired legality, rather than try to use"
52 "rules compatible with selection patterns"),
67 unsigned Bits = Ty.getSizeInBits();
77 const LLT Ty = Query.Types[TypeIdx];
83 return Ty.getNumElements() % 2 != 0 &&
84 EltSize > 1 && EltSize < 32 &&
85 Ty.getSizeInBits() % 32 != 0;
91 const LLT Ty = Query.Types[TypeIdx];
98 const LLT Ty = Query.Types[TypeIdx];
100 return EltTy.
getSizeInBits() == 16 && Ty.getNumElements() > 2;
106 const LLT Ty = Query.Types[TypeIdx];
108 return std::pair(TypeIdx,
115 const LLT Ty = Query.Types[TypeIdx];
117 unsigned Size = Ty.getSizeInBits();
118 unsigned Pieces = (
Size + 63) / 64;
119 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
129 const LLT Ty = Query.Types[TypeIdx];
132 const int Size = Ty.getSizeInBits();
134 const int NextMul32 = (
Size + 31) / 32;
138 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
146 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
147 return std::make_pair(TypeIdx,
LLT::scalar(MemSize));
154 const LLT Ty = Query.Types[TypeIdx];
156 const unsigned EltSize = Ty.getElementType().getSizeInBits();
159 assert(EltSize == 32 || EltSize == 64);
164 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
168 return std::pair(TypeIdx,
183 const unsigned NumElems = Ty.getElementCount().getFixedValue();
188 const unsigned Size = Ty.getSizeInBits();
201 const LLT Ty = Query.Types[TypeIdx];
208 const LLT Ty = Query.Types[TypeIdx];
209 unsigned Size = Ty.getSizeInBits();
218 const LLT QueryTy = Query.Types[TypeIdx];
225 const LLT QueryTy = Query.Types[TypeIdx];
232 const LLT QueryTy = Query.Types[TypeIdx];
238 return ((ST.useRealTrue16Insts() &&
Size == 16) ||
Size % 32 == 0) &&
244 return EltSize == 16 || EltSize % 32 == 0;
248 const int EltSize = Ty.getElementType().getSizeInBits();
249 return EltSize == 32 || EltSize == 64 ||
250 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
251 EltSize == 128 || EltSize == 256;
280 LLT Ty = Query.Types[TypeIdx];
288 const LLT QueryTy = Query.Types[TypeIdx];
373 if (Ty.isPointerOrPointerVector())
374 Ty = Ty.changeElementType(
LLT::scalar(Ty.getScalarSizeInBits()));
378 (ST.useRealTrue16Insts() && Ty ==
S16) ||
393 const LLT Ty = Query.Types[TypeIdx];
394 return !Ty.
isVector() && Ty.getSizeInBits() > 32 &&
395 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
403 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
413 bool IsLoad,
bool IsAtomic) {
417 return ST.hasFlatScratchEnabled() ? 128 : 32;
419 return ST.useDS128() ? 128 : 64;
430 return IsLoad ? 512 : 128;
435 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
444 const bool IsLoad = Query.
Opcode != AMDGPU::G_STORE;
446 unsigned RegSize = Ty.getSizeInBits();
449 unsigned AS = Query.
Types[1].getAddressSpace();
456 if (Ty.isVector() && MemSize !=
RegSize)
463 if (IsLoad && MemSize <
Size)
464 MemSize = std::max(MemSize,
Align);
484 if (!ST.hasDwordx3LoadStores())
497 if (AlignBits < MemSize) {
500 Align(AlignBits / 8)))
530 const unsigned Size = Ty.getSizeInBits();
531 if (Ty.isPointerVector())
541 unsigned EltSize = Ty.getScalarSizeInBits();
542 return EltSize != 32 && EltSize != 64;
556 const unsigned Size = Ty.getSizeInBits();
557 if (
Size != MemSizeInBits)
558 return Size <= 32 && Ty.isVector();
564 return Ty.isVector() && (!MemTy.
isVector() || MemTy == Ty) &&
573 uint64_t AlignInBits,
unsigned AddrSpace,
583 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
594 if (AlignInBits < RoundedSize)
601 RoundedSize, AddrSpace,
Align(AlignInBits / 8),
613 Query.
Types[1].getAddressSpace(), Opcode);
633 const unsigned NumParts =
PointerTy.getSizeInBits() / 32;
637 std::array<Register, 4> VectorElems;
638 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
639 for (
unsigned I = 0;
I < NumParts; ++
I)
641 B.buildExtractVectorElementConstant(
S32, VectorReg,
I).getReg(0);
642 B.buildMergeValues(MO, VectorElems);
647 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
648 auto Scalar =
B.buildBitcast(ScalarTy, BitcastReg);
649 B.buildIntToPtr(MO, Scalar);
669 const unsigned NumParts =
PointerTy.getSizeInBits() / 32;
670 auto Unmerged =
B.buildUnmerge(
LLT::scalar(32), Pointer);
671 for (
unsigned I = 0;
I < NumParts; ++
I)
673 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
675 Register Scalar =
B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
676 return B.buildBitcast(VectorTy, Scalar).getReg(0);
695 auto GetAddrSpacePtr = [&TM](
unsigned AS) {
708 const LLT BufferStridedPtr =
711 const LLT CodePtr = FlatPtr;
713 const std::initializer_list<LLT> AddrSpaces64 = {
714 GlobalPtr, ConstantPtr, FlatPtr
717 const std::initializer_list<LLT> AddrSpaces32 = {
718 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
721 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
723 const std::initializer_list<LLT> FPTypesBase = {
727 const std::initializer_list<LLT> FPTypes16 = {
731 const std::initializer_list<LLT> FPTypesPK16 = {
735 const std::initializer_list<LLT> FPTypesPK16_64 = {
S32,
S64,
S16,
V2S16,
738 const LLT MinScalarFPTy = ST.has16BitInsts() ?
S16 :
S32;
761 if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) {
763 if (ST.hasPackedU64Ops()) {
766 .clampMaxNumElementsStrict(0,
S16, 2)
772 }
else if (ST.hasScalarAddSub64()) {
775 .clampMaxNumElementsStrict(0,
S16, 2)
783 .clampMaxNumElementsStrict(0,
S16, 2)
790 if (ST.hasScalarSMulU64()) {
793 .clampMaxNumElementsStrict(0,
S16, 2)
801 .clampMaxNumElementsStrict(0,
S16, 2)
811 .minScalarOrElt(0,
S16)
816 }
else if (ST.has16BitInsts()) {
850 .widenScalarToNextMultipleOf(0, 32)
860 if (ST.hasMad64_32())
865 if (ST.hasIntClamp()) {
888 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
898 if (ST.hasVOP3PInsts()) {
900 .clampMaxNumElements(0,
S8, 2)
921 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
933 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
940 .clampScalar(0,
S16,
S64);
973 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
974 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
981 if (ST.has16BitInsts()) {
982 if (ST.hasVOP3PInsts())
985 FPOpActions.legalFor({
S16});
987 TrigActions.customFor({
S16});
988 FDIVActions.customFor({
S16});
991 if (ST.hasPackedFP32Ops()) {
992 FPOpActions.legalFor({
V2S32});
993 FPOpActions.clampMaxNumElementsStrict(0,
S32, 2);
996 if (ST.hasPackedFP64Ops()) {
997 FPOpActions.legalFor({
V2S64});
998 FPOpActions.clampMaxNumElementsStrict(0,
S64, 2);
1001 if (ST.hasPackedFP64Ops()) {
1002 FPOpActions.legalFor({
V2S64});
1003 FPOpActions.clampMaxNumElementsStrict(0,
S64, 2);
1006 auto &MinNumMaxNumIeee =
1009 if (ST.hasVOP3PInsts()) {
1010 MinNumMaxNumIeee.legalFor(FPTypesPK16)
1012 .clampMaxNumElements(0,
S16, 2)
1013 .clampScalar(0,
S16,
S64)
1015 }
else if (ST.has16BitInsts()) {
1016 MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0,
S16,
S64).scalarize(0);
1018 MinNumMaxNumIeee.legalFor(FPTypesBase)
1019 .clampScalar(0,
S32,
S64)
1024 {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
1026 if (ST.hasPackedFP64Ops()) {
1027 MinNumMaxNum.customFor(FPTypesPK16_64)
1029 .clampMaxNumElements(0,
S16, 2)
1030 .clampMaxNumElements(0,
S64, 2)
1031 .clampScalar(0,
S16,
S64)
1033 }
else if (ST.hasVOP3PInsts()) {
1034 MinNumMaxNum.customFor(FPTypesPK16)
1036 .clampMaxNumElements(0,
S16, 2)
1037 .clampScalar(0,
S16,
S64)
1039 }
else if (ST.has16BitInsts()) {
1040 MinNumMaxNum.customFor(FPTypes16)
1041 .clampScalar(0,
S16,
S64)
1044 MinNumMaxNum.customFor(FPTypesBase)
1045 .clampScalar(0,
S32,
S64)
1049 if (ST.hasVOP3PInsts())
1066 .
legalFor(ST.hasPackedFP32Ops(), {V2S32})
1068 if (ST.hasPackedFP32Ops())
1072 if (ST.has16BitInsts()) {
1106 if (ST.hasFractBug()) {
1140 if (ST.hasCvtPkF16F32Inst()) {
1142 .clampMaxNumElements(0,
S16, 2);
1146 FPTruncActions.scalarize(0).lower();
1154 if (ST.has16BitInsts()) {
1168 if (ST.hasPackedFP32Ops())
1178 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1179 FMad.customFor({
S32,
S16});
1180 else if (ST.hasMadMacF32Insts())
1181 FMad.customFor({
S32});
1182 else if (ST.hasMadF16())
1183 FMad.customFor({
S16});
1188 if (ST.has16BitInsts()) {
1191 FRem.minScalar(0,
S32)
1200 .clampMaxNumElements(0,
S16, 2)
1219 if (ST.has16BitInsts())
1230 if (ST.has16BitInsts())
1243 .legalFor(ST.has16BitInsts(),{{S16, S16}})
1247 if (
ST.has16BitInsts())
1257 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1258 .clampScalar(0,
S16,
S64)
1262 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1268 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1272 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1273 .clampScalar(0,
S16,
S64)
1277 if (
ST.has16BitInsts()) {
1278 getActionDefinitionsBuilder(
1279 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1281 .clampScalar(0,
S16,
S64)
1284 getActionDefinitionsBuilder(
1285 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1287 .clampScalar(0,
S32,
S64)
1290 getActionDefinitionsBuilder(
1291 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1294 .clampScalar(0,
S32,
S64)
1298 getActionDefinitionsBuilder(G_PTR_ADD)
1304 getActionDefinitionsBuilder(G_PTRMASK)
1306 .scalarSameSizeAs(1, 0)
1310 getActionDefinitionsBuilder(G_ICMP)
1322 {
S1}, {
S32,
S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1323 .legalForCartesianProduct(
1324 {
S32}, {
S32,
S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1325 if (
ST.has16BitInsts()) {
1326 CmpBuilder.legalFor({{
S1,
S16}});
1337 {
S1},
ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1339 if (
ST.hasSALUFloatInsts())
1348 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1349 if (
ST.has16BitInsts())
1350 ExpOps.customFor({{
S32}, {
S16}});
1352 ExpOps.customFor({
S32});
1353 ExpOps.clampScalar(0, MinScalarFPTy,
S32)
1356 getActionDefinitionsBuilder(G_FPOWI)
1357 .clampScalar(0, MinScalarFPTy,
S32)
1360 getActionDefinitionsBuilder(G_FLOG2)
1361 .legalFor(
ST.has16BitInsts(), {S16})
1366 getActionDefinitionsBuilder(G_FEXP2)
1367 .legalFor(
ST.has16BitInsts(), {S16})
1373 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1375 LogOps.clampScalar(0, MinScalarFPTy,
S32)
1379 getActionDefinitionsBuilder(G_CTPOP)
1381 .clampScalar(0,
S32,
S32)
1382 .widenScalarToNextPow2(1, 32)
1383 .clampScalar(1,
S32,
S64)
1385 .widenScalarToNextPow2(0, 32);
1388 if (
ST.has16BitInsts())
1389 getActionDefinitionsBuilder(G_IS_FPCLASS)
1390 .legalForCartesianProduct({
S1}, FPTypes16)
1391 .widenScalarToNextPow2(1)
1395 getActionDefinitionsBuilder(G_IS_FPCLASS)
1396 .legalForCartesianProduct({
S1}, FPTypesBase)
1397 .lowerFor({
S1,
S16})
1398 .widenScalarToNextPow2(1)
1405 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1407 .clampScalar(0,
S32,
S32)
1408 .clampScalar(1,
S32,
S64)
1409 .widenScalarToNextPow2(0, 32)
1410 .widenScalarToNextPow2(1, 32)
1414 getActionDefinitionsBuilder(G_CTLZ_ZERO_POISON)
1417 .clampScalar(0,
S32,
S32)
1418 .clampScalar(1,
S32,
S64)
1420 .widenScalarToNextPow2(0, 32)
1421 .widenScalarToNextPow2(1, 32);
1423 getActionDefinitionsBuilder(G_CTTZ_ZERO_POISON)
1425 .clampScalar(0,
S32,
S32)
1426 .clampScalar(1,
S32,
S64)
1428 .widenScalarToNextPow2(0, 32)
1429 .widenScalarToNextPow2(1, 32);
1431 getActionDefinitionsBuilder(G_CTLS)
1434 .clampScalar(0,
S32,
S32)
1435 .clampScalar(1,
S32,
S32);
1439 getActionDefinitionsBuilder(G_BITREVERSE)
1441 .clampScalar(0,
S32,
S64)
1443 .widenScalarToNextPow2(0);
1445 if (
ST.has16BitInsts()) {
1446 getActionDefinitionsBuilder(G_BSWAP)
1448 .clampMaxNumElementsStrict(0,
S16, 2)
1451 .widenScalarToNextPow2(0)
1452 .clampScalar(0,
S16,
S32)
1455 if (
ST.hasVOP3PInsts()) {
1456 getActionDefinitionsBuilder(G_ABS)
1458 .clampMaxNumElements(0,
S16, 2)
1460 .widenScalarToNextPow2(0)
1463 if (
ST.hasMinMaxI64Insts()) {
1464 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1466 .clampMaxNumElements(0,
S16, 2)
1468 .widenScalarToNextPow2(0)
1472 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1474 .clampMaxNumElements(0,
S16, 2)
1476 .widenScalarToNextPow2(0)
1481 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1483 .widenScalarToNextPow2(0)
1490 getActionDefinitionsBuilder(G_BSWAP)
1495 .widenScalarToNextPow2(0)
1500 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1503 .widenScalarToNextPow2(0)
1508 getActionDefinitionsBuilder(G_INTTOPTR)
1510 .legalForCartesianProduct(AddrSpaces64, {
S64})
1511 .legalForCartesianProduct(AddrSpaces32, {
S32})
1524 getActionDefinitionsBuilder(G_PTRTOINT)
1526 .legalForCartesianProduct(AddrSpaces64, {
S64})
1527 .legalForCartesianProduct(AddrSpaces32, {
S32})
1540 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1544 const auto needToSplitMemOp = [=](
const LegalityQuery &Query,
1545 bool IsLoad) ->
bool {
1549 unsigned MemSize = Query.
MMODescrs[0].MemoryTy.getSizeInBits();
1563 unsigned NumRegs = (MemSize + 31) / 32;
1565 if (!
ST.hasDwordx3LoadStores())
1576 unsigned GlobalAlign32 =
ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1577 unsigned GlobalAlign16 =
ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1578 unsigned GlobalAlign8 =
ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1584 for (
unsigned Op : {G_LOAD, G_STORE}) {
1585 const bool IsStore =
Op == G_STORE;
1587 auto &Actions = getActionDefinitionsBuilder(
Op);
1590 Actions.legalForTypesWithMemDesc({{
S32, GlobalPtr,
S32, GlobalAlign32},
1593 {
S64, GlobalPtr,
S64, GlobalAlign32},
1596 {
S32, GlobalPtr,
S8, GlobalAlign8},
1597 {
S32, GlobalPtr,
S16, GlobalAlign16},
1599 {
S32, LocalPtr,
S32, 32},
1600 {
S64, LocalPtr,
S64, 32},
1602 {
S32, LocalPtr,
S8, 8},
1603 {
S32, LocalPtr,
S16, 16},
1606 {
S32, PrivatePtr,
S32, 32},
1607 {
S32, PrivatePtr,
S8, 8},
1608 {
S32, PrivatePtr,
S16, 16},
1611 {
S32, ConstantPtr,
S32, GlobalAlign32},
1614 {
S64, ConstantPtr,
S64, GlobalAlign32},
1615 {
V2S32, ConstantPtr,
V2S32, GlobalAlign32}});
1617 Actions.legalForTypesWithMemDesc(
ST.useRealTrue16Insts(),
1618 {{S16, GlobalPtr, S8, GlobalAlign8},
1619 {S16, GlobalPtr, S16, GlobalAlign16},
1620 {S16, LocalPtr, S8, 8},
1621 {S16, LocalPtr, S16, 16},
1622 {S16, PrivatePtr, S8, 8},
1623 {S16, PrivatePtr, S16, 16}});
1633 Actions.unsupportedIf(
1634 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1648 Actions.customIf(
typeIs(1, Constant32Ptr));
1674 return !Query.
Types[0].isVector() &&
1675 needToSplitMemOp(Query,
Op == G_LOAD);
1677 [=](
const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1682 unsigned MemSize = Query.
MMODescrs[0].MemoryTy.getSizeInBits();
1685 if (DstSize > MemSize)
1691 if (MemSize > MaxSize)
1699 return Query.
Types[0].isVector() &&
1700 needToSplitMemOp(Query,
Op == G_LOAD);
1702 [=](
const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1716 unsigned MemSize = Query.
MMODescrs[0].MemoryTy.getSizeInBits();
1717 if (MemSize > MaxSize) {
1721 if (MaxSize % EltSize == 0) {
1727 unsigned NumPieces = MemSize / MaxSize;
1731 if (NumPieces == 1 || NumPieces >= NumElts ||
1732 NumElts % NumPieces != 0)
1733 return std::pair(0, EltTy);
1741 return std::pair(0, EltTy);
1756 return std::pair(0, EltTy);
1761 .widenScalarToNextPow2(0)
1768 getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1769 .legalForTypesWithMemDesc({{
S32, GlobalPtr,
S8, 8},
1770 {
S32, GlobalPtr,
S16, 2 * 8},
1771 {
S32, LocalPtr,
S8, 8},
1772 {
S32, LocalPtr,
S16, 16},
1773 {
S32, PrivatePtr,
S8, 8},
1774 {
S32, PrivatePtr,
S16, 16},
1775 {
S32, ConstantPtr,
S8, 8},
1776 {
S32, ConstantPtr,
S16, 2 * 8}})
1777 .legalForTypesWithMemDesc(
ST.useRealTrue16Insts(),
1778 {{S16, GlobalPtr, S8, GlobalAlign8},
1779 {S16, LocalPtr, S8, GlobalAlign8},
1780 {S16, PrivatePtr, S8, GlobalAlign8},
1781 {S16, ConstantPtr, S8, GlobalAlign8}})
1786 if (
ST.hasFlatAddressSpace()) {
1787 ExtLoads.legalForTypesWithMemDesc(
1788 {{
S32, FlatPtr,
S8, 8}, {
S32, FlatPtr,
S16, 16}});
1790 ExtLoads.legalForTypesWithMemDesc(
ST.useRealTrue16Insts(),
1791 {{S16, FlatPtr, S8, GlobalAlign8}});
1799 ExtLoads.customIf(
typeIs(1, Constant32Ptr));
1801 ExtLoads.narrowScalarIf(
1808 ExtLoads.clampScalar(0,
S32,
S32)
1809 .widenScalarToNextPow2(0)
1812 auto &Atomics = getActionDefinitionsBuilder(
1813 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1814 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1815 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1816 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1817 .legalFor({{
S32, GlobalPtr}, {
S32, LocalPtr},
1818 {
S64, GlobalPtr}, {
S64, LocalPtr},
1819 {
S32, RegionPtr}, {
S64, RegionPtr}});
1820 if (
ST.hasFlatAddressSpace()) {
1821 Atomics.legalFor({{
S32, FlatPtr}, {
S64, FlatPtr}});
1825 getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
1826 .legalFor({{
S32, GlobalPtr}, {
S32, LocalPtr}, {
S32, RegionPtr}});
1827 if (
ST.hasFlatAddressSpace()) {
1828 Atomics32.legalFor({{
S32, FlatPtr}});
1832 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1833 if (
ST.hasLDSFPAtomicAddF32()) {
1834 Atomic.legalFor({{
S32, LocalPtr}, {
S32, RegionPtr}});
1835 if (
ST.hasLdsAtomicAddF64())
1836 Atomic.legalFor({{
S64, LocalPtr}});
1837 if (
ST.hasAtomicDsPkAdd16Insts())
1838 Atomic.legalFor({{
V2F16, LocalPtr}, {
V2BF16, LocalPtr}});
1840 if (
ST.hasAtomicFaddInsts())
1841 Atomic.legalFor({{
S32, GlobalPtr}});
1842 if (
ST.hasFlatAtomicFaddF32Inst())
1843 Atomic.legalFor({{
S32, FlatPtr}});
1845 if (
ST.hasGFX90AInsts() ||
ST.hasGFX1250Insts()) {
1856 if (
ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1857 ST.hasAtomicBufferGlobalPkAddF16Insts())
1858 Atomic.legalFor({{
V2F16, GlobalPtr}, {
V2F16, BufferFatPtr}});
1859 if (
ST.hasAtomicGlobalPkAddBF16Inst())
1860 Atomic.legalFor({{
V2BF16, GlobalPtr}});
1861 if (
ST.hasAtomicFlatPkAdd16Insts())
1862 Atomic.legalFor({{
V2F16, FlatPtr}, {
V2BF16, FlatPtr}});
1867 auto &AtomicFMinFMax =
1868 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1869 .legalFor({{
F32, LocalPtr}, {
F64, LocalPtr}});
1871 if (
ST.hasAtomicFMinFMaxF32GlobalInsts())
1872 AtomicFMinFMax.legalFor({{
F32, GlobalPtr},{
F32, BufferFatPtr}});
1873 if (
ST.hasAtomicFMinFMaxF64GlobalInsts())
1874 AtomicFMinFMax.legalFor({{
F64, GlobalPtr}, {
F64, BufferFatPtr}});
1875 if (
ST.hasAtomicFMinFMaxF32FlatInsts())
1876 AtomicFMinFMax.legalFor({
F32, FlatPtr});
1877 if (
ST.hasAtomicFMinFMaxF64FlatInsts())
1878 AtomicFMinFMax.legalFor({
F64, FlatPtr});
1882 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1883 .customFor({{
S32, GlobalPtr}, {
S64, GlobalPtr},
1884 {
S32, FlatPtr}, {
S64, FlatPtr}})
1885 .legalFor({{
S32, LocalPtr}, {
S64, LocalPtr},
1886 {
S32, RegionPtr}, {
S64, RegionPtr}});
1890 getActionDefinitionsBuilder(G_SELECT)
1892 LocalPtr, FlatPtr, PrivatePtr,
1896 .clampScalar(0,
S16,
S64)
1900 .clampMaxNumElements(0,
S32, 2)
1901 .clampMaxNumElements(0, LocalPtr, 2)
1902 .clampMaxNumElements(0, PrivatePtr, 2)
1904 .widenScalarToNextPow2(0)
1909 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1911 if (
ST.has16BitInsts()) {
1912 if (
ST.hasVOP3PInsts()) {
1914 .clampMaxNumElements(0,
S16, 2);
1916 Shifts.legalFor({{
S16,
S16}});
1919 Shifts.widenScalarIf(
1924 const LLT AmountTy = Query.
Types[1];
1929 Shifts.clampScalar(1,
S32,
S32);
1930 Shifts.widenScalarToNextPow2(0, 16);
1931 Shifts.clampScalar(0,
S16,
S64);
1933 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1941 Shifts.clampScalar(1,
S32,
S32);
1942 Shifts.widenScalarToNextPow2(0, 32);
1943 Shifts.clampScalar(0,
S32,
S64);
1945 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1950 Shifts.scalarize(0);
1952 for (
unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1953 unsigned VecTypeIdx =
Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1954 unsigned EltTypeIdx =
Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1955 unsigned IdxTypeIdx = 2;
1957 getActionDefinitionsBuilder(
Op)
1959 const LLT EltTy = Query.
Types[EltTypeIdx];
1960 const LLT VecTy = Query.
Types[VecTypeIdx];
1961 const LLT IdxTy = Query.
Types[IdxTypeIdx];
1963 const bool isLegalVecType =
1973 return (EltSize == 32 || EltSize == 64) &&
1989 const LLT EltTy = Query.
Types[EltTypeIdx];
1990 const LLT VecTy = Query.
Types[VecTypeIdx];
1994 const unsigned TargetEltSize =
1995 DstEltSize % 64 == 0 ? 64 : 32;
1996 return std::pair(VecTypeIdx,
2000 .clampScalar(EltTypeIdx,
S32,
S64)
2001 .clampScalar(VecTypeIdx,
S32,
S64)
2002 .clampScalar(IdxTypeIdx,
S32,
S32)
2003 .clampMaxNumElements(VecTypeIdx,
S32, 32)
2012 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
2014 const LLT &EltTy = Query.
Types[1].getElementType();
2015 return Query.
Types[0] != EltTy;
2018 for (
unsigned Op : {G_EXTRACT, G_INSERT}) {
2019 unsigned BigTyIdx =
Op == G_EXTRACT ? 1 : 0;
2020 unsigned LitTyIdx =
Op == G_EXTRACT ? 0 : 1;
2021 getActionDefinitionsBuilder(
Op)
2024 const LLT BigTy = Query.
Types[BigTyIdx];
2030 const LLT LitTy = Query.
Types[LitTyIdx];
2035 .widenScalarToNextPow2(BigTyIdx, 32)
2043 const LLT BigTy = Query.
Types[BigTyIdx];
2044 const LLT LitTy = Query.
Types[LitTyIdx];
2052 getActionDefinitionsBuilder(G_BUILD_VECTOR)
2061 if (
ST.hasScalarPackInsts()) {
2064 .minScalarOrElt(0,
S16)
2067 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
2071 BuildVector.customFor({
V2S16,
S16});
2072 BuildVector.minScalarOrElt(0,
S32);
2074 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
2082 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
2084 .clampMaxNumElements(0,
S32, 32)
2085 .clampMaxNumElements(1,
S16, 2)
2086 .clampMaxNumElements(0,
S16, 64);
2088 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
2091 for (
unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
2092 unsigned BigTyIdx =
Op == G_MERGE_VALUES ? 0 : 1;
2093 unsigned LitTyIdx =
Op == G_MERGE_VALUES ? 1 : 0;
2095 auto notValidElt = [=](
const LegalityQuery &Query,
unsigned TypeIdx) {
2096 const LLT Ty = Query.
Types[TypeIdx];
2108 getActionDefinitionsBuilder(
Op)
2112 const LLT BigTy = Query.
Types[BigTyIdx];
2118 .widenScalarToNextPow2(LitTyIdx, 16)
2127 .clampScalar(LitTyIdx,
S32,
S512)
2128 .widenScalarToNextPow2(LitTyIdx, 32)
2132 return notValidElt(Query, LitTyIdx);
2137 return notValidElt(Query, BigTyIdx);
2142 if (
Op == G_MERGE_VALUES) {
2143 Builder.widenScalarIf(
2146 const LLT Ty = Query.
Types[LitTyIdx];
2152 Builder.widenScalarIf(
2154 const LLT Ty = Query.
Types[BigTyIdx];
2160 const LLT &Ty = Query.
Types[BigTyIdx];
2162 if (NewSizeInBits >= 256) {
2164 if (RoundedTo < NewSizeInBits)
2165 NewSizeInBits = RoundedTo;
2167 return std::pair(BigTyIdx,
LLT::scalar(NewSizeInBits));
2176 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
2177 .legalFor({{
S32}, {
S64}})
2178 .clampScalar(0,
S32,
S64);
2180 if (
ST.hasVOP3PInsts()) {
2181 SextInReg.lowerFor({{
V2S16}})
2185 .clampMaxNumElementsStrict(0,
S16, 2);
2186 }
else if (
ST.has16BitInsts()) {
2187 SextInReg.lowerFor({{
S32}, {
S64}, {
S16}});
2191 SextInReg.lowerFor({{
S32}, {
S64}});
2196 .clampScalar(0,
S32,
S64)
2199 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2203 auto &FSHRActionDefs = getActionDefinitionsBuilder(G_FSHR);
2204 FSHRActionDefs.legalFor({{
S32,
S32}})
2205 .clampMaxNumElementsStrict(0,
S16, 2);
2206 if (
ST.hasVOP3PInsts())
2208 FSHRActionDefs.scalarize(0).lower();
2210 if (
ST.hasVOP3PInsts()) {
2211 getActionDefinitionsBuilder(G_FSHL)
2213 .clampMaxNumElementsStrict(0,
S16, 2)
2217 getActionDefinitionsBuilder(G_FSHL)
2222 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2225 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({
S64});
2227 getActionDefinitionsBuilder(G_FENCE)
2230 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2235 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2237 .clampScalar(1,
S32,
S32)
2238 .clampScalar(0,
S32,
S64)
2239 .widenScalarToNextPow2(0)
2242 getActionDefinitionsBuilder(
2246 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2247 G_READ_REGISTER, G_WRITE_REGISTER,
2252 if (
ST.hasIEEEMinimumMaximumInsts()) {
2253 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2254 .legalFor(FPTypesPK16)
2255 .clampMaxNumElements(0,
S16, 2)
2257 }
else if (
ST.hasVOP3PInsts()) {
2258 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2260 .clampMaxNumElementsStrict(0,
S16, 2)
2264 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2266 .clampScalar(0,
S32,
S64)
2270 getActionDefinitionsBuilder(
2271 {G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET, G_MEMSET_INLINE})
2274 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2276 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2277 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2278 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2281 getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
2283 getActionDefinitionsBuilder(
2284 {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
2285 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
2286 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
2287 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
2292 getActionDefinitionsBuilder({G_INTRINSIC, G_INTRINSIC_W_SIDE_EFFECTS,
2293 G_INTRINSIC_CONVERGENT,
2294 G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS})
2297 getLegacyLegalizerInfo().computeTables();
2307 switch (
MI.getOpcode()) {
2308 case TargetOpcode::G_ADDRSPACE_CAST:
2310 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2312 case TargetOpcode::G_FCEIL:
2314 case TargetOpcode::G_FREM:
2316 case TargetOpcode::G_INTRINSIC_TRUNC:
2318 case TargetOpcode::G_SITOFP:
2320 case TargetOpcode::G_UITOFP:
2322 case TargetOpcode::G_FPTOSI:
2324 case TargetOpcode::G_FPTOUI:
2326 case TargetOpcode::G_FMINNUM:
2327 case TargetOpcode::G_FMAXNUM:
2328 case TargetOpcode::G_FMINIMUMNUM:
2329 case TargetOpcode::G_FMAXIMUMNUM:
2331 case TargetOpcode::G_EXTRACT:
2333 case TargetOpcode::G_INSERT:
2335 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2337 case TargetOpcode::G_INSERT_VECTOR_ELT:
2339 case TargetOpcode::G_FSIN:
2340 case TargetOpcode::G_FCOS:
2342 case TargetOpcode::G_GLOBAL_VALUE:
2344 case TargetOpcode::G_LOAD:
2345 case TargetOpcode::G_SEXTLOAD:
2346 case TargetOpcode::G_ZEXTLOAD:
2348 case TargetOpcode::G_STORE:
2350 case TargetOpcode::G_FMAD:
2352 case TargetOpcode::G_FDIV:
2354 case TargetOpcode::G_FFREXP:
2356 case TargetOpcode::G_FSQRT:
2358 case TargetOpcode::G_UDIV:
2359 case TargetOpcode::G_UREM:
2360 case TargetOpcode::G_UDIVREM:
2362 case TargetOpcode::G_SDIV:
2363 case TargetOpcode::G_SREM:
2364 case TargetOpcode::G_SDIVREM:
2366 case TargetOpcode::G_ATOMIC_CMPXCHG:
2368 case TargetOpcode::G_FLOG2:
2370 case TargetOpcode::G_FLOG:
2371 case TargetOpcode::G_FLOG10:
2373 case TargetOpcode::G_FEXP2:
2375 case TargetOpcode::G_FEXP:
2376 case TargetOpcode::G_FEXP10:
2378 case TargetOpcode::G_FPOW:
2380 case TargetOpcode::G_FFLOOR:
2382 case TargetOpcode::G_BUILD_VECTOR:
2383 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2385 case TargetOpcode::G_MUL:
2387 case TargetOpcode::G_CTLZ:
2388 case TargetOpcode::G_CTTZ:
2390 case TargetOpcode::G_CTLS:
2392 case TargetOpcode::G_CTLZ_ZERO_POISON:
2394 case TargetOpcode::G_STACKSAVE:
2396 case TargetOpcode::G_GET_FPENV:
2398 case TargetOpcode::G_SET_FPENV:
2400 case TargetOpcode::G_TRAP:
2402 case TargetOpcode::G_DEBUGTRAP:
2422 if (ST.hasApertureRegs()) {
2427 ? AMDGPU::SRC_SHARED_BASE
2428 : AMDGPU::SRC_PRIVATE_BASE;
2429 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
2430 !ST.hasGloballyAddressableScratch()) &&
2431 "Cannot use src_private_base with globally addressable scratch!");
2434 B.buildCopy({Dst}, {
Register(ApertureRegNo)});
2435 return B.buildUnmerge(
S32, Dst).getReg(1);
2450 ST.getTargetLowering()->getImplicitParameterOffset(
B.getMF(), Param);
2466 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
2469 return B.buildLoad(
S32, LoadAddr, *MMO).getReg(0);
2491 B.buildObjectPtrOffset(
2493 B.buildConstant(
LLT::scalar(64), StructOffset).getReg(0));
2494 return B.buildLoad(
S32, LoadAddr, *MMO).getReg(0);
2502 switch (Def->getOpcode()) {
2503 case AMDGPU::G_FRAME_INDEX:
2504 case AMDGPU::G_GLOBAL_VALUE:
2505 case AMDGPU::G_BLOCK_ADDR:
2507 case AMDGPU::G_CONSTANT: {
2508 const ConstantInt *CI = Def->getOperand(1).getCImm();
2525 assert(
MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2527 Intrinsic::amdgcn_addrspacecast_nonnull));
2532 :
MI.getOperand(1).getReg();
2536 unsigned SrcAS = SrcTy.getAddressSpace();
2546 MI.setDesc(
B.getTII().get(TargetOpcode::G_BITCAST));
2553 auto castFlatToLocalOrPrivate = [&](
const DstOp &Dst) ->
Register {
2555 ST.hasGloballyAddressableScratch()) {
2559 Register SrcLo =
B.buildExtract(
S32, Src, 0).getReg(0);
2561 B.buildInstr(AMDGPU::S_MOV_B32, {
S32},
2562 {
Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
2564 MRI.
setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
2566 return B.buildIntToPtr(Dst,
Sub).getReg(0);
2570 return B.buildExtract(Dst, Src, 0).getReg(0);
2576 castFlatToLocalOrPrivate(Dst);
2577 MI.eraseFromParent();
2583 auto SegmentNull =
B.buildConstant(DstTy, NullVal);
2584 auto FlatNull =
B.buildConstant(SrcTy, 0);
2587 auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
2591 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2593 MI.eraseFromParent();
2600 auto castLocalOrPrivateToFlat = [&](
const DstOp &Dst) ->
Register {
2603 Register SrcAsInt =
B.buildPtrToInt(
S32, Src).getReg(0);
2606 ST.hasGloballyAddressableScratch()) {
2611 ThreadID =
B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {
S32})
2615 if (ST.isWave64()) {
2616 ThreadID =
B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {
S32})
2622 B.buildConstant(
S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
2623 Register SrcHi =
B.buildShl(
S32, ThreadID, ShAmt).getReg(0);
2625 B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).
getReg(0);
2629 B.buildInstr(AMDGPU::S_MOV_B64, {
S64},
2630 {
Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
2632 MRI.
setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
2633 return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
2642 return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).
getReg(0);
2648 castLocalOrPrivateToFlat(Dst);
2649 MI.eraseFromParent();
2653 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2660 SegmentNull.getReg(0));
2662 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2664 MI.eraseFromParent();
2669 SrcTy.getSizeInBits() == 64) {
2671 B.buildExtract(Dst, Src, 0);
2672 MI.eraseFromParent();
2679 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2680 auto PtrLo =
B.buildPtrToInt(
S32, Src);
2681 if (AddrHiVal == 0) {
2683 B.buildIntToPtr(Dst, Zext);
2685 auto HighAddr =
B.buildConstant(
S32, AddrHiVal);
2686 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2689 MI.eraseFromParent();
2696 MI.eraseFromParent();
2705 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2710 auto C1 =
B.buildFConstant(Ty, C1Val);
2711 auto CopySign =
B.buildFCopysign(Ty, C1, Src);
2714 auto Tmp1 =
B.buildFAdd(Ty, Src, CopySign);
2715 auto Tmp2 =
B.buildFSub(Ty, Tmp1, CopySign);
2717 auto C2 =
B.buildFConstant(Ty, C2Val);
2718 auto Fabs =
B.buildFAbs(Ty, Src);
2721 B.buildSelect(
MI.getOperand(0).getReg(),
Cond, Src, Tmp2);
2722 MI.eraseFromParent();
2740 auto Trunc =
B.buildIntrinsicTrunc(
S64, Src);
2742 const auto Zero =
B.buildFConstant(
S64, 0.0);
2743 const auto One =
B.buildFConstant(
S64, 1.0);
2746 auto And =
B.buildAnd(
S1, Lt0, NeTrunc);
2747 auto Add =
B.buildSelect(
S64,
And, One, Zero);
2750 B.buildFAdd(
MI.getOperand(0).getReg(), Trunc,
Add);
2751 MI.eraseFromParent();
2759 Register Src0Reg =
MI.getOperand(1).getReg();
2760 Register Src1Reg =
MI.getOperand(2).getReg();
2761 auto Flags =
MI.getFlags();
2764 auto Div =
B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2765 auto Trunc =
B.buildIntrinsicTrunc(Ty, Div, Flags);
2766 auto Neg =
B.buildFNeg(Ty, Trunc, Flags);
2767 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2768 MI.eraseFromParent();
2774 const unsigned FractBits = 52;
2775 const unsigned ExpBits = 11;
2778 auto Const0 =
B.buildConstant(
S32, FractBits - 32);
2779 auto Const1 =
B.buildConstant(
S32, ExpBits);
2781 auto ExpPart =
B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {
S32})
2783 .addUse(Const0.getReg(0))
2784 .addUse(Const1.getReg(0));
2786 return B.buildSub(
S32, ExpPart,
B.buildConstant(
S32, 1023));
2800 auto Unmerge =
B.buildUnmerge({
S32,
S32}, Src);
2807 const unsigned FractBits = 52;
2810 const auto SignBitMask =
B.buildConstant(
S32, UINT32_C(1) << 31);
2811 auto SignBit =
B.buildAnd(
S32,
Hi, SignBitMask);
2813 const auto FractMask =
B.buildConstant(
S64, (UINT64_C(1) << FractBits) - 1);
2815 const auto Zero32 =
B.buildConstant(
S32, 0);
2818 auto SignBit64 =
B.buildMergeLikeInstr(
S64, {Zero32, SignBit});
2820 auto Shr =
B.buildAShr(
S64, FractMask, Exp);
2821 auto Not =
B.buildNot(
S64, Shr);
2822 auto Tmp0 =
B.buildAnd(
S64, Src, Not);
2823 auto FiftyOne =
B.buildConstant(
S32, FractBits - 1);
2828 auto Tmp1 =
B.buildSelect(
S64, ExpLt0, SignBit64, Tmp0);
2829 B.buildSelect(
MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2830 MI.eraseFromParent();
2846 auto Unmerge =
B.buildUnmerge({
S32,
S32}, Src);
2847 auto ThirtyTwo =
B.buildConstant(
S32, 32);
2850 auto CvtHi =
Signed ?
B.buildSITOFP(
S64, Unmerge.getReg(1))
2851 :
B.buildUITOFP(
S64, Unmerge.getReg(1));
2853 auto CvtLo =
B.buildUITOFP(
S64, Unmerge.getReg(0));
2854 auto LdExp =
B.buildFLdexp(
S64, CvtHi, ThirtyTwo);
2857 B.buildFAdd(Dst, LdExp, CvtLo);
2858 MI.eraseFromParent();
2864 auto One =
B.buildConstant(
S32, 1);
2868 auto ThirtyOne =
B.buildConstant(
S32, 31);
2869 auto X =
B.buildXor(
S32, Unmerge.getReg(0), Unmerge.getReg(1));
2870 auto OppositeSign =
B.buildAShr(
S32,
X, ThirtyOne);
2871 auto MaxShAmt =
B.buildAdd(
S32, ThirtyTwo, OppositeSign);
2872 auto LS =
B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {
S32})
2873 .addUse(Unmerge.getReg(1));
2874 auto LS2 =
B.buildSub(
S32, LS, One);
2875 ShAmt =
B.buildUMin(
S32, LS2, MaxShAmt);
2877 ShAmt =
B.buildCTLZ(
S32, Unmerge.getReg(1));
2878 auto Norm =
B.buildShl(
S64, Src, ShAmt);
2879 auto Unmerge2 =
B.buildUnmerge({
S32,
S32}, Norm);
2880 auto Adjust =
B.buildUMin(
S32, One, Unmerge2.getReg(0));
2881 auto Norm2 =
B.buildOr(
S32, Unmerge2.getReg(1), Adjust);
2882 auto FVal =
Signed ?
B.buildSITOFP(
S32, Norm2) :
B.buildUITOFP(
S32, Norm2);
2883 auto Scale =
B.buildSub(
S32, ThirtyTwo, ShAmt);
2884 B.buildFLdexp(Dst, FVal, Scale);
2885 MI.eraseFromParent();
2905 unsigned Flags =
MI.getFlags();
2916 auto Trunc =
B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2924 Sign =
B.buildAShr(
S32, Src,
B.buildConstant(
S32, 31));
2925 Trunc =
B.buildFAbs(
S32, Trunc, Flags);
2929 K0 =
B.buildFConstant(
2931 K1 =
B.buildFConstant(
2934 K0 =
B.buildFConstant(
2936 K1 =
B.buildFConstant(
2940 auto Mul =
B.buildFMul(SrcLT, Trunc, K0, Flags);
2941 auto FloorMul =
B.buildFFloor(SrcLT,
Mul, Flags);
2942 auto Fma =
B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2945 :
B.buildFPTOUI(
S32, FloorMul);
2946 auto Lo =
B.buildFPTOUI(
S32, Fma);
2950 Sign =
B.buildMergeLikeInstr(
S64, {Sign, Sign});
2952 B.buildSub(Dst,
B.buildXor(
S64,
B.buildMergeLikeInstr(
S64, {Lo, Hi}), Sign),
2955 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
2956 MI.eraseFromParent();
2988 unsigned StartIdx =
Offset / 32;
2990 auto Unmerge =
B.buildUnmerge(
LLT::scalar(32), SrcReg);
2992 if (DstCount == 1) {
2994 B.buildIntToPtr(DstReg, Unmerge.getReg(StartIdx));
2999 for (
unsigned I = 0;
I < DstCount; ++
I)
3000 MergeVec.
push_back(Unmerge.getReg(StartIdx +
I));
3001 B.buildMergeLikeInstr(DstReg, MergeVec);
3004 MI.eraseFromParent();
3014 Register InsertSrc =
MI.getOperand(2).getReg();
3023 if (
Offset % 32 != 0 || DstSize % 32 != 0 || InsertSize % 32 != 0)
3027 unsigned DstCount = DstSize / 32;
3028 unsigned InsertCount = InsertSize / 32;
3029 unsigned StartIdx =
Offset / 32;
3031 auto SrcUnmerge =
B.buildUnmerge(
S32, SrcReg);
3034 for (
unsigned I = 0;
I < StartIdx; ++
I)
3037 if (InsertCount == 1) {
3041 InsertSrc =
B.buildPtrToInt(
S32, InsertSrc).getReg(0);
3044 auto InsertUnmerge =
B.buildUnmerge(
S32, InsertSrc);
3045 for (
unsigned I = 0;
I < InsertCount; ++
I)
3049 for (
unsigned I = StartIdx + InsertCount;
I < DstCount; ++
I)
3052 B.buildMergeLikeInstr(DstReg, MergeVec);
3054 MI.eraseFromParent();
3081 auto IntVec =
B.buildPtrToInt(IntVecTy, Vec);
3082 auto IntElt =
B.buildExtractVectorElement(IntTy, IntVec,
MI.getOperand(2));
3083 B.buildIntToPtr(Dst, IntElt);
3085 MI.eraseFromParent();
3092 std::optional<ValueAndVReg> MaybeIdxVal =
3096 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3099 auto Unmerge =
B.buildUnmerge(EltTy, Vec);
3100 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
3105 MI.eraseFromParent();
3134 auto IntVecSource =
B.buildPtrToInt(IntVecTy, Vec);
3135 auto IntIns =
B.buildPtrToInt(IntTy, Ins);
3136 auto IntVecDest =
B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
3138 B.buildIntToPtr(Dst, IntVecDest);
3139 MI.eraseFromParent();
3146 std::optional<ValueAndVReg> MaybeIdxVal =
3151 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3154 if (IdxVal < NumElts) {
3156 for (
unsigned i = 0; i < NumElts; ++i)
3158 B.buildUnmerge(SrcRegs, Vec);
3160 SrcRegs[IdxVal] =
MI.getOperand(2).getReg();
3161 B.buildMergeLikeInstr(Dst, SrcRegs);
3166 MI.eraseFromParent();
3177 unsigned Flags =
MI.getFlags();
3181 if (ST.hasTrigReducedRange()) {
3182 auto MulVal =
B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
3183 TrigVal =
B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
3184 .addUse(MulVal.getReg(0))
3188 TrigVal =
B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
3191 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
3195 MI.eraseFromParent();
3203 unsigned GAFlags)
const {
3232 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
3234 if (ST.has64BitLiterals()) {
3238 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
3242 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);
3251 if (!
B.getMRI()->getRegClassOrNull(PCReg))
3252 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
3255 B.buildExtract(DstReg, PCReg, 0);
3265 if (RequiresHighHalf && ST.has64BitLiterals()) {
3267 MRI.
setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
3268 B.buildInstr(AMDGPU::S_MOV_B64)
3283 MRI.
setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
3286 B.buildInstr(AMDGPU::S_MOV_B32)
3291 if (RequiresHighHalf) {
3293 "Must provide a 64-bit pointer type!");
3296 MRI.
setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
3298 B.buildInstr(AMDGPU::S_MOV_B32)
3309 MRI.
setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
3311 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
3315 if (AddrDst != DstReg)
3316 B.buildCast(DstReg, AddrDst);
3317 }
else if (AddrLo != DstReg) {
3320 B.buildCast(DstReg, AddrLo);
3329 unsigned AS = Ty.getAddressSpace();
3337 GV->
getName() !=
"llvm.amdgcn.module.lds" &&
3341 Fn,
"local memory global used by non-kernel function",
3350 B.buildUndef(DstReg);
3351 MI.eraseFromParent();
3375 auto Sz =
B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {
S32});
3376 B.buildIntToPtr(DstReg, Sz);
3377 MI.eraseFromParent();
3383 MI.eraseFromParent();
3387 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3389 MI.eraseFromParent();
3397 MI.eraseFromParent();
3403 MI.eraseFromParent();
3419 if (Ty.getSizeInBits() == 32) {
3421 auto Load =
B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3422 B.buildExtract(DstReg, Load, 0);
3424 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3426 MI.eraseFromParent();
3449 auto Cast =
B.buildAddrSpaceCast(ConstPtr, PtrReg);
3451 MI.getOperand(1).setReg(Cast.getReg(0));
3456 if (
MI.getOpcode() != AMDGPU::G_LOAD)
3482 if (WideMemSize == ValSize) {
3488 MI.setMemRefs(MF, {WideMMO});
3494 if (ValSize > WideMemSize)
3501 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3502 B.buildTrunc(ValReg, WideLoad).getReg(0);
3509 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3510 B.buildExtract(ValReg, WideLoad, 0);
3514 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3515 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3519 MI.eraseFromParent();
3532 Register DataReg =
MI.getOperand(0).getReg();
3577 "this should not have been custom lowered");
3582 Register PackedVal =
B.buildBuildVector(VecTy, { NewVal, CmpVal }).
getReg(0);
3584 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3588 .setMemRefs(
MI.memoperands());
3590 MI.eraseFromParent();
3598 switch (
DefMI->getOpcode()) {
3599 case TargetOpcode::G_INTRINSIC: {
3601 case Intrinsic::amdgcn_frexp_mant:
3602 case Intrinsic::amdgcn_log:
3603 case Intrinsic::amdgcn_log_clamp:
3604 case Intrinsic::amdgcn_exp2:
3605 case Intrinsic::amdgcn_sqrt:
3613 case TargetOpcode::G_FSQRT:
3615 case TargetOpcode::G_FFREXP: {
3616 if (
DefMI->getOperand(0).getReg() == Src)
3620 case TargetOpcode::G_FPEXT: {
3641std::pair<Register, Register>
3643 unsigned Flags)
const {
3648 auto SmallestNormal =
B.buildFConstant(
3650 auto IsLtSmallestNormal =
3653 auto Scale32 =
B.buildFConstant(
F32, 0x1.0p+32);
3654 auto One =
B.buildFConstant(
F32, 1.0);
3656 B.buildSelect(
F32, IsLtSmallestNormal, Scale32, One, Flags);
3657 auto ScaledInput =
B.buildFMul(
F32, Src, ScaleFactor, Flags);
3659 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3672 LLT Ty =
B.getMRI()->getType(Dst);
3673 unsigned Flags =
MI.getFlags();
3678 auto Ext =
B.buildFPExt(
F32, Src, Flags);
3679 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_log, {
F32})
3680 .addUse(Ext.getReg(0))
3682 B.buildFPTrunc(Dst,
Log2, Flags);
3683 MI.eraseFromParent();
3691 B.buildIntrinsic(Intrinsic::amdgcn_log, {
MI.getOperand(0)})
3694 MI.eraseFromParent();
3698 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3699 .addUse(ScaledInput)
3702 auto ThirtyTwo =
B.buildFConstant(Ty, 32.0);
3703 auto Zero =
B.buildFConstant(Ty, 0.0);
3705 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3706 B.buildFSub(Dst,
Log2, ResultOffset, Flags);
3708 MI.eraseFromParent();
3714 auto FMul =
B.buildFMul(Ty,
X,
Y, Flags);
3715 return B.buildFAdd(Ty,
FMul, Z, Flags).getReg(0);
3720 const bool IsLog10 =
MI.getOpcode() == TargetOpcode::G_FLOG10;
3721 assert(IsLog10 ||
MI.getOpcode() == TargetOpcode::G_FLOG);
3726 unsigned Flags =
MI.getFlags();
3739 auto PromoteSrc =
B.buildFPExt(
F32,
X);
3741 B.buildFPTrunc(Dst, LogVal);
3746 MI.eraseFromParent();
3755 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(
X).setMIFlags(Flags);
3758 if (ST.hasFastFMAF32()) {
3760 const float c_log10 = 0x1.344134p-2f;
3761 const float cc_log10 = 0x1.09f79ep-26f;
3764 const float c_log = 0x1.62e42ep-1f;
3765 const float cc_log = 0x1.efa39ep-25f;
3767 auto C =
B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3768 auto CC =
B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3772 R =
B.buildFMul(Ty,
Y,
C, NewFlags).getReg(0);
3773 auto NegR =
B.buildFNeg(Ty, R, NewFlags);
3774 auto FMA0 =
B.buildFMA(Ty,
Y,
C, NegR, NewFlags);
3775 auto FMA1 =
B.buildFMA(Ty,
Y, CC, FMA0, NewFlags);
3776 R =
B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0);
3779 const float ch_log10 = 0x1.344000p-2f;
3780 const float ct_log10 = 0x1.3509f6p-18f;
3783 const float ch_log = 0x1.62e000p-1f;
3784 const float ct_log = 0x1.0bfbe8p-15f;
3786 auto CH =
B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3787 auto CT =
B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3789 auto MaskConst =
B.buildConstant(Ty, 0xfffff000);
3790 auto YH =
B.buildAnd(Ty,
Y, MaskConst);
3791 auto YT =
B.buildFSub(Ty,
Y, YH, Flags);
3795 auto YTCT =
B.buildFMul(Ty, YT, CT, NewFlags);
3798 getMad(
B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags);
3800 R =
getMad(
B, Ty, YH.getReg(0),
CH.getReg(0), Mad1, NewFlags);
3803 const bool IsFiniteOnly =
3806 if (!IsFiniteOnly) {
3809 auto Fabs =
B.buildFAbs(Ty,
Y);
3812 R =
B.buildSelect(Ty, IsFinite, R,
Y, Flags).getReg(0);
3816 auto Zero =
B.buildFConstant(Ty, 0.0);
3818 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3819 auto Shift =
B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3820 B.buildFSub(Dst, R, Shift, Flags);
3822 B.buildCopy(Dst, R);
3825 MI.eraseFromParent();
3831 unsigned Flags)
const {
3832 const double Log2BaseInverted =
3835 LLT Ty =
B.getMRI()->getType(Dst);
3840 auto LogSrc =
B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3843 auto ScaledResultOffset =
B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3844 auto Zero =
B.buildFConstant(Ty, 0.0);
3846 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3847 auto Log2Inv =
B.buildFConstant(Ty, Log2BaseInverted);
3849 if (ST.hasFastFMAF32())
3850 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3852 auto Mul =
B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3853 B.buildFAdd(Dst,
Mul, ResultOffset, Flags);
3861 ?
B.buildFLog2(Ty, Src, Flags)
3862 :
B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3865 auto Log2BaseInvertedOperand =
B.buildFConstant(Ty, Log2BaseInverted);
3866 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3877 unsigned Flags =
MI.getFlags();
3878 LLT Ty =
B.getMRI()->getType(Dst);
3888 auto Ext =
B.buildFPExt(
F32, Src, Flags);
3889 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {
F32})
3890 .addUse(Ext.getReg(0))
3892 B.buildFPTrunc(Dst,
Log2, Flags);
3893 MI.eraseFromParent();
3903 MI.eraseFromParent();
3911 auto RangeCheckConst =
B.buildFConstant(Ty, -0x1.f80000p+6f);
3913 RangeCheckConst, Flags);
3915 auto SixtyFour =
B.buildFConstant(Ty, 0x1.0p+6f);
3916 auto Zero =
B.buildFConstant(Ty, 0.0);
3917 auto AddOffset =
B.buildSelect(
F32, NeedsScaling, SixtyFour, Zero, Flags);
3918 auto AddInput =
B.buildFAdd(
F32, Src, AddOffset, Flags);
3920 auto Exp2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3921 .addUse(AddInput.getReg(0))
3924 auto TwoExpNeg64 =
B.buildFConstant(Ty, 0x1.0p-64f);
3925 auto One =
B.buildFConstant(Ty, 1.0);
3926 auto ResultScale =
B.buildSelect(
F32, NeedsScaling, TwoExpNeg64, One, Flags);
3927 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3928 MI.eraseFromParent();
3933 const SrcOp &Src,
unsigned Flags) {
3934 LLT Ty = Dst.getLLTTy(*
B.getMRI());
3937 return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst})
3938 .addUse(Src.getReg())
3941 return B.buildFExp2(Dst, Src, Flags);
3947 bool IsExp10)
const {
3948 LLT Ty =
B.getMRI()->getType(
X);
3952 auto Const =
B.buildFConstant(Ty, IsExp10 ? 0x1.a934f0p+1f :
numbers::log2e);
3953 auto Mul =
B.buildFMul(Ty,
X, Const, Flags);
3960 LLT Ty =
B.getMRI()->getType(Dst);
3967 auto Threshold =
B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3970 auto ScaleOffset =
B.buildFConstant(Ty, 0x1.0p+6f);
3971 auto ScaledX =
B.buildFAdd(Ty,
X, ScaleOffset, Flags);
3972 auto AdjustedX =
B.buildSelect(Ty, NeedsScaling, ScaledX,
X, Flags);
3975 auto ExpInput =
B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3977 auto Exp2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3978 .addUse(ExpInput.getReg(0))
3981 auto ResultScaleFactor =
B.buildFConstant(Ty, 0x1.969d48p-93f);
3982 auto AdjustedResult =
B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3983 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3989 unsigned Flags)
const {
3990 LLT Ty =
B.getMRI()->getType(Dst);
3995 auto K0 =
B.buildFConstant(Ty, 0x1.a92000p+1f);
3996 auto K1 =
B.buildFConstant(Ty, 0x1.4f0978p-11f);
3998 auto Mul1 =
B.buildFMul(Ty,
X, K1, Flags);
3999 auto Exp2_1 =
buildExp(
B, Ty, Mul1, Flags);
4000 auto Mul0 =
B.buildFMul(Ty,
X, K0, Flags);
4001 auto Exp2_0 =
buildExp(
B, Ty, Mul0, Flags);
4002 B.buildFMul(Dst, Exp2_0, Exp2_1, Flags);
4012 auto Threshold =
B.buildFConstant(Ty, -0x1.2f7030p+5f);
4016 auto ScaleOffset =
B.buildFConstant(Ty, 0x1.0p+5f);
4017 auto ScaledX =
B.buildFAdd(Ty,
X, ScaleOffset, Flags);
4018 auto AdjustedX =
B.buildSelect(Ty, NeedsScaling, ScaledX,
X);
4020 auto K0 =
B.buildFConstant(Ty, 0x1.a92000p+1f);
4021 auto K1 =
B.buildFConstant(Ty, 0x1.4f0978p-11f);
4023 auto Mul1 =
B.buildFMul(Ty, AdjustedX, K1, Flags);
4024 auto Exp2_1 =
buildExp(
B, Ty, Mul1, Flags);
4025 auto Mul0 =
B.buildFMul(Ty, AdjustedX, K0, Flags);
4026 auto Exp2_0 =
buildExp(
B, Ty, Mul0, Flags);
4028 auto MulExps =
B.buildFMul(Ty, Exp2_0, Exp2_1, Flags);
4029 auto ResultScaleFactor =
B.buildFConstant(Ty, 0x1.9f623ep-107f);
4030 auto AdjustedResult =
B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags);
4032 B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps);
4051 if (
MI.getOpcode() == TargetOpcode::G_FEXP2) {
4053 Dn =
B.buildFRint(
S64,
X, Flags).getReg(0);
4055 F =
B.buildFSub(
S64,
X, Dn, Flags).getReg(0);
4057 auto C1 =
B.buildFConstant(
S64,
APFloat(0x1.62e42fefa39efp-1));
4058 auto C2 =
B.buildFConstant(
S64,
APFloat(0x1.abc9e3b39803fp-56));
4059 auto Mul2 =
B.buildFMul(
S64,
F, C2, Flags).getReg(0);
4060 T =
B.buildFMA(
S64,
F, C1, Mul2, Flags).getReg(0);
4062 }
else if (
MI.getOpcode() == TargetOpcode::G_FEXP10) {
4063 auto C1 =
B.buildFConstant(
S64,
APFloat(0x1.a934f0979a371p+1));
4064 auto Mul =
B.buildFMul(
S64,
X, C1, Flags).getReg(0);
4065 Dn =
B.buildFRint(
S64,
Mul, Flags).getReg(0);
4067 auto NegDn =
B.buildFNeg(
S64, Dn, Flags).getReg(0);
4068 auto C2 =
B.buildFConstant(
S64,
APFloat(-0x1.9dc1da994fd21p-59));
4069 auto C3 =
B.buildFConstant(
S64,
APFloat(0x1.34413509f79ffp-2));
4070 auto Inner =
B.buildFMA(
S64, NegDn, C3,
X, Flags).getReg(0);
4071 F =
B.buildFMA(
S64, NegDn, C2, Inner, Flags).getReg(0);
4073 auto C4 =
B.buildFConstant(
S64,
APFloat(0x1.26bb1bbb55516p+1));
4074 auto C5 =
B.buildFConstant(
S64,
APFloat(-0x1.f48ad494ea3e9p-53));
4075 auto MulF =
B.buildFMul(
S64,
F, C5, Flags).getReg(0);
4076 T =
B.buildFMA(
S64,
F, C4, MulF, Flags).getReg(0);
4079 auto C1 =
B.buildFConstant(
S64,
APFloat(0x1.71547652b82fep+0));
4080 auto Mul =
B.buildFMul(
S64,
X, C1, Flags).getReg(0);
4081 Dn =
B.buildFRint(
S64,
Mul, Flags).getReg(0);
4083 auto NegDn =
B.buildFNeg(
S64, Dn, Flags).getReg(0);
4084 auto C2 =
B.buildFConstant(
S64,
APFloat(0x1.abc9e3b39803fp-56));
4085 auto C3 =
B.buildFConstant(
S64,
APFloat(0x1.62e42fefa39efp-1));
4086 auto Inner =
B.buildFMA(
S64, NegDn, C3,
X, Flags).getReg(0);
4087 T =
B.buildFMA(
S64, NegDn, C2, Inner, Flags).getReg(0);
4091 auto P =
B.buildFConstant(
S64, 0x1.ade156a5dcb37p-26);
4092 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.28af3fca7ab0cp-22),
4094 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.71dee623fde64p-19),
4096 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.a01997c89e6b0p-16),
4098 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.a01a014761f6ep-13),
4100 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.6c16c1852b7b0p-10),
4102 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.1111111122322p-7), Flags);
4103 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.55555555502a1p-5), Flags);
4104 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.5555555555511p-3), Flags);
4105 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.000000000000bp-1), Flags);
4107 auto One =
B.buildFConstant(
S64, 1.0);
4108 P =
B.buildFMA(
S64,
T,
P, One, Flags);
4109 P =
B.buildFMA(
S64,
T,
P, One, Flags);
4112 auto DnInt =
B.buildFPTOSI(
S32, Dn);
4113 auto Z =
B.buildFLdexp(
S64,
P, DnInt, Flags);
4120 Z =
B.buildSelect(
S64, CondHi, Z, PInf, Flags);
4127 B.buildSelect(
MI.getOperand(0).getReg(), CondLo, Z, Zero, Flags);
4129 MI.eraseFromParent();
4137 const unsigned Flags =
MI.getFlags();
4149 const bool IsExp10 =
MI.getOpcode() == TargetOpcode::G_FEXP10;
4157 MI.eraseFromParent();
4168 auto Ext =
B.buildFPExt(
F32,
X, Flags);
4171 B.buildFPTrunc(Dst, Lowered, Flags);
4172 MI.eraseFromParent();
4183 MI.eraseFromParent();
4211 const unsigned FlagsNoContract = Flags &
~MachineInstr::FmContract;
4214 if (ST.hasFastFMAF32()) {
4216 const float cc_exp = 0x1.4ae0bep-26f;
4217 const float c_exp10 = 0x1.a934f0p+1f;
4218 const float cc_exp10 = 0x1.2f346ep-24f;
4220 auto C =
B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
4221 PH =
B.buildFMul(Ty,
X,
C, Flags).getReg(0);
4222 auto NegPH =
B.buildFNeg(Ty, PH, Flags);
4223 auto FMA0 =
B.buildFMA(Ty,
X,
C, NegPH, Flags);
4225 auto CC =
B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
4226 PL =
B.buildFMA(Ty,
X, CC, FMA0, Flags).getReg(0);
4228 const float ch_exp = 0x1.714000p+0f;
4229 const float cl_exp = 0x1.47652ap-12f;
4231 const float ch_exp10 = 0x1.a92000p+1f;
4232 const float cl_exp10 = 0x1.4f0978p-11f;
4234 auto MaskConst =
B.buildConstant(Ty, 0xfffff000);
4235 auto XH =
B.buildAnd(Ty,
X, MaskConst);
4236 auto XL =
B.buildFSub(Ty,
X, XH, Flags);
4238 auto CH =
B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
4239 PH =
B.buildFMul(Ty, XH,
CH, Flags).getReg(0);
4241 auto CL =
B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
4242 auto XLCL =
B.buildFMul(Ty, XL, CL, Flags);
4245 getMad(
B, Ty, XL.getReg(0),
CH.getReg(0), XLCL.getReg(0), Flags);
4246 PL =
getMad(
B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
4249 auto E =
B.buildIntrinsicRoundeven(Ty, PH, Flags);
4252 auto PHSubE =
B.buildFSub(Ty, PH, E, FlagsNoContract);
4253 auto A =
B.buildFAdd(Ty, PHSubE, PL, Flags);
4256 auto Exp2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
4257 .addUse(
A.getReg(0))
4259 auto R =
B.buildFLdexp(Ty, Exp2, IntE, Flags);
4261 auto UnderflowCheckConst =
4262 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
4263 auto Zero =
B.buildFConstant(Ty, 0.0);
4267 R =
B.buildSelect(Ty, Underflow, Zero, R);
4270 auto OverflowCheckConst =
4271 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
4276 R =
B.buildSelect(Ty, Overflow, Inf, R, Flags);
4279 B.buildCopy(Dst, R);
4280 MI.eraseFromParent();
4289 unsigned Flags =
MI.getFlags();
4290 LLT Ty =
B.getMRI()->getType(Dst);
4295 auto Log =
B.buildFLog2(
F32, Src0, Flags);
4296 auto Mul =
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {
F32})
4297 .addUse(Log.getReg(0))
4300 B.buildFExp2(Dst,
Mul, Flags);
4301 }
else if (Ty == F16) {
4303 auto Log =
B.buildFLog2(F16, Src0, Flags);
4304 auto Ext0 =
B.buildFPExt(
F32, Log, Flags);
4305 auto Ext1 =
B.buildFPExt(
F32, Src1, Flags);
4306 auto Mul =
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {
F32})
4307 .addUse(Ext0.getReg(0))
4308 .addUse(Ext1.getReg(0))
4310 B.buildFExp2(Dst,
B.buildFPTrunc(F16,
Mul), Flags);
4314 MI.eraseFromParent();
4322 ModSrc = SrcFNeg->getOperand(1).getReg();
4324 ModSrc = SrcFAbs->getOperand(1).getReg();
4326 ModSrc = SrcFAbs->getOperand(1).getReg();
4337 Register OrigSrc =
MI.getOperand(1).getReg();
4338 unsigned Flags =
MI.getFlags();
4340 "this should not have been custom lowered");
4350 auto Fract =
B.buildIntrinsic(Intrinsic::amdgcn_fract, {
F64})
4370 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
4372 B.buildFMinNum(Min, Fract, Const, Flags);
4377 CorrectedFract =
B.buildSelect(
F64, IsNan, ModSrc, Min, Flags).getReg(0);
4380 auto NegFract =
B.buildFNeg(
F64, CorrectedFract, Flags);
4381 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
4383 MI.eraseFromParent();
4399 if (
MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
4401 Src0 =
B.buildTrunc(
S16,
MI.getOperand(1).getReg()).getReg(0);
4402 Src1 =
B.buildTrunc(
S16,
MI.getOperand(2).getReg()).getReg(0);
4405 auto Merge =
B.buildMergeLikeInstr(
S32, {Src0, Src1});
4406 B.buildBitcast(Dst,
Merge);
4408 MI.eraseFromParent();
4425 bool UsePartialMad64_32,
4426 bool SeparateOddAlignedProducts)
const {
4441 auto getZero32 = [&]() ->
Register {
4443 Zero32 =
B.buildConstant(
S32, 0).getReg(0);
4446 auto getZero64 = [&]() ->
Register {
4448 Zero64 =
B.buildConstant(
S64, 0).getReg(0);
4453 for (
unsigned i = 0; i < Src0.
size(); ++i) {
4464 if (CarryIn.empty())
4467 bool HaveCarryOut =
true;
4469 if (CarryIn.size() == 1) {
4471 LocalAccum =
B.buildZExt(
S32, CarryIn[0]).getReg(0);
4475 CarryAccum = getZero32();
4477 CarryAccum =
B.buildZExt(
S32, CarryIn[0]).getReg(0);
4478 for (
unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
4480 B.buildUAdde(
S32,
S1, CarryAccum, getZero32(), CarryIn[i])
4485 LocalAccum = getZero32();
4486 HaveCarryOut =
false;
4491 B.buildUAdde(
S32,
S1, CarryAccum, LocalAccum, CarryIn.back());
4492 LocalAccum =
Add.getReg(0);
4506 auto buildMadChain =
4509 assert((DstIndex + 1 < Accum.
size() && LocalAccum.size() == 2) ||
4510 (DstIndex + 1 >= Accum.
size() && LocalAccum.size() == 1));
4517 if (LocalAccum.size() == 1 &&
4518 (!UsePartialMad64_32 || !CarryIn.empty())) {
4521 unsigned j1 = DstIndex - j0;
4522 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4526 auto Mul =
B.buildMul(
S32, Src0[j0], Src1[j1]);
4528 LocalAccum[0] =
Mul.getReg(0);
4530 if (CarryIn.empty()) {
4531 LocalAccum[0] =
B.buildAdd(
S32, LocalAccum[0],
Mul).getReg(0);
4534 B.buildUAdde(
S32,
S1, LocalAccum[0],
Mul, CarryIn.back())
4540 }
while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4544 if (j0 <= DstIndex) {
4545 bool HaveSmallAccum =
false;
4548 if (LocalAccum[0]) {
4549 if (LocalAccum.size() == 1) {
4550 Tmp =
B.buildAnyExt(
S64, LocalAccum[0]).getReg(0);
4551 HaveSmallAccum =
true;
4552 }
else if (LocalAccum[1]) {
4553 Tmp =
B.buildMergeLikeInstr(
S64, LocalAccum).getReg(0);
4554 HaveSmallAccum =
false;
4556 Tmp =
B.buildZExt(
S64, LocalAccum[0]).getReg(0);
4557 HaveSmallAccum =
true;
4560 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4562 HaveSmallAccum =
true;
4566 unsigned j1 = DstIndex - j0;
4567 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4571 auto Mad =
B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {
S64,
S1},
4572 {Src0[j0], Src1[j1], Tmp});
4573 Tmp = Mad.getReg(0);
4574 if (!HaveSmallAccum)
4575 CarryOut.push_back(Mad.getReg(1));
4576 HaveSmallAccum =
false;
4579 }
while (j0 <= DstIndex);
4581 auto Unmerge =
B.buildUnmerge(
S32, Tmp);
4582 LocalAccum[0] = Unmerge.getReg(0);
4583 if (LocalAccum.size() > 1)
4584 LocalAccum[1] = Unmerge.getReg(1);
4611 for (
unsigned i = 0; i <= Accum.
size() / 2; ++i) {
4612 Carry OddCarryIn = std::move(OddCarry);
4613 Carry EvenCarryIn = std::move(EvenCarry);
4618 if (2 * i < Accum.
size()) {
4619 auto LocalAccum = Accum.
drop_front(2 * i).take_front(2);
4620 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4625 if (!SeparateOddAlignedProducts) {
4626 auto LocalAccum = Accum.
drop_front(2 * i - 1).take_front(2);
4627 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4629 bool IsHighest = 2 * i >= Accum.
size();
4632 .take_front(IsHighest ? 1 : 2);
4633 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4639 Lo =
B.buildUAddo(
S32,
S1, Accum[2 * i - 1], SeparateOddOut[0]);
4641 Lo =
B.buildAdd(
S32, Accum[2 * i - 1], SeparateOddOut[0]);
4643 Lo =
B.buildUAdde(
S32,
S1, Accum[2 * i - 1], SeparateOddOut[0],
4646 Accum[2 * i - 1] =
Lo->getOperand(0).getReg();
4649 auto Hi =
B.buildUAdde(
S32,
S1, Accum[2 * i], SeparateOddOut[1],
4650 Lo->getOperand(1).getReg());
4651 Accum[2 * i] =
Hi.getReg(0);
4652 SeparateOddCarry =
Hi.getReg(1);
4659 if (
Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4660 EvenCarryIn.push_back(CarryOut);
4662 if (2 * i < Accum.
size()) {
4663 if (
Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4664 OddCarry.push_back(CarryOut);
4676 assert(ST.hasMad64_32());
4677 assert(
MI.getOpcode() == TargetOpcode::G_MUL);
4689 unsigned Size = Ty.getSizeInBits();
4690 if (ST.hasVMulU64Inst() &&
Size == 64)
4693 unsigned NumParts =
Size / 32;
4705 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4709 for (
unsigned i = 0; i < NumParts; ++i) {
4713 B.buildUnmerge(Src0Parts, Src0);
4714 B.buildUnmerge(Src1Parts, Src1);
4717 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4718 SeparateOddAlignedProducts);
4720 B.buildMergeLikeInstr(DstReg, AccumRegs);
4721 MI.eraseFromParent();
4736 unsigned NewOpc =
MI.getOpcode() == AMDGPU::G_CTLZ
4737 ? AMDGPU::G_AMDGPU_FFBH_U32
4738 : AMDGPU::G_AMDGPU_FFBL_B32;
4739 auto Tmp =
B.buildInstr(NewOpc, {DstTy}, {Src});
4742 MI.eraseFromParent();
4752 TypeSize NumBits = SrcTy.getSizeInBits();
4756 auto ShiftAmt =
B.buildConstant(
S32, 32u - NumBits);
4757 auto Extend =
B.buildAnyExt(
S32, {Src}).
getReg(0u);
4758 auto Shift =
B.buildShl(
S32, Extend, ShiftAmt);
4759 auto Ctlz =
B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {
S32}, {Shift});
4760 B.buildTrunc(Dst, Ctlz);
4761 MI.eraseFromParent();
4772 assert(SrcTy ==
S32 &&
"legalizeCTLS only supports s32");
4773 unsigned BitWidth = SrcTy.getSizeInBits();
4775 auto Sffbh =
B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {
S32}).addUse(Src);
4777 B.buildSub(Dst, Clamped,
B.buildConstant(
S32, 1));
4778 MI.eraseFromParent();
4784 if (
MI.getOpcode() != TargetOpcode::G_XOR)
4787 return ConstVal == -1;
4794 Register CondDef =
MI.getOperand(0).getReg();
4813 if (
UseMI->getParent() != Parent ||
UseMI->getOpcode() != AMDGPU::G_BRCOND)
4822 UncondBrTarget = &*NextMBB;
4824 if (
Next->getOpcode() != AMDGPU::G_BR)
4843 *ArgRC,
B.getDebugLoc(), ArgTy);
4847 const unsigned Mask = Arg->
getMask();
4855 auto ShiftAmt =
B.buildConstant(
S32, Shift);
4856 AndMaskSrc =
B.buildLShr(
S32, LiveIn, ShiftAmt).getReg(0);
4859 B.buildAnd(DstReg, AndMaskSrc,
B.buildConstant(
S32, Mask >> Shift));
4861 B.buildCopy(DstReg, LiveIn);
4871 if (!ST.hasClusters()) {
4874 MI.eraseFromParent();
4894 auto One =
B.buildConstant(
S32, 1);
4895 auto ClusterSizeXYZ =
B.buildAdd(
S32, ClusterMaxIdXYZ, One);
4896 auto GlobalIdXYZ =
B.buildAdd(
S32, ClusterWorkGroupIdXYZ,
4897 B.buildMul(
S32, ClusterIdXYZ, ClusterSizeXYZ));
4904 B.buildCopy(DstReg, GlobalIdXYZ);
4905 MI.eraseFromParent();
4909 B.buildCopy(DstReg, ClusterIdXYZ);
4910 MI.eraseFromParent();
4915 unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
4917 MRI.
setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
4918 B.buildInstr(AMDGPU::S_GETREG_B32_const)
4920 .addImm(ClusterIdField);
4921 auto Zero =
B.buildConstant(
S32, 0);
4924 B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
4925 MI.eraseFromParent();
4967 auto LoadConstant = [&](
unsigned N) {
4968 B.buildConstant(DstReg,
N);
4972 if (ST.hasArchitectedSGPRs() &&
4979 Arg = &WorkGroupIDX;
4980 ArgRC = &AMDGPU::SReg_32RegClass;
4984 Arg = &WorkGroupIDY;
4985 ArgRC = &AMDGPU::SReg_32RegClass;
4989 Arg = &WorkGroupIDZ;
4990 ArgRC = &AMDGPU::SReg_32RegClass;
4994 if (HasFixedDims && ClusterDims.
getDims()[0] == 1)
4995 return LoadConstant(0);
4996 Arg = &ClusterWorkGroupIDX;
4997 ArgRC = &AMDGPU::SReg_32RegClass;
5001 if (HasFixedDims && ClusterDims.
getDims()[1] == 1)
5002 return LoadConstant(0);
5003 Arg = &ClusterWorkGroupIDY;
5004 ArgRC = &AMDGPU::SReg_32RegClass;
5008 if (HasFixedDims && ClusterDims.
getDims()[2] == 1)
5009 return LoadConstant(0);
5010 Arg = &ClusterWorkGroupIDZ;
5011 ArgRC = &AMDGPU::SReg_32RegClass;
5016 return LoadConstant(ClusterDims.
getDims()[0] - 1);
5017 Arg = &ClusterWorkGroupMaxIDX;
5018 ArgRC = &AMDGPU::SReg_32RegClass;
5023 return LoadConstant(ClusterDims.
getDims()[1] - 1);
5024 Arg = &ClusterWorkGroupMaxIDY;
5025 ArgRC = &AMDGPU::SReg_32RegClass;
5030 return LoadConstant(ClusterDims.
getDims()[2] - 1);
5031 Arg = &ClusterWorkGroupMaxIDZ;
5032 ArgRC = &AMDGPU::SReg_32RegClass;
5036 Arg = &ClusterWorkGroupMaxFlatID;
5037 ArgRC = &AMDGPU::SReg_32RegClass;
5052 return LoadConstant(0);
5057 B.buildUndef(DstReg);
5061 if (!Arg->isRegister() || !Arg->getRegister().isValid())
5073 MI.eraseFromParent();
5079 B.buildConstant(
MI.getOperand(0).getReg(),
C);
5080 MI.eraseFromParent();
5087 unsigned MaxID = ST.getMaxWorkitemID(
B.getMF().getFunction(), Dim);
5101 B.buildUndef(DstReg);
5102 MI.eraseFromParent();
5106 if (Arg->isMasked()) {
5120 MI.eraseFromParent();
5135 Register KernArgReg =
B.getMRI()->createGenericVirtualRegister(PtrTy);
5144 return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);
5152 Align Alignment)
const {
5156 "unexpected kernarg parameter type");
5163 MI.eraseFromParent();
5198 auto FloatY =
B.buildUITOFP(
S32,
Y);
5199 auto RcpIFlag =
B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {
S32}, {FloatY});
5201 auto ScaledY =
B.buildFMul(
S32, RcpIFlag, Scale);
5202 auto Z =
B.buildFPTOUI(
S32, ScaledY);
5205 auto NegY =
B.buildSub(
S32,
B.buildConstant(
S32, 0),
Y);
5206 auto NegYZ =
B.buildMul(
S32, NegY, Z);
5207 Z =
B.buildAdd(
S32, Z,
B.buildUMulH(
S32, Z, NegYZ));
5210 auto Q =
B.buildUMulH(
S32,
X, Z);
5211 auto R =
B.buildSub(
S32,
X,
B.buildMul(
S32, Q,
Y));
5214 auto One =
B.buildConstant(
S32, 1);
5217 Q =
B.buildSelect(
S32,
Cond,
B.buildAdd(
S32, Q, One), Q);
5223 B.buildSelect(DstDivReg,
Cond,
B.buildAdd(
S32, Q, One), Q);
5226 B.buildSelect(DstRemReg,
Cond,
B.buildSub(
S32, R,
Y), R);
5245 auto Unmerge =
B.buildUnmerge(
S32, Val);
5247 auto CvtLo =
B.buildUITOFP(
S32, Unmerge.getReg(0));
5248 auto CvtHi =
B.buildUITOFP(
S32, Unmerge.getReg(1));
5250 auto Mad =
B.buildFMAD(
5254 auto Rcp =
B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {
S32}, {Mad});
5255 auto Mul1 =
B.buildFMul(
5259 auto Mul2 =
B.buildFMul(
5261 auto Trunc =
B.buildIntrinsicTrunc(
S32, Mul2);
5264 auto Mad2 =
B.buildFMAD(
5268 auto ResultLo =
B.buildFPTOUI(
S32, Mad2);
5269 auto ResultHi =
B.buildFPTOUI(
S32, Trunc);
5271 return {ResultLo.getReg(0), ResultHi.getReg(0)};
5286 auto Rcp =
B.buildMergeLikeInstr(
S64, {RcpLo, RcpHi});
5288 auto Zero64 =
B.buildConstant(
S64, 0);
5289 auto NegDenom =
B.buildSub(
S64, Zero64, Denom);
5291 auto MulLo1 =
B.buildMul(
S64, NegDenom, Rcp);
5292 auto MulHi1 =
B.buildUMulH(
S64, Rcp, MulLo1);
5294 auto UnmergeMulHi1 =
B.buildUnmerge(
S32, MulHi1);
5295 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
5296 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
5298 auto Add1_Lo =
B.buildUAddo(
S32,
S1, RcpLo, MulHi1_Lo);
5299 auto Add1_Hi =
B.buildUAdde(
S32,
S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
5300 auto Add1 =
B.buildMergeLikeInstr(
S64, {Add1_Lo, Add1_Hi});
5302 auto MulLo2 =
B.buildMul(
S64, NegDenom, Add1);
5303 auto MulHi2 =
B.buildUMulH(
S64, Add1, MulLo2);
5304 auto UnmergeMulHi2 =
B.buildUnmerge(
S32, MulHi2);
5305 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
5306 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
5308 auto Zero32 =
B.buildConstant(
S32, 0);
5309 auto Add2_Lo =
B.buildUAddo(
S32,
S1, Add1_Lo, MulHi2_Lo);
5310 auto Add2_Hi =
B.buildUAdde(
S32,
S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
5311 auto Add2 =
B.buildMergeLikeInstr(
S64, {Add2_Lo, Add2_Hi});
5313 auto UnmergeNumer =
B.buildUnmerge(
S32, Numer);
5314 Register NumerLo = UnmergeNumer.getReg(0);
5315 Register NumerHi = UnmergeNumer.getReg(1);
5317 auto MulHi3 =
B.buildUMulH(
S64, Numer, Add2);
5318 auto Mul3 =
B.buildMul(
S64, Denom, MulHi3);
5319 auto UnmergeMul3 =
B.buildUnmerge(
S32, Mul3);
5320 Register Mul3_Lo = UnmergeMul3.getReg(0);
5321 Register Mul3_Hi = UnmergeMul3.getReg(1);
5322 auto Sub1_Lo =
B.buildUSubo(
S32,
S1, NumerLo, Mul3_Lo);
5323 auto Sub1_Hi =
B.buildUSube(
S32,
S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
5324 auto Sub1_Mi =
B.buildSub(
S32, NumerHi, Mul3_Hi);
5325 auto Sub1 =
B.buildMergeLikeInstr(
S64, {Sub1_Lo, Sub1_Hi});
5327 auto UnmergeDenom =
B.buildUnmerge(
S32, Denom);
5328 Register DenomLo = UnmergeDenom.getReg(0);
5329 Register DenomHi = UnmergeDenom.getReg(1);
5332 auto C1 =
B.buildSExt(
S32, CmpHi);
5335 auto C2 =
B.buildSExt(
S32, CmpLo);
5338 auto C3 =
B.buildSelect(
S32, CmpEq, C2, C1);
5345 auto Sub2_Lo =
B.buildUSubo(
S32,
S1, Sub1_Lo, DenomLo);
5346 auto Sub2_Mi =
B.buildUSube(
S32,
S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
5347 auto Sub2_Hi =
B.buildUSube(
S32,
S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
5348 auto Sub2 =
B.buildMergeLikeInstr(
S64, {Sub2_Lo, Sub2_Hi});
5350 auto One64 =
B.buildConstant(
S64, 1);
5351 auto Add3 =
B.buildAdd(
S64, MulHi3, One64);
5357 auto C6 =
B.buildSelect(
5361 auto Add4 =
B.buildAdd(
S64, Add3, One64);
5362 auto Sub3_Lo =
B.buildUSubo(
S32,
S1, Sub2_Lo, DenomLo);
5364 auto Sub3_Mi =
B.buildUSube(
S32,
S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
5365 auto Sub3_Hi =
B.buildUSube(
S32,
S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
5366 auto Sub3 =
B.buildMergeLikeInstr(
S64, {Sub3_Lo, Sub3_Hi});
5372 auto Sel1 =
B.buildSelect(
5379 auto Sel2 =
B.buildSelect(
5390 switch (
MI.getOpcode()) {
5393 case AMDGPU::G_UDIV: {
5394 DstDivReg =
MI.getOperand(0).getReg();
5397 case AMDGPU::G_UREM: {
5398 DstRemReg =
MI.getOperand(0).getReg();
5401 case AMDGPU::G_UDIVREM: {
5402 DstDivReg =
MI.getOperand(0).getReg();
5403 DstRemReg =
MI.getOperand(1).getReg();
5410 const unsigned FirstSrcOpIdx =
MI.getNumExplicitDefs();
5411 Register Num =
MI.getOperand(FirstSrcOpIdx).getReg();
5412 Register Den =
MI.getOperand(FirstSrcOpIdx + 1).getReg();
5422 MI.eraseFromParent();
5433 if (Ty !=
S32 && Ty !=
S64)
5436 const unsigned FirstSrcOpIdx =
MI.getNumExplicitDefs();
5437 Register LHS =
MI.getOperand(FirstSrcOpIdx).getReg();
5438 Register RHS =
MI.getOperand(FirstSrcOpIdx + 1).getReg();
5440 auto SignBitOffset =
B.buildConstant(
S32, Ty.getSizeInBits() - 1);
5441 auto LHSign =
B.buildAShr(Ty, LHS, SignBitOffset);
5442 auto RHSign =
B.buildAShr(Ty, RHS, SignBitOffset);
5444 LHS =
B.buildAdd(Ty, LHS, LHSign).getReg(0);
5445 RHS =
B.buildAdd(Ty, RHS, RHSign).getReg(0);
5447 LHS =
B.buildXor(Ty, LHS, LHSign).getReg(0);
5448 RHS =
B.buildXor(Ty, RHS, RHSign).getReg(0);
5450 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
5451 switch (
MI.getOpcode()) {
5454 case AMDGPU::G_SDIV: {
5455 DstDivReg =
MI.getOperand(0).getReg();
5459 case AMDGPU::G_SREM: {
5460 DstRemReg =
MI.getOperand(0).getReg();
5464 case AMDGPU::G_SDIVREM: {
5465 DstDivReg =
MI.getOperand(0).getReg();
5466 DstRemReg =
MI.getOperand(1).getReg();
5479 auto Sign =
B.buildXor(Ty, LHSign, RHSign).getReg(0);
5480 auto SignXor =
B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
5481 B.buildSub(DstDivReg, SignXor, Sign);
5485 auto Sign = LHSign.getReg(0);
5486 auto SignXor =
B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
5487 B.buildSub(DstRemReg, SignXor, Sign);
5490 MI.eraseFromParent();
5506 if (!AllowInaccurateRcp && ResTy !=
LLT::scalar(16))
5517 if (CLHS->isExactlyValue(1.0)) {
5518 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5522 MI.eraseFromParent();
5527 if (CLHS->isExactlyValue(-1.0)) {
5528 auto FNeg =
B.buildFNeg(ResTy, RHS, Flags);
5529 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5530 .addUse(FNeg.getReg(0))
5533 MI.eraseFromParent();
5540 if (!AllowInaccurateRcp && (ResTy !=
LLT::scalar(16) ||
5545 auto RCP =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5548 B.buildFMul(Res, LHS, RCP, Flags);
5550 MI.eraseFromParent();
5565 if (!AllowInaccurateRcp)
5573 X =
B.buildFConstant(ResTy, 1.0).getReg(0);
5575 Register NegY = IsNegRcp ?
Y :
B.buildFNeg(ResTy,
Y).getReg(0);
5576 auto One =
B.buildFConstant(ResTy, 1.0);
5578 auto R =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5582 R =
B.buildFNeg(ResTy, R);
5584 auto Tmp0 =
B.buildFMA(ResTy, NegY, R, One);
5585 R =
B.buildFMA(ResTy, Tmp0, R, R);
5587 auto Tmp1 =
B.buildFMA(ResTy, NegY, R, One);
5588 R =
B.buildFMA(ResTy, Tmp1, R, R);
5592 B.buildCopy(Res, R);
5593 MI.eraseFromParent();
5597 auto Ret =
B.buildFMul(ResTy,
X, R);
5598 auto Tmp2 =
B.buildFMA(ResTy, NegY, Ret,
X);
5600 B.buildFMA(Res, Tmp2, R, Ret);
5601 MI.eraseFromParent();
5633 auto LHSExt =
B.buildFPExt(
S32, LHS, Flags);
5634 auto RHSExt =
B.buildFPExt(
S32, RHS, Flags);
5635 auto NegRHSExt =
B.buildFNeg(
S32, RHSExt);
5636 auto Rcp =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {
S32})
5637 .addUse(RHSExt.getReg(0))
5639 auto Quot =
B.buildFMul(
S32, LHSExt, Rcp, Flags);
5641 if (ST.hasMadMacF32Insts()) {
5642 Err =
B.buildFMAD(
S32, NegRHSExt, Quot, LHSExt, Flags);
5643 Quot =
B.buildFMAD(
S32, Err, Rcp, Quot, Flags);
5644 Err =
B.buildFMAD(
S32, NegRHSExt, Quot, LHSExt, Flags);
5646 Err =
B.buildFMA(
S32, NegRHSExt, Quot, LHSExt, Flags);
5647 Quot =
B.buildFMA(
S32, Err, Rcp, Quot, Flags);
5648 Err =
B.buildFMA(
S32, NegRHSExt, Quot, LHSExt, Flags);
5650 auto Tmp =
B.buildFMul(
S32, Err, Rcp, Flags);
5651 Tmp =
B.buildAnd(
S32, Tmp,
B.buildConstant(
S32, 0xff800000));
5652 Quot =
B.buildFAdd(
S32, Tmp, Quot, Flags);
5653 auto RDst =
B.buildFPTrunc(
S16, Quot, Flags);
5654 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5655 .addUse(RDst.getReg(0))
5660 MI.eraseFromParent();
5673 unsigned SPDenormMode =
5676 if (ST.hasDenormModeInst()) {
5678 uint32_t DPDenormModeDefault =
Mode.fpDenormModeDPValue();
5680 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
5681 B.buildInstr(AMDGPU::S_DENORM_MODE)
5682 .addImm(NewDenormModeValue);
5685 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
5686 .addImm(SPDenormMode)
5708 auto One =
B.buildFConstant(
S32, 1.0f);
5710 auto DenominatorScaled =
5711 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {
S32,
S1})
5716 auto NumeratorScaled =
5717 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {
S32,
S1})
5723 auto ApproxRcp =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {
S32})
5724 .addUse(DenominatorScaled.getReg(0))
5726 auto NegDivScale0 =
B.buildFNeg(
S32, DenominatorScaled, Flags);
5729 const bool HasDynamicDenormals =
5734 if (!PreservesDenormals) {
5735 if (HasDynamicDenormals) {
5737 B.buildInstr(AMDGPU::S_GETREG_B32)
5738 .addDef(SavedSPDenormMode)
5744 auto Fma0 =
B.buildFMA(
S32, NegDivScale0, ApproxRcp, One, Flags);
5745 auto Fma1 =
B.buildFMA(
S32, Fma0, ApproxRcp, ApproxRcp, Flags);
5746 auto Mul =
B.buildFMul(
S32, NumeratorScaled, Fma1, Flags);
5747 auto Fma2 =
B.buildFMA(
S32, NegDivScale0,
Mul, NumeratorScaled, Flags);
5748 auto Fma3 =
B.buildFMA(
S32, Fma2, Fma1,
Mul, Flags);
5749 auto Fma4 =
B.buildFMA(
S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
5751 if (!PreservesDenormals) {
5752 if (HasDynamicDenormals) {
5753 assert(SavedSPDenormMode);
5754 B.buildInstr(AMDGPU::S_SETREG_B32)
5755 .addReg(SavedSPDenormMode)
5761 auto Fmas =
B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {
S32})
5762 .addUse(Fma4.getReg(0))
5763 .addUse(Fma1.getReg(0))
5764 .addUse(Fma3.getReg(0))
5765 .addUse(NumeratorScaled.getReg(1))
5768 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5769 .addUse(Fmas.getReg(0))
5774 MI.eraseFromParent();
5793 auto One =
B.buildFConstant(
S64, 1.0);
5795 auto DivScale0 =
B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {
S64,
S1})
5801 auto NegDivScale0 =
B.buildFNeg(
S64, DivScale0.getReg(0), Flags);
5803 auto Rcp =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {
S64})
5804 .addUse(DivScale0.getReg(0))
5807 auto Fma0 =
B.buildFMA(
S64, NegDivScale0, Rcp, One, Flags);
5808 auto Fma1 =
B.buildFMA(
S64, Rcp, Fma0, Rcp, Flags);
5809 auto Fma2 =
B.buildFMA(
S64, NegDivScale0, Fma1, One, Flags);
5811 auto DivScale1 =
B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {
S64,
S1})
5817 auto Fma3 =
B.buildFMA(
S64, Fma1, Fma2, Fma1, Flags);
5818 auto Mul =
B.buildFMul(
S64, DivScale1.getReg(0), Fma3, Flags);
5819 auto Fma4 =
B.buildFMA(
S64, NegDivScale0,
Mul, DivScale1.getReg(0), Flags);
5822 if (!ST.hasUsableDivScaleConditionOutput()) {
5828 auto NumUnmerge =
B.buildUnmerge(
S32, LHS);
5829 auto DenUnmerge =
B.buildUnmerge(
S32, RHS);
5830 auto Scale0Unmerge =
B.buildUnmerge(
S32, DivScale0);
5831 auto Scale1Unmerge =
B.buildUnmerge(
S32, DivScale1);
5834 Scale1Unmerge.getReg(1));
5836 Scale0Unmerge.getReg(1));
5837 Scale =
B.buildXor(
S1, CmpNum, CmpDen).getReg(0);
5839 Scale = DivScale1.getReg(1);
5842 auto Fmas =
B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {
S64})
5843 .addUse(Fma4.getReg(0))
5844 .addUse(Fma3.getReg(0))
5845 .addUse(
Mul.getReg(0))
5849 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup,
ArrayRef(Res))
5850 .addUse(Fmas.getReg(0))
5855 MI.eraseFromParent();
5870 auto Mant =
B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5873 auto Exp =
B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5877 if (ST.hasFractBug()) {
5878 auto Fabs =
B.buildFAbs(Ty, Val);
5882 auto Zero =
B.buildConstant(InstrExpTy, 0);
5883 Exp =
B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5884 Mant =
B.buildSelect(Ty, IsFinite, Mant, Val);
5887 B.buildCopy(Res0, Mant);
5888 B.buildSExtOrTrunc(Res1, Exp);
5890 MI.eraseFromParent();
5905 auto Abs =
B.buildFAbs(
S32, RHS, Flags);
5908 auto C0 =
B.buildFConstant(
S32, 0x1p+96f);
5909 auto C1 =
B.buildFConstant(
S32, 0x1p-32f);
5910 auto C2 =
B.buildFConstant(
S32, 1.0f);
5913 auto Sel =
B.buildSelect(
S32, CmpRes, C1, C2, Flags);
5915 auto Mul0 =
B.buildFMul(
S32, RHS, Sel, Flags);
5917 auto RCP =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {
S32})
5918 .addUse(Mul0.getReg(0))
5921 auto Mul1 =
B.buildFMul(
S32, LHS, RCP, Flags);
5923 B.buildFMul(Res, Sel, Mul1, Flags);
5925 MI.eraseFromParent();
5934 unsigned Flags =
MI.getFlags();
5935 assert(!ST.has16BitInsts());
5937 auto Ext =
B.buildFPExt(
F32,
MI.getOperand(1), Flags);
5938 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {
F32})
5939 .addUse(Ext.getReg(0))
5941 B.buildFPTrunc(
MI.getOperand(0),
Log2, Flags);
5942 MI.eraseFromParent();
5952 const unsigned Flags =
MI.getFlags();
5961 MI.eraseFromParent();
5965 auto ScaleThreshold =
B.buildFConstant(
F32, 0x1.0p-96f);
5967 auto ScaleUpFactor =
B.buildFConstant(
F32, 0x1.0p+32f);
5968 auto ScaledX =
B.buildFMul(
F32,
X, ScaleUpFactor, Flags);
5969 auto SqrtX =
B.buildSelect(
F32, NeedScale, ScaledX,
X, Flags);
5974 .addUse(SqrtX.getReg(0))
5977 auto NegOne =
B.buildConstant(I32, -1);
5978 auto SqrtSNextDown =
B.buildAdd(I32, SqrtS, NegOne);
5980 auto NegSqrtSNextDown =
B.buildFNeg(
F32, SqrtSNextDown, Flags);
5981 auto SqrtVP =
B.buildFMA(
F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5983 auto PosOne =
B.buildConstant(I32, 1);
5984 auto SqrtSNextUp =
B.buildAdd(I32, SqrtS, PosOne);
5986 auto NegSqrtSNextUp =
B.buildFNeg(
F32, SqrtSNextUp, Flags);
5987 auto SqrtVS =
B.buildFMA(
F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5989 auto Zero =
B.buildFConstant(
F32, 0.0f);
5993 B.buildSelect(
F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5997 B.buildSelect(
F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
6000 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {
F32}).addReg(SqrtX.getReg(0));
6001 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
6003 auto Half =
B.buildFConstant(
F32, 0.5f);
6004 auto SqrtH =
B.buildFMul(
F32, SqrtR, Half, Flags);
6005 auto NegSqrtH =
B.buildFNeg(
F32, SqrtH, Flags);
6006 auto SqrtE =
B.buildFMA(
F32, NegSqrtH, SqrtS, Half, Flags);
6007 SqrtH =
B.buildFMA(
F32, SqrtH, SqrtE, SqrtH, Flags);
6008 SqrtS =
B.buildFMA(
F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
6009 auto NegSqrtS =
B.buildFNeg(
F32, SqrtS, Flags);
6010 auto SqrtD =
B.buildFMA(
F32, NegSqrtS, SqrtS, SqrtX, Flags);
6011 SqrtS =
B.buildFMA(
F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
6014 auto ScaleDownFactor =
B.buildFConstant(
F32, 0x1.0p-16f);
6016 auto ScaledDown =
B.buildFMul(
F32, SqrtS, ScaleDownFactor, Flags);
6018 SqrtS =
B.buildSelect(
F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
6021 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
6023 MI.eraseFromParent();
6058 unsigned Flags =
MI.getFlags();
6063 auto ScaleConstant =
B.buildFConstant(
F64, 0x1.0p-767);
6065 ZeroInt =
B.buildConstant(
S32, 0).getReg(0);
6069 auto ScaleUpFactor =
B.buildConstant(
S32, 256);
6070 auto ScaleUp =
B.buildSelect(
S32, Scaling, ScaleUpFactor, ZeroInt);
6071 SqrtX =
B.buildFLdexp(
F64,
X, ScaleUp, Flags).getReg(0);
6074 auto SqrtY =
B.buildIntrinsic(Intrinsic::amdgcn_rsq, {
F64}).addReg(SqrtX);
6076 auto Half =
B.buildFConstant(
F64, 0.5);
6077 auto SqrtH0 =
B.buildFMul(
F64, SqrtY, Half);
6078 auto SqrtS0 =
B.buildFMul(
F64, SqrtX, SqrtY);
6080 auto NegSqrtH0 =
B.buildFNeg(
F64, SqrtH0);
6081 auto SqrtR0 =
B.buildFMA(
F64, NegSqrtH0, SqrtS0, Half);
6083 auto SqrtS1 =
B.buildFMA(
F64, SqrtS0, SqrtR0, SqrtS0);
6084 auto SqrtH1 =
B.buildFMA(
F64, SqrtH0, SqrtR0, SqrtH0);
6086 auto NegSqrtS1 =
B.buildFNeg(
F64, SqrtS1);
6087 auto SqrtD0 =
B.buildFMA(
F64, NegSqrtS1, SqrtS1, SqrtX);
6089 auto SqrtS2 =
B.buildFMA(
F64, SqrtD0, SqrtH1, SqrtS1);
6091 Register SqrtRet = SqrtS2.getReg(0);
6093 auto NegSqrtS2 =
B.buildFNeg(
F64, SqrtS2);
6094 auto SqrtD1 =
B.buildFMA(
F64, NegSqrtS2, SqrtS2, SqrtX);
6095 auto SqrtD2 =
B.buildFMA(
F64, SqrtD1, SqrtH1, SqrtS2);
6098 auto ScaleDownFactor =
B.buildConstant(
S32, -128);
6099 auto ScaleDown =
B.buildSelect(
S32, Scaling, ScaleDownFactor, ZeroInt);
6100 SqrtRet =
B.buildFLdexp(
F64, SqrtD2, ScaleDown, Flags).getReg(0);
6105 auto ZeroFP =
B.buildFConstant(
F64, 0.0);
6114 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
6116 MI.eraseFromParent();
6147 auto Flags =
MI.getFlags();
6159 auto Rsq =
B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
6169 auto ClampMax = UseIEEE ?
B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
6170 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
6175 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
6177 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
6178 MI.eraseFromParent();
6190 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6191 IID == Intrinsic::amdgcn_permlanex16;
6192 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6193 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6194 bool IsPermlaneShuffle = IID == Intrinsic::amdgcn_permlane_bcast ||
6195 IID == Intrinsic::amdgcn_permlane_up ||
6196 IID == Intrinsic::amdgcn_permlane_down ||
6197 IID == Intrinsic::amdgcn_permlane_xor;
6201 auto LaneOp =
B.buildIntrinsic(IID, {VT}).addUse(Src0);
6203 case Intrinsic::amdgcn_readfirstlane:
6204 case Intrinsic::amdgcn_permlane64:
6205 return LaneOp.getReg(0);
6206 case Intrinsic::amdgcn_readlane:
6207 case Intrinsic::amdgcn_set_inactive:
6208 case Intrinsic::amdgcn_set_inactive_chain_arg:
6209 return LaneOp.addUse(Src1).getReg(0);
6210 case Intrinsic::amdgcn_writelane:
6211 case Intrinsic::amdgcn_permlane_bcast:
6212 case Intrinsic::amdgcn_permlane_up:
6213 case Intrinsic::amdgcn_permlane_down:
6214 case Intrinsic::amdgcn_permlane_xor:
6215 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
6216 case Intrinsic::amdgcn_permlane16:
6217 case Intrinsic::amdgcn_permlanex16: {
6219 int64_t Src4 =
MI.getOperand(6).getImm();
6220 int64_t Src5 =
MI.getOperand(7).getImm();
6221 return LaneOp.addUse(Src1)
6228 case Intrinsic::amdgcn_mov_dpp8:
6229 return LaneOp.addImm(
MI.getOperand(3).getImm()).getReg(0);
6230 case Intrinsic::amdgcn_update_dpp:
6231 return LaneOp.addUse(Src1)
6232 .addImm(
MI.getOperand(4).getImm())
6233 .addImm(
MI.getOperand(5).getImm())
6234 .addImm(
MI.getOperand(6).getImm())
6235 .addImm(
MI.getOperand(7).getImm())
6245 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6246 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16 ||
6247 IsPermlaneShuffle) {
6248 Src1 =
MI.getOperand(3).getReg();
6249 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16 ||
6250 IsPermlaneShuffle) {
6251 Src2 =
MI.getOperand(4).getReg();
6256 unsigned Size = Ty.getSizeInBits();
6258 unsigned SplitSize = 32;
6259 if (IID == Intrinsic::amdgcn_update_dpp && (
Size % 64 == 0) &&
6260 ST.hasDPALU_DPP() &&
6264 if (
Size == SplitSize) {
6270 Src0 =
B.buildAnyExt(
S32, Src0).getReg(0);
6272 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6275 if (IID == Intrinsic::amdgcn_writelane)
6278 Register LaneOpDst = createLaneOp(Src0, Src1, Src2,
S32);
6279 B.buildTrunc(DstReg, LaneOpDst);
6280 MI.eraseFromParent();
6284 if (
Size % SplitSize != 0)
6288 bool NeedsBitcast =
false;
6289 if (Ty.isVector()) {
6292 if (EltSize == SplitSize) {
6293 PartialResTy = EltTy;
6294 }
else if (EltSize == 16 || EltSize == 32) {
6295 unsigned NElem = SplitSize / EltSize;
6299 NeedsBitcast =
true;
6304 unsigned NumParts =
Size / SplitSize;
6308 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6309 Src1Parts =
B.buildUnmerge(PartialResTy, Src1);
6311 if (IID == Intrinsic::amdgcn_writelane)
6312 Src2Parts =
B.buildUnmerge(PartialResTy, Src2);
6314 for (
unsigned i = 0; i < NumParts; ++i) {
6315 Src0 = Src0Parts.
getReg(i);
6317 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6318 Src1 = Src1Parts.
getReg(i);
6320 if (IID == Intrinsic::amdgcn_writelane)
6321 Src2 = Src2Parts.
getReg(i);
6323 PartialRes.
push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
6327 B.buildBitcast(DstReg,
B.buildMergeLikeInstr(
6330 B.buildMergeLikeInstr(DstReg, PartialRes);
6332 MI.eraseFromParent();
6340 ST.getTargetLowering()->getImplicitParameterOffset(
6350 B.buildObjectPtrOffset(DstReg, KernargPtrReg,
6351 B.buildConstant(IdxTy,
Offset).getReg(0));
6362 Register Pointer =
MI.getOperand(2).getReg();
6364 Register NumRecords =
MI.getOperand(4).getReg();
6370 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6372 auto ExtStride =
B.buildAnyExt(
S32, Stride);
6374 if (ST.has45BitNumRecordsBufferResource()) {
6379 auto PointerInt =
B.buildPtrToInt(PtrIntTy, Pointer);
6380 auto ExtPointer =
B.buildAnyExtOrTrunc(
S64, PointerInt);
6381 auto NumRecordsLHS =
B.buildShl(
S64, NumRecords,
B.buildConstant(
S32, 57));
6382 Register LowHalf =
B.buildOr(
S64, ExtPointer, NumRecordsLHS).getReg(0);
6386 auto NumRecordsRHS =
B.buildLShr(
S64, NumRecords,
B.buildConstant(
S32, 7));
6387 auto ShiftedStride =
B.buildShl(
S32, ExtStride,
B.buildConstant(
S32, 12));
6388 auto ExtShiftedStride =
6389 B.buildMergeValues(
S64, {Zero, ShiftedStride.getReg(0)});
6390 auto ShiftedFlags =
B.buildShl(
S32, Flags,
B.buildConstant(
S32, 28));
6391 auto ExtShiftedFlags =
6392 B.buildMergeValues(
S64, {Zero, ShiftedFlags.getReg(0)});
6393 auto CombinedFields =
B.buildOr(
S64, NumRecordsRHS, ExtShiftedStride);
6395 B.buildOr(
S64, CombinedFields, ExtShiftedFlags).getReg(0);
6396 B.buildMergeValues(Result, {LowHalf, HighHalf});
6398 NumRecords =
B.buildTrunc(
S32, NumRecords).getReg(0);
6399 auto Unmerge =
B.buildUnmerge(
S32, Pointer);
6400 auto LowHalf = Unmerge.getReg(0);
6401 auto HighHalf = Unmerge.getReg(1);
6403 auto AndMask =
B.buildConstant(
S32, 0x0000ffff);
6404 auto Masked =
B.buildAnd(
S32, HighHalf, AndMask);
6405 auto ShiftConst =
B.buildConstant(
S32, 16);
6406 auto ShiftedStride =
B.buildShl(
S32, ExtStride, ShiftConst);
6407 auto NewHighHalf =
B.buildOr(
S32,
Masked, ShiftedStride);
6408 Register NewHighHalfReg = NewHighHalf.getReg(0);
6409 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
6412 MI.eraseFromParent();
6429 MI.eraseFromParent();
6437 std::optional<uint32_t> KnownSize =
6439 if (KnownSize.has_value())
6440 B.buildConstant(DstReg, *KnownSize);
6458 MI.eraseFromParent();
6465 unsigned AddrSpace)
const {
6467 auto Unmerge =
B.buildUnmerge(
S32,
MI.getOperand(2).getReg());
6471 ST.hasGloballyAddressableScratch()) {
6473 B.buildInstr(AMDGPU::S_MOV_B32, {
S32},
6474 {
Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
6476 MRI.
setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
6478 Register XOR =
B.buildXor(
S32, Hi32, FlatScratchBaseHi).getReg(0);
6480 B.buildConstant(
S32, 1u << 26));
6485 MI.eraseFromParent();
6495std::pair<Register, unsigned>
6507 bool CheckNUW = ST.hasGFX1250Insts();
6509 MRI, OrigOffset,
nullptr, CheckNUW);
6513 BaseReg =
B.buildPtrToInt(MRI.
getType(OrigOffset), BaseReg).getReg(0);
6523 unsigned Overflow = ImmOffset & ~MaxImm;
6524 ImmOffset -= Overflow;
6525 if ((int32_t)Overflow < 0) {
6526 Overflow += ImmOffset;
6530 if (Overflow != 0) {
6532 BaseReg =
B.buildConstant(
S32, Overflow).getReg(0);
6534 auto OverflowVal =
B.buildConstant(
S32, Overflow);
6535 BaseReg =
B.buildAdd(
S32, BaseReg, OverflowVal).getReg(0);
6540 BaseReg =
B.buildConstant(
S32, 0).getReg(0);
6542 return std::pair(BaseReg, ImmOffset);
6549 bool ImageStore)
const {
6555 if (ST.hasUnpackedD16VMem()) {
6556 auto Unmerge =
B.buildUnmerge(
S16, Reg);
6559 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
6560 WideRegs.
push_back(
B.buildAnyExt(
S32, Unmerge.getReg(
I)).getReg(0));
6568 if (ImageStore && ST.hasImageStoreD16Bug()) {
6571 Reg =
B.buildBitcast(
S32, Reg).getReg(0);
6573 PackedRegs.
resize(2,
B.buildUndef(
S32).getReg(0));
6580 auto Unmerge =
B.buildUnmerge(
S16, Reg);
6581 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
6583 PackedRegs.
resize(6,
B.buildUndef(
S16).getReg(0));
6591 auto Unmerge =
B.buildUnmerge(
S32, Reg);
6592 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
6594 PackedRegs.
resize(4,
B.buildUndef(
S32).getReg(0));
6611 bool IsFormat)
const {
6623 VData =
B.buildBitcast(Ty, VData).getReg(0);
6631 if (Ty.isVector()) {
6632 if (Ty.getElementType() ==
S16 && Ty.getNumElements() <= 4) {
6644 bool IsFormat)
const {
6651 const bool IsD16 = IsFormat && (EltTy.
getSizeInBits() == 16);
6666 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6669 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps;
6673 VIndex =
MI.getOperand(3).getReg();
6676 VIndex =
B.buildConstant(
S32, 0).getReg(0);
6679 Register VOffset =
MI.getOperand(3 + OpOffset).getReg();
6680 Register SOffset =
MI.getOperand(4 + OpOffset).getReg();
6684 Format =
MI.getOperand(5 + OpOffset).getImm();
6688 unsigned AuxiliaryData =
MI.getOperand(5 + OpOffset).getImm();
6694 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
6695 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
6696 }
else if (IsFormat) {
6697 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
6698 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
6702 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
6705 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
6708 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
6713 auto MIB =
B.buildInstr(
Opc)
6724 MIB.addImm(AuxiliaryData)
6725 .addImm(HasVIndex ? -1 : 0)
6726 .addMemOperand(MMO);
6728 MI.eraseFromParent();
6734 unsigned ImmOffset,
unsigned Format,
6737 auto MIB =
B.buildInstr(
Opc)
6748 MIB.addImm(AuxiliaryData)
6749 .addImm(HasVIndex ? -1 : 0)
6750 .addMemOperand(MMO);
6756 bool IsTyped)
const {
6770 assert(
MI.getNumExplicitDefs() == 1 ||
MI.getNumExplicitDefs() == 2);
6771 bool IsTFE =
MI.getNumExplicitDefs() == 2;
6773 StatusDst =
MI.getOperand(1).getReg();
6778 Register RSrc =
MI.getOperand(2 + OpOffset).getReg();
6781 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6784 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps + OpOffset;
6787 VIndex =
MI.getOperand(3 + OpOffset).getReg();
6790 VIndex =
B.buildConstant(
S32, 0).getReg(0);
6793 Register VOffset =
MI.getOperand(3 + OpOffset).getReg();
6794 Register SOffset =
MI.getOperand(4 + OpOffset).getReg();
6798 Format =
MI.getOperand(5 + OpOffset).getImm();
6802 unsigned AuxiliaryData =
MI.getOperand(5 + OpOffset).getImm();
6812 Dst =
MI.getOperand(0).getReg();
6813 B.setInsertPt(
B.getMBB(),
MI);
6820 Dst =
MI.getOperand(0).getReg();
6821 B.setInsertPt(
B.getMBB(),
MI);
6825 const bool IsD16 = IsFormat && (EltTy.
getSizeInBits() == 16);
6826 const bool Unpacked = ST.hasUnpackedD16VMem();
6836 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6837 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6838 }
else if (IsFormat) {
6842 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6844 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6845 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6850 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6851 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6854 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6855 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6858 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6859 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6865 unsigned NumValueDWords =
divideCeil(Ty.getSizeInBits(), 32);
6866 unsigned NumLoadDWords = NumValueDWords + 1;
6868 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(LoadTy);
6870 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6872 Register ExtDst =
B.getMRI()->createGenericVirtualRegister(
S32);
6873 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6874 B.buildTrunc(Dst, ExtDst);
6875 }
else if (NumValueDWords == 1) {
6876 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6879 for (
unsigned I = 0;
I != NumValueDWords; ++
I)
6880 LoadElts.
push_back(
B.getMRI()->createGenericVirtualRegister(
S32));
6882 B.buildUnmerge(LoadElts, LoadDstReg);
6884 B.buildMergeLikeInstr(Dst, LoadElts);
6887 (IsD16 && !Ty.isVector())) {
6888 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(
S32);
6890 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6891 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6892 B.buildTrunc(Dst, LoadDstReg);
6893 }
else if (Unpacked && IsD16 && Ty.isVector()) {
6895 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6897 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6898 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6900 auto Unmerge =
B.buildUnmerge(
S32, LoadDstReg);
6902 for (
unsigned I = 0,
N = Unmerge->getNumOperands() - 1;
I !=
N; ++
I)
6903 Repack.
push_back(
B.buildTrunc(EltTy, Unmerge.getReg(
I)).getReg(0));
6904 B.buildMergeLikeInstr(Dst, Repack);
6907 AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6910 MI.eraseFromParent();
6916 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6917 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6918 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6919 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6920 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6921 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6922 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6923 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6924 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6925 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6926 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6927 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6928 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6929 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6930 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6931 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6932 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6933 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6934 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6935 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6936 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6937 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6938 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6939 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6940 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6941 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6942 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6943 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6944 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6945 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6946 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6947 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6948 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6949 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6950 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6951 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6952 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6953 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6954 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6955 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6956 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6957 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6958 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6959 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6960 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6961 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6962 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6963 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6964 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6965 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6966 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6967 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6968 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6969 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6970 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6971 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6972 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6973 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6974 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6975 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6976 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6977 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6978 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6979 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6980 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6981 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6982 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6983 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6984 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6985 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6986 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6987 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6988 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6989 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6990 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6991 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6992 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6993 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6994 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6995 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6996 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
6997 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
6998 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
6999 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
7000 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
7001 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
7002 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
7003 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
7004 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
7005 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
7014 const bool IsCmpSwap =
7015 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
7016 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
7017 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
7018 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
7029 CmpVal =
MI.getOperand(3).getReg();
7034 Register RSrc =
MI.getOperand(3 + OpOffset).getReg();
7035 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
7038 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps;
7041 VIndex =
MI.getOperand(4 + OpOffset).getReg();
7044 VIndex =
B.buildConstant(
LLT::scalar(32), 0).getReg(0);
7047 Register VOffset =
MI.getOperand(4 + OpOffset).getReg();
7048 Register SOffset =
MI.getOperand(5 + OpOffset).getReg();
7049 unsigned AuxiliaryData =
MI.getOperand(6 + OpOffset).getImm();
7068 .addImm(AuxiliaryData)
7069 .addImm(HasVIndex ? -1 : 0)
7070 .addMemOperand(MMO);
7072 MI.eraseFromParent();
7082 bool IsA16,
bool IsG16) {
7098 (
B.getMRI()->getType(AddrReg) ==
S16)) {
7103 B.buildBuildVector(
V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7107 "Bias needs to be converted to 16 bit in A16 mode");
7109 AddrReg =
B.buildBitcast(
V2S16, AddrReg).getReg(0);
7115 if (((
I + 1) >= EndIdx) ||
7122 !
MI.getOperand(ArgOffset +
I + 1).isReg()) {
7124 B.buildBuildVector(
V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7129 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
7140 int DimIdx,
int NumVAddrs) {
7144 for (
int I = 0;
I != NumVAddrs; ++
I) {
7146 if (
SrcOp.isReg()) {
7152 int NumAddrRegs = AddrRegs.
size();
7153 if (NumAddrRegs != 1) {
7156 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
7159 for (
int I = 1;
I != NumVAddrs; ++
I) {
7162 MI.getOperand(DimIdx +
I).setReg(AMDGPU::NoRegister);
7184 const unsigned NumDefs =
MI.getNumExplicitDefs();
7185 const unsigned ArgOffset = NumDefs + 1;
7186 bool IsTFE = NumDefs == 2;
7204 VData =
MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
7208 const bool IsAtomicPacked16Bit =
7209 (BaseOpcode->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7210 BaseOpcode->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7218 ST.hasG16() ? (BaseOpcode->
Gradients && GradTy ==
S16) : GradTy ==
S16;
7219 const bool IsA16 = AddrTy ==
S16;
7220 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() ==
S16;
7223 if (!BaseOpcode->
Atomic) {
7224 DMask =
MI.getOperand(ArgOffset + Intr->
DMaskIndex).getImm();
7227 }
else if (DMask != 0) {
7229 }
else if (!IsTFE && !BaseOpcode->
Store) {
7231 B.buildUndef(
MI.getOperand(0));
7232 MI.eraseFromParent();
7240 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
7241 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
7242 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
7243 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
7244 unsigned NewOpcode = LoadOpcode;
7245 if (BaseOpcode->
Store)
7246 NewOpcode = StoreOpcode;
7248 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
7251 MI.setDesc(
B.getTII().get(NewOpcode));
7255 if (IsTFE && DMask == 0) {
7258 MI.getOperand(ArgOffset + Intr->
DMaskIndex).setImm(DMask);
7261 if (BaseOpcode->
Atomic) {
7266 if (Ty.isVector() && !IsAtomicPacked16Bit)
7273 auto Concat =
B.buildBuildVector(PackedTy, {VData0, VData1});
7274 MI.getOperand(2).setReg(
Concat.getReg(0));
7275 MI.getOperand(3).setReg(AMDGPU::NoRegister);
7279 unsigned CorrectedNumVAddrs = Intr->
NumVAddrs;
7282 if (BaseOpcode->
Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
7288 if (IsA16 && !ST.hasA16()) {
7293 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->
Sampler);
7294 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
7296 if (IsA16 || IsG16) {
7304 const bool UseNSA = ST.hasNSAEncoding() &&
7305 PackedRegs.
size() >= ST.getNSAThreshold(MF) &&
7306 (PackedRegs.
size() <= NSAMaxSize || HasPartialNSA);
7307 const bool UsePartialNSA =
7308 UseNSA && HasPartialNSA && PackedRegs.
size() > NSAMaxSize;
7310 if (UsePartialNSA) {
7314 auto Concat =
B.buildConcatVectors(
7315 PackedAddrTy,
ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
7316 PackedRegs[NSAMaxSize - 1] =
Concat.getReg(0);
7317 PackedRegs.
resize(NSAMaxSize);
7318 }
else if (!UseNSA && PackedRegs.
size() > 1) {
7320 auto Concat =
B.buildConcatVectors(PackedAddrTy, PackedRegs);
7321 PackedRegs[0] =
Concat.getReg(0);
7325 const unsigned NumPacked = PackedRegs.
size();
7328 if (!
SrcOp.isReg()) {
7338 SrcOp.setReg(AMDGPU::NoRegister);
7355 const bool UseNSA = ST.hasNSAEncoding() &&
7356 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
7357 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
7358 const bool UsePartialNSA =
7359 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
7361 if (UsePartialNSA) {
7363 ArgOffset + Intr->
VAddrStart + NSAMaxSize - 1,
7365 }
else if (!UseNSA && Intr->
NumVAddrs > 1) {
7380 if (!Ty.isVector() || !IsD16)
7384 if (RepackedReg != VData) {
7385 MI.getOperand(1).setReg(RepackedReg);
7393 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
7396 if (NumElts < DMaskLanes)
7399 if (NumElts > 4 || DMaskLanes > 4)
7409 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
7410 const LLT AdjustedTy =
7426 if (IsD16 && ST.hasUnpackedD16VMem()) {
7433 unsigned RoundedElts = (AdjustedTy.
getSizeInBits() + 31) / 32;
7434 unsigned RoundedSize = 32 * RoundedElts;
7438 RegTy = !IsTFE && EltSize == 16 ?
V2S16 :
S32;
7443 if (!IsTFE && (RoundedTy == Ty || !Ty.
isVector()))
7449 B.setInsertPt(*
MI.getParent(), ++
MI.getIterator());
7453 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
7454 const int ResultNumRegs = LoadResultTy.
getSizeInBits() / 32;
7458 MI.getOperand(0).setReg(NewResultReg);
7466 Dst1Reg =
MI.getOperand(1).getReg();
7471 MI.removeOperand(1);
7475 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
7484 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
7486 if (ResultNumRegs == 1) {
7488 ResultRegs[0] = NewResultReg;
7491 for (
int I = 0;
I != NumDataRegs; ++
I)
7493 B.buildUnmerge(ResultRegs, NewResultReg);
7498 ResultRegs.
resize(NumDataRegs);
7503 if (IsD16 && !Ty.isVector()) {
7504 B.buildTrunc(DstReg, ResultRegs[0]);
7509 if (Ty ==
V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
7510 B.buildBitcast(DstReg, ResultRegs[0]);
7522 if (RegTy !=
V2S16 && !ST.hasUnpackedD16VMem()) {
7524 Reg =
B.buildBitcast(
V2S16, Reg).getReg(0);
7525 }
else if (ST.hasUnpackedD16VMem()) {
7527 Reg =
B.buildTrunc(
S16, Reg).getReg(0);
7531 auto padWithUndef = [&](
LLT Ty,
int NumElts) {
7535 for (
int I = 0;
I != NumElts; ++
I)
7542 padWithUndef(ResTy, NumElts - ResultRegs.
size());
7543 B.buildBuildVector(DstReg, ResultRegs);
7547 assert(!ST.hasUnpackedD16VMem() && ResTy ==
V2S16);
7548 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
7554 if (ResultRegs.
size() == 1) {
7555 NewResultReg = ResultRegs[0];
7556 }
else if (ResultRegs.
size() == 2) {
7558 NewResultReg =
B.buildConcatVectors(
V4S16, ResultRegs).getReg(0);
7566 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
7568 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
7573 padWithUndef(ResTy, RegsToCover - ResultRegs.
size());
7574 B.buildConcatVectors(DstReg, ResultRegs);
7583 Register OrigDst =
MI.getOperand(0).getReg();
7585 LLT Ty =
B.getMRI()->getType(OrigDst);
7586 unsigned Size = Ty.getSizeInBits();
7589 if (
Size < 32 && ST.hasScalarSubwordLoads()) {
7591 Opc =
Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
7592 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
7595 Dst =
B.getMRI()->createGenericVirtualRegister(
LLT::scalar(32));
7597 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
7606 B.setInsertPt(
B.getMBB(),
MI);
7611 B.setInsertPt(
B.getMBB(),
MI);
7617 MI.setDesc(
B.getTII().get(
Opc));
7618 MI.removeOperand(1);
7621 const unsigned MemSize = (
Size + 7) / 8;
7622 const Align MemAlign =
B.getDataLayout().getABITypeAlign(
7629 MI.addMemOperand(MF, MMO);
7630 if (Dst != OrigDst) {
7631 MI.getOperand(0).setReg(Dst);
7632 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
7633 B.buildTrunc(OrigDst, Dst);
7655 MI.setDesc(
B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7656 MI.removeOperand(0);
7666 if (!ST.hasTrapHandler() ||
7670 return ST.supportsGetDoorbellID() ?
7683 MI.eraseFromParent();
7693 BuildMI(*TrapBB, TrapBB->
end(),
DL,
B.getTII().get(AMDGPU::S_ENDPGM))
7695 BuildMI(BB, &
MI,
DL,
B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
7699 MI.eraseFromParent();
7708 Register SGPR01(AMDGPU::SGPR0_SGPR1);
7715 ST.getTargetLowering()->getImplicitParameterOffset(
B.getMF(), Param);
7735 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
7738 Register Temp =
B.buildLoad(
S64, LoadAddr, *MMO).getReg(0);
7739 B.buildCopy(SGPR01, Temp);
7740 B.buildInstr(AMDGPU::S_TRAP)
7743 MI.eraseFromParent();
7754 B.buildCopy(SGPR01, LiveIn);
7755 B.buildInstr(AMDGPU::S_TRAP)
7759 MI.eraseFromParent();
7768 if (ST.hasPrivEnabledTrap2NopBug()) {
7769 ST.getInstrInfo()->insertSimulatedTrap(MRI,
B.getMBB(),
MI,
7771 MI.eraseFromParent();
7775 B.buildInstr(AMDGPU::S_TRAP)
7777 MI.eraseFromParent();
7786 if (!ST.hasTrapHandler() ||
7790 Fn,
"debugtrap handler not supported",
MI.getDebugLoc(),
DS_Warning));
7793 B.buildInstr(AMDGPU::S_TRAP)
7797 MI.eraseFromParent();
7810 Register NodePtr =
MI.getOperand(2).getReg();
7811 Register RayExtent =
MI.getOperand(3).getReg();
7812 Register RayOrigin =
MI.getOperand(4).getReg();
7814 Register RayInvDir =
MI.getOperand(6).getReg();
7817 if (!ST.hasGFX10_AEncoding()) {
7820 Fn,
"intrinsic not supported on subtarget",
MI.getDebugLoc()));
7829 const unsigned NumVDataDwords = 4;
7830 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7831 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7833 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7835 const unsigned BaseOpcodes[2][2] = {
7836 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7837 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7838 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7842 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7843 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7844 : AMDGPU::MIMGEncGfx10NSA,
7845 NumVDataDwords, NumVAddrDwords);
7849 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7850 : AMDGPU::MIMGEncGfx10Default,
7851 NumVDataDwords, NumVAddrDwords);
7856 if (UseNSA && IsGFX11Plus) {
7858 auto Unmerge =
B.buildUnmerge({
S32,
S32,
S32}, Src);
7859 auto Merged =
B.buildMergeLikeInstr(
7860 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7861 Ops.push_back(Merged.getReg(0));
7864 Ops.push_back(NodePtr);
7865 Ops.push_back(RayExtent);
7866 packLanes(RayOrigin);
7869 auto UnmergeRayDir =
B.buildUnmerge({
S16,
S16,
S16}, RayDir);
7870 auto UnmergeRayInvDir =
B.buildUnmerge({
S16,
S16,
S16}, RayInvDir);
7871 auto MergedDir =
B.buildMergeLikeInstr(
7874 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(0),
7875 UnmergeRayDir.getReg(0)}))
7878 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(1),
7879 UnmergeRayDir.getReg(1)}))
7882 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(2),
7883 UnmergeRayDir.getReg(2)}))
7885 Ops.push_back(MergedDir.getReg(0));
7888 packLanes(RayInvDir);
7892 auto Unmerge =
B.buildUnmerge({
S32,
S32}, NodePtr);
7893 Ops.push_back(Unmerge.getReg(0));
7894 Ops.push_back(Unmerge.getReg(1));
7896 Ops.push_back(NodePtr);
7898 Ops.push_back(RayExtent);
7901 auto Unmerge =
B.buildUnmerge({
S32,
S32,
S32}, Src);
7902 Ops.push_back(Unmerge.getReg(0));
7903 Ops.push_back(Unmerge.getReg(1));
7904 Ops.push_back(Unmerge.getReg(2));
7907 packLanes(RayOrigin);
7909 auto UnmergeRayDir =
B.buildUnmerge({
S16,
S16,
S16}, RayDir);
7910 auto UnmergeRayInvDir =
B.buildUnmerge({
S16,
S16,
S16}, RayInvDir);
7914 B.buildMergeLikeInstr(R1,
7915 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7916 B.buildMergeLikeInstr(
7917 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7918 B.buildMergeLikeInstr(
7919 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7925 packLanes(RayInvDir);
7932 Register MergedOps =
B.buildMergeLikeInstr(OpTy,
Ops).getReg(0);
7934 Ops.push_back(MergedOps);
7937 auto MIB =
B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7946 .addImm(IsA16 ? 1 : 0)
7949 MI.eraseFromParent();
7959 Register DstOrigin =
MI.getOperand(1).getReg();
7961 Register NodePtr =
MI.getOperand(4).getReg();
7962 Register RayExtent =
MI.getOperand(5).getReg();
7963 Register InstanceMask =
MI.getOperand(6).getReg();
7964 Register RayOrigin =
MI.getOperand(7).getReg();
7966 Register Offsets =
MI.getOperand(9).getReg();
7967 Register TDescr =
MI.getOperand(10).getReg();
7969 if (!ST.hasBVHDualAndBVH8Insts()) {
7972 Fn,
"intrinsic not supported on subtarget",
MI.getDebugLoc()));
7977 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7978 const unsigned NumVDataDwords = 10;
7979 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7981 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7982 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7983 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
7986 auto RayExtentInstanceMaskVec =
B.buildMergeLikeInstr(
7987 V2S32, {RayExtent,
B.buildAnyExt(
S32, InstanceMask)});
7989 B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7990 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7996 .addUse(RayExtentInstanceMaskVec.getReg(0))
8003 MI.eraseFromParent();
8012 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
8013 MI.eraseFromParent();
8020 if (!ST.hasArchitectedSGPRs())
8024 auto TTMP8 =
B.buildCopy(
S32,
Register(AMDGPU::TTMP8));
8025 auto LSB =
B.buildConstant(
S32, 25);
8026 auto Width =
B.buildConstant(
S32, 5);
8027 B.buildUbfx(DstReg, TTMP8, LSB, Width);
8028 MI.eraseFromParent();
8036 unsigned Width)
const {
8040 MRI.
setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
8041 B.buildInstr(AMDGPU::S_GETREG_B32_const)
8044 MI.eraseFromParent();
8062 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {
S32},
8066 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {
S32},
8069 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
8070 MI.eraseFromParent();
8081 auto Unmerge =
B.buildUnmerge({
S32,
S32},
MI.getOperand(0));
8085 .addReg(Unmerge.getReg(0));
8089 .addReg(Unmerge.getReg(1));
8090 MI.eraseFromParent();
8102 case Intrinsic::amdgcn_icmp: {
8113 if (!Src1Const || Src1Const->Value != 0)
8117 int64_t Pred =
MI.getOperand(4).getImm();
8123 B.buildIntrinsic(Intrinsic::amdgcn_ballot, Dst).addUse(Src0);
8124 MI.eraseFromParent();
8127 case Intrinsic::sponentry:
8133 B.buildInstr(AMDGPU::G_AMDGPU_SPONENTRY).addDef(TmpReg);
8136 B.buildIntToPtr(DstReg, TmpReg);
8137 MI.eraseFromParent();
8139 int FI =
B.getMF().getFrameInfo().CreateFixedObject(
8141 B.buildFrameIndex(
MI.getOperand(0), FI);
8142 MI.eraseFromParent();
8145 case Intrinsic::amdgcn_if:
8146 case Intrinsic::amdgcn_else: {
8149 bool Negated =
false;
8161 std::swap(CondBrTarget, UncondBrTarget);
8163 B.setInsertPt(
B.getMBB(), BrCond->getIterator());
8164 if (IntrID == Intrinsic::amdgcn_if) {
8165 B.buildInstr(AMDGPU::SI_IF)
8168 .addMBB(UncondBrTarget);
8170 B.buildInstr(AMDGPU::SI_ELSE)
8173 .addMBB(UncondBrTarget);
8182 B.buildBr(*CondBrTarget);
8187 MI.eraseFromParent();
8188 BrCond->eraseFromParent();
8194 case Intrinsic::amdgcn_loop: {
8197 bool Negated =
false;
8207 std::swap(CondBrTarget, UncondBrTarget);
8209 B.setInsertPt(
B.getMBB(), BrCond->getIterator());
8210 B.buildInstr(AMDGPU::SI_LOOP)
8212 .addMBB(UncondBrTarget);
8217 B.buildBr(*CondBrTarget);
8219 MI.eraseFromParent();
8220 BrCond->eraseFromParent();
8227 case Intrinsic::amdgcn_wave_reduce_min:
8228 case Intrinsic::amdgcn_wave_reduce_umin:
8229 case Intrinsic::amdgcn_wave_reduce_max:
8230 case Intrinsic::amdgcn_wave_reduce_umax:
8231 case Intrinsic::amdgcn_wave_reduce_add:
8232 case Intrinsic::amdgcn_wave_reduce_sub:
8233 case Intrinsic::amdgcn_wave_reduce_and:
8234 case Intrinsic::amdgcn_wave_reduce_or:
8235 case Intrinsic::amdgcn_wave_reduce_xor: {
8240 bool NeedsSignExt = IntrID == Intrinsic::amdgcn_wave_reduce_min ||
8241 IntrID == Intrinsic::amdgcn_wave_reduce_max ||
8242 IntrID == Intrinsic::amdgcn_wave_reduce_add ||
8243 IntrID == Intrinsic::amdgcn_wave_reduce_sub;
8244 auto Ext = NeedsSignExt ?
B.buildSExt(
LLT::scalar(32), SrcReg)
8249 .addUse(Ext.getReg(0))
8250 .addImm(
MI.getOperand(3).getImm());
8251 B.buildTrunc(DstReg, NewDst);
8252 MI.eraseFromParent();
8255 case Intrinsic::amdgcn_addrspacecast_nonnull:
8257 case Intrinsic::amdgcn_make_buffer_rsrc:
8259 case Intrinsic::amdgcn_kernarg_segment_ptr:
8262 B.buildConstant(
MI.getOperand(0).getReg(), 0);
8263 MI.eraseFromParent();
8269 case Intrinsic::amdgcn_implicitarg_ptr:
8271 case Intrinsic::amdgcn_workitem_id_x:
8274 case Intrinsic::amdgcn_workitem_id_y:
8277 case Intrinsic::amdgcn_workitem_id_z:
8280 case Intrinsic::amdgcn_workgroup_id_x:
8285 case Intrinsic::amdgcn_workgroup_id_y:
8290 case Intrinsic::amdgcn_workgroup_id_z:
8295 case Intrinsic::amdgcn_cluster_id_x:
8296 return ST.hasClusters() &&
8299 case Intrinsic::amdgcn_cluster_id_y:
8300 return ST.hasClusters() &&
8303 case Intrinsic::amdgcn_cluster_id_z:
8304 return ST.hasClusters() &&
8307 case Intrinsic::amdgcn_cluster_workgroup_id_x:
8308 return ST.hasClusters() &&
8311 case Intrinsic::amdgcn_cluster_workgroup_id_y:
8312 return ST.hasClusters() &&
8315 case Intrinsic::amdgcn_cluster_workgroup_id_z:
8316 return ST.hasClusters() &&
8319 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
8320 return ST.hasClusters() &&
8322 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
8323 return ST.hasClusters() &&
8326 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
8327 return ST.hasClusters() &&
8330 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
8331 return ST.hasClusters() &&
8334 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
8335 return ST.hasClusters() &&
8339 case Intrinsic::amdgcn_wave_id:
8341 case Intrinsic::amdgcn_lds_kernel_id:
8344 case Intrinsic::amdgcn_dispatch_ptr:
8347 case Intrinsic::amdgcn_queue_ptr:
8350 case Intrinsic::amdgcn_implicit_buffer_ptr:
8353 case Intrinsic::amdgcn_dispatch_id:
8356 case Intrinsic::r600_read_ngroups_x:
8360 case Intrinsic::r600_read_ngroups_y:
8363 case Intrinsic::r600_read_ngroups_z:
8366 case Intrinsic::r600_read_local_size_x:
8369 case Intrinsic::r600_read_local_size_y:
8373 case Intrinsic::r600_read_local_size_z:
8376 case Intrinsic::amdgcn_fdiv_fast:
8378 case Intrinsic::amdgcn_is_shared:
8380 case Intrinsic::amdgcn_is_private:
8382 case Intrinsic::amdgcn_wavefrontsize: {
8383 B.buildConstant(
MI.getOperand(0), ST.getWavefrontSize());
8384 MI.eraseFromParent();
8387 case Intrinsic::amdgcn_s_buffer_load:
8389 case Intrinsic::amdgcn_raw_buffer_store:
8390 case Intrinsic::amdgcn_raw_ptr_buffer_store:
8391 case Intrinsic::amdgcn_struct_buffer_store:
8392 case Intrinsic::amdgcn_struct_ptr_buffer_store:
8394 case Intrinsic::amdgcn_raw_buffer_store_format:
8395 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
8396 case Intrinsic::amdgcn_struct_buffer_store_format:
8397 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
8399 case Intrinsic::amdgcn_raw_tbuffer_store:
8400 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
8401 case Intrinsic::amdgcn_struct_tbuffer_store:
8402 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
8404 case Intrinsic::amdgcn_raw_buffer_load:
8405 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8406 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8407 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8408 case Intrinsic::amdgcn_struct_buffer_load:
8409 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8410 case Intrinsic::amdgcn_struct_atomic_buffer_load:
8411 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
8413 case Intrinsic::amdgcn_raw_buffer_load_format:
8414 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
8415 case Intrinsic::amdgcn_struct_buffer_load_format:
8416 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
8418 case Intrinsic::amdgcn_raw_tbuffer_load:
8419 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
8420 case Intrinsic::amdgcn_struct_tbuffer_load:
8421 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
8423 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8424 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8425 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8426 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8427 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8428 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8429 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8430 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8431 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8432 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8433 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8434 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8435 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8436 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8437 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8438 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8439 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8440 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8441 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8442 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8443 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8444 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8445 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8446 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8447 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8448 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8449 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8450 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8451 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8452 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8453 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8454 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8455 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8456 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8457 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8458 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8459 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8460 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8461 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8462 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8463 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8464 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8465 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8466 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8467 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8468 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8469 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8470 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8471 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
8472 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
8473 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
8474 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
8475 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8476 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8477 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8478 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8479 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8480 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8481 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8482 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8483 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
8484 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
8485 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
8486 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
8487 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8488 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
8489 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8490 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
8491 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8492 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8493 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8494 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8496 case Intrinsic::amdgcn_rsq_clamp:
8498 case Intrinsic::amdgcn_image_bvh_intersect_ray:
8500 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
8501 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
8503 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
8504 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
8505 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
8506 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
8507 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
8508 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
8509 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
8510 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
8514 if (IndexArgTy !=
S64) {
8515 auto NewIndex = IndexArgTy.
isVector() ?
B.buildBitcast(
S64, Index)
8516 :
B.buildAnyExt(
S64, Index);
8517 MI.getOperand(5).setReg(NewIndex.getReg(0));
8521 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8522 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8523 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8524 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8525 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8526 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8527 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8528 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8532 MI.getOperand(5).setReg(
B.buildAnyExt(
S32, Index).getReg(0));
8535 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
8536 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
8537 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
8538 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
8539 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
8540 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
8541 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8542 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8543 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8545 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
8549 if (IndexArgTy != IdxTy) {
8550 auto NewIndex = IndexArgTy.
isVector() ?
B.buildBitcast(IdxTy, Index)
8551 :
B.buildAnyExt(IdxTy, Index);
8552 MI.getOperand(7).setReg(NewIndex.getReg(0));
8557 case Intrinsic::amdgcn_fmed3: {
8563 MI.setDesc(
B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
8564 MI.removeOperand(1);
8568 case Intrinsic::amdgcn_readlane:
8569 case Intrinsic::amdgcn_writelane:
8570 case Intrinsic::amdgcn_readfirstlane:
8571 case Intrinsic::amdgcn_permlane16:
8572 case Intrinsic::amdgcn_permlanex16:
8573 case Intrinsic::amdgcn_permlane64:
8574 case Intrinsic::amdgcn_set_inactive:
8575 case Intrinsic::amdgcn_set_inactive_chain_arg:
8576 case Intrinsic::amdgcn_mov_dpp8:
8577 case Intrinsic::amdgcn_update_dpp:
8578 case Intrinsic::amdgcn_permlane_bcast:
8579 case Intrinsic::amdgcn_permlane_up:
8580 case Intrinsic::amdgcn_permlane_down:
8581 case Intrinsic::amdgcn_permlane_xor:
8583 case Intrinsic::amdgcn_s_buffer_prefetch_data:
8585 case Intrinsic::amdgcn_dead: {
8589 MI.eraseFromParent();
8592 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
8593 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
8594 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
8595 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8596 B.buildLoad(
MI.getOperand(0),
MI.getOperand(2), **
MI.memoperands_begin());
8597 MI.eraseFromParent();
8599 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
8600 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
8601 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
8602 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8603 B.buildStore(
MI.getOperand(2),
MI.getOperand(1), **
MI.memoperands_begin());
8604 MI.eraseFromParent();
8606 case Intrinsic::amdgcn_av_load_b128:
8607 case Intrinsic::amdgcn_av_store_b128: {
8609 if (!ST.hasFlatGlobalInsts()) {
8610 const char *Name = IntrID == Intrinsic::amdgcn_av_load_b128
8611 ?
"llvm.amdgcn.av.load.b128"
8612 :
"llvm.amdgcn.av.store.b128";
8615 Fn,
Twine(Name) +
" not supported on subtarget",
MI.getDebugLoc()));
8618 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8619 if (IntrID == Intrinsic::amdgcn_av_load_b128)
8620 B.buildLoad(
MI.getOperand(0),
MI.getOperand(2), **
MI.memoperands_begin());
8622 B.buildStore(
MI.getOperand(2),
MI.getOperand(1),
8623 **
MI.memoperands_begin());
8624 MI.eraseFromParent();
8627 case Intrinsic::amdgcn_flat_load_monitor_b32:
8628 case Intrinsic::amdgcn_flat_load_monitor_b64:
8629 case Intrinsic::amdgcn_flat_load_monitor_b128:
8630 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8631 B.buildInstr(AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR)
8632 .add(
MI.getOperand(0))
8633 .add(
MI.getOperand(2))
8634 .addMemOperand(*
MI.memoperands_begin());
8635 MI.eraseFromParent();
8637 case Intrinsic::amdgcn_global_load_monitor_b32:
8638 case Intrinsic::amdgcn_global_load_monitor_b64:
8639 case Intrinsic::amdgcn_global_load_monitor_b128:
8640 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8641 B.buildInstr(AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR)
8642 .add(
MI.getOperand(0))
8643 .add(
MI.getOperand(2))
8644 .addMemOperand(*
MI.memoperands_begin());
8645 MI.eraseFromParent();
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID)
static LLT getBufferRsrcScalarType(const LLT Ty)
static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST, unsigned TypeIdx)
static cl::opt< bool > EnableNewLegality("amdgpu-global-isel-new-legality", cl::desc("Use GlobalISel desired legality, rather than try to use" "rules compatible with selection patterns"), cl::init(false), cl::ReallyHidden)
static MachineInstrBuilder buildExp(MachineIRBuilder &B, const DstOp &Dst, const SrcOp &Src, unsigned Flags)
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, unsigned Flags)
constexpr std::initializer_list< LLT > AllVectors
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)
static LegalityPredicate isSmallOddVector(unsigned TypeIdx)
static LegalizeMutation oneMoreElement(unsigned TypeIdx)
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size)
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags)
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, const LLT MemTy)
Return true if a load or store of the type should be lowered with a bitcast to a different type.
static constexpr unsigned FPEnvModeBitField
static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx)
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size)
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
static bool isRegisterVectorElementType(LLT EltTy)
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)
static LegalityPredicate isWideVec16(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllScalarTypes
static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllS32Vectors
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx)
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B)
Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is the form in which the valu...
static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty)
static std::pair< Register, Register > emitReciprocalU64(MachineIRBuilder &B, Register Val)
static LLT getBitcastRegisterType(const LLT Ty)
static LLT getBufferRsrcRegisterType(const LLT Ty)
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx)
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI)
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, MachineRegisterInfo &MRI, unsigned Idx)
Mutates IR (typically a load instruction) to use a <4 x s32> as the initial type of the operand idx an...
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C)
static constexpr unsigned SPDenormModeBitField
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, bool IsLoad, bool IsAtomic)
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllS64Vectors
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
static constexpr unsigned FPEnvTrapBitField
static constexpr unsigned MaxRegisterSize
static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size)
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static bool hasBufferRsrcWorkaround(const LLT Ty)
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
constexpr std::initializer_list< LLT > AllS16Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
static bool isRegisterVectorType(LLT Ty)
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
static bool isRegisterType(const GCNSubtarget &ST, LLT Ty)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Interface for Targets to specify which operations they can successfully select and how the others sho...
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
#define FP_DENORM_FLUSH_NONE
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static constexpr int Concat[]
bool legalizeConstHwRegRead(MachineInstr &MI, MachineIRBuilder &B, AMDGPU::Hwreg::Id HwReg, unsigned LowBit, unsigned Width) const
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeInsert(LegalizerHelper &Helper, MachineInstr &MI) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeBVHIntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeCTLZ_ZERO_POISON(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferStore(MachineInstr &MI, LegalizerHelper &Helper, bool IsTyped, bool IsFormat) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSBufferPrefetch(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFExp10Unsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafeImpl(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags, bool IsExp10) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBVHDualOrBVH8IntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFEXPF64(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtract(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeBufferLoad(MachineInstr &MI, LegalizerHelper &Helper, bool IsFormat, bool IsTyped) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeCTLS(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkGroupId(MachineInstr &MI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy, bool IsFormat) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
void buildLoadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
bool isModuleEntryFunction() const
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool isBottomOfStack() const
bool isEntryFunction() const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const fltSemantics & IEEEsingle()
static const fltSemantics & IEEEdouble()
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
Get the array size.
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
@ ICMP_SLT
signed less than
@ FCMP_OLT
0 1 0 0 True if ordered and less than
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
@ ICMP_UGE
unsigned greater or equal
@ ICMP_SGT
signed greater than
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
@ ICMP_ULT
unsigned less than
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
ConstantFP - Floating Point Values [float, double].
LLVM_ABI bool isExactlyValue(const APFloat &V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
This is the shared class of boolean and integer constants.
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
Simple wrapper observer that takes several observers, and calls each one for each event.
KnownBits getKnownBits(Register R)
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
LLT getScalarType() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr bool isAnyScalar() const
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & unsupported()
The instruction is unsupported.
LegalizeRuleSet & scalarSameSizeAs(unsigned TypeIdx, unsigned SameSizeIdx)
Change the type TypeIdx to have the same scalar size as type SameSizeIdx.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & minScalarOrElt(unsigned TypeIdx, const LLT Ty)
Ensure the scalar or element is at least as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & unsupportedFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & lowerFor(std::initializer_list< LLT > Types)
The instruction is lowered when type index 0 is any type in the given list.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & maxScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Conditionally limit the maximum size of the scalar.
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalForCartesianProduct(std::initializer_list< LLT > Types)
The instruction is legal when type indexes 0 and 1 are both in the given list.
LegalizeRuleSet & minScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty if condition is met.
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
LLVM_ABI LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
LLVM_ABI void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
LLVM_ABI LegalizeResult lowerInsert(MachineInstr &MI)
LLVM_ABI LegalizeResult lowerExtract(MachineInstr &MI)
GISelValueTracking * getValueTracking() const
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
LLVM_ABI void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LLVM_ABI LegalizeResult lowerFMad(MachineInstr &MI)
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
LLVM_ABI void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
use_instr_nodbg_iterator use_instr_nodbg_begin(Register RegNo) const
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
LLVM_ABI Register createGenericVirtualRegister(LLT Ty, StringRef Name="")
Create and return a new generic virtual register with low-level type Ty.
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
Represent a mutable reference to an array (0 or more elements consecutively in memory),...
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
constexpr bool isValid() const
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool hasWorkGroupIDZ() const
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void truncate(size_type N)
Like resize, but requires that N is less than size().
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
unsigned getPointerSizeInBits(unsigned AS) const
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
A Use represents the edge between a Value definition and its users.
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isGFX11Plus(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
LLVM_ABI LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the ...
LLVM_ABI LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LLVM_ABI LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LLVM_ABI LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LLVM_ABI LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than the second type index.
LLVM_ABI LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than the second type index.
LLVM_ABI LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LLVM_ABI LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LLVM_ABI LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the...
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LLVM_ABI LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LLVM_ABI LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LLVM_ABI LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LLVM_ABI LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LLVM_ABI LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
Invariant opcodes: All instruction sets have these as their low opcodes.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
LLVM_ABI Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Undef
Value of the register doesn't matter.
LLVM_ABI const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT whose value fits in int64_t, returns it.
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
constexpr bool has_single_bit(T Value) noexcept
std::function< bool(const LegalityQuery &)> LegalityPredicate
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
To bit_cast(const From &from) noexcept
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
FunctionAddr VTableAddr Next
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned BitWidth
LLVM_ABI void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
unsigned Log2(Align A)
Returns the log2 of the alignment.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
@ CLUSTER_WORKGROUP_MAX_ID_X
@ CLUSTER_WORKGROUP_MAX_ID_Z
@ CLUSTER_WORKGROUP_MAX_FLAT_ID
@ CLUSTER_WORKGROUP_MAX_ID_Y
static constexpr uint64_t encode(Fields... Values)
MIMGBaseOpcode BaseOpcode
This struct is a compact representation of a valid (non-zero power of two) alignment.
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
This class contains a discriminated union of information about pointers in memory operands,...
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.