37#include "llvm/IR/IntrinsicsAMDGPU.h"
38#include "llvm/IR/IntrinsicsR600.h"
40#define DEBUG_TYPE "amdgpu-legalinfo"
50 "amdgpu-global-isel-new-legality",
51 cl::desc(
"Use GlobalISel desired legality, rather than try to use"
52 "rules compatible with selection patterns"),
67 unsigned Bits = Ty.getSizeInBits();
77 const LLT Ty = Query.Types[TypeIdx];
83 return Ty.getNumElements() % 2 != 0 &&
84 EltSize > 1 && EltSize < 32 &&
85 Ty.getSizeInBits() % 32 != 0;
91 const LLT Ty = Query.Types[TypeIdx];
98 const LLT Ty = Query.Types[TypeIdx];
100 return EltTy.
getSizeInBits() == 16 && Ty.getNumElements() > 2;
106 const LLT Ty = Query.Types[TypeIdx];
108 return std::pair(TypeIdx,
115 const LLT Ty = Query.Types[TypeIdx];
117 unsigned Size = Ty.getSizeInBits();
118 unsigned Pieces = (
Size + 63) / 64;
119 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
129 const LLT Ty = Query.Types[TypeIdx];
132 const int Size = Ty.getSizeInBits();
134 const int NextMul32 = (
Size + 31) / 32;
138 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
146 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
147 return std::make_pair(TypeIdx,
LLT::scalar(MemSize));
154 const LLT Ty = Query.Types[TypeIdx];
156 const unsigned EltSize = Ty.getElementType().getSizeInBits();
159 assert(EltSize == 32 || EltSize == 64);
164 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
168 return std::pair(TypeIdx,
183 const unsigned NumElems = Ty.getElementCount().getFixedValue();
188 const unsigned Size = Ty.getSizeInBits();
201 const LLT Ty = Query.Types[TypeIdx];
208 const LLT Ty = Query.Types[TypeIdx];
209 unsigned Size = Ty.getSizeInBits();
218 const LLT QueryTy = Query.Types[TypeIdx];
225 const LLT QueryTy = Query.Types[TypeIdx];
232 const LLT QueryTy = Query.Types[TypeIdx];
238 return ((ST.useRealTrue16Insts() &&
Size == 16) ||
Size % 32 == 0) &&
244 return EltSize == 16 || EltSize % 32 == 0;
248 const int EltSize = Ty.getElementType().getSizeInBits();
249 return EltSize == 32 || EltSize == 64 ||
250 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
251 EltSize == 128 || EltSize == 256;
280 LLT Ty = Query.Types[TypeIdx];
288 const LLT QueryTy = Query.Types[TypeIdx];
373 if (Ty.isPointerOrPointerVector())
374 Ty = Ty.changeElementType(
LLT::scalar(Ty.getScalarSizeInBits()));
378 (ST.useRealTrue16Insts() && Ty ==
S16) ||
393 const LLT Ty = Query.Types[TypeIdx];
394 return !Ty.
isVector() && Ty.getSizeInBits() > 32 &&
395 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
403 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
413 bool IsLoad,
bool IsAtomic) {
417 return ST.hasFlatScratchEnabled() ? 128 : 32;
419 return ST.useDS128() ? 128 : 64;
430 return IsLoad ? 512 : 128;
435 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
444 const bool IsLoad = Query.
Opcode != AMDGPU::G_STORE;
446 unsigned RegSize = Ty.getSizeInBits();
449 unsigned AS = Query.
Types[1].getAddressSpace();
456 if (Ty.isVector() && MemSize !=
RegSize)
463 if (IsLoad && MemSize <
Size)
464 MemSize = std::max(MemSize,
Align);
484 if (!ST.hasDwordx3LoadStores())
497 if (AlignBits < MemSize) {
500 Align(AlignBits / 8)))
530 const unsigned Size = Ty.getSizeInBits();
531 if (Ty.isPointerVector())
541 unsigned EltSize = Ty.getScalarSizeInBits();
542 return EltSize != 32 && EltSize != 64;
556 const unsigned Size = Ty.getSizeInBits();
557 if (
Size != MemSizeInBits)
558 return Size <= 32 && Ty.isVector();
564 return Ty.isVector() && (!MemTy.
isVector() || MemTy == Ty) &&
573 uint64_t AlignInBits,
unsigned AddrSpace,
583 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
594 if (AlignInBits < RoundedSize)
601 RoundedSize, AddrSpace,
Align(AlignInBits / 8),
613 Query.
Types[1].getAddressSpace(), Opcode);
633 const unsigned NumParts =
PointerTy.getSizeInBits() / 32;
637 std::array<Register, 4> VectorElems;
638 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
639 for (
unsigned I = 0;
I < NumParts; ++
I)
641 B.buildExtractVectorElementConstant(
S32, VectorReg,
I).getReg(0);
642 B.buildMergeValues(MO, VectorElems);
647 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
648 auto Scalar =
B.buildBitcast(ScalarTy, BitcastReg);
649 B.buildIntToPtr(MO, Scalar);
669 const unsigned NumParts =
PointerTy.getSizeInBits() / 32;
670 auto Unmerged =
B.buildUnmerge(
LLT::scalar(32), Pointer);
671 for (
unsigned I = 0;
I < NumParts; ++
I)
673 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
675 Register Scalar =
B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
676 return B.buildBitcast(VectorTy, Scalar).getReg(0);
695 auto GetAddrSpacePtr = [&TM](
unsigned AS) {
708 const LLT BufferStridedPtr =
711 const LLT CodePtr = FlatPtr;
713 const std::initializer_list<LLT> AddrSpaces64 = {
714 GlobalPtr, ConstantPtr, FlatPtr
717 const std::initializer_list<LLT> AddrSpaces32 = {
718 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
721 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
723 const std::initializer_list<LLT> FPTypesBase = {
727 const std::initializer_list<LLT> FPTypes16 = {
731 const std::initializer_list<LLT> FPTypesPK16 = {
735 const LLT MinScalarFPTy = ST.has16BitInsts() ?
S16 :
S32;
758 if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) {
760 if (ST.hasScalarAddSub64()) {
763 .clampMaxNumElementsStrict(0,
S16, 2)
771 .clampMaxNumElementsStrict(0,
S16, 2)
778 if (ST.hasScalarSMulU64()) {
781 .clampMaxNumElementsStrict(0,
S16, 2)
789 .clampMaxNumElementsStrict(0,
S16, 2)
799 .minScalarOrElt(0,
S16)
804 }
else if (ST.has16BitInsts()) {
838 .widenScalarToNextMultipleOf(0, 32)
848 if (ST.hasMad64_32())
853 if (ST.hasIntClamp()) {
876 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
886 if (ST.hasVOP3PInsts()) {
888 .clampMaxNumElements(0,
S8, 2)
909 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
921 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
928 .clampScalar(0,
S16,
S64);
961 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
962 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
969 if (ST.has16BitInsts()) {
970 if (ST.hasVOP3PInsts())
973 FPOpActions.legalFor({
S16});
975 TrigActions.customFor({
S16});
976 FDIVActions.customFor({
S16});
979 if (ST.hasPackedFP32Ops()) {
980 FPOpActions.legalFor({
V2S32});
981 FPOpActions.clampMaxNumElementsStrict(0,
S32, 2);
984 auto &MinNumMaxNumIeee =
987 if (ST.hasVOP3PInsts()) {
988 MinNumMaxNumIeee.legalFor(FPTypesPK16)
990 .clampMaxNumElements(0,
S16, 2)
993 }
else if (ST.has16BitInsts()) {
994 MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0,
S16,
S64).scalarize(0);
996 MinNumMaxNumIeee.legalFor(FPTypesBase)
1002 {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
1004 if (ST.hasVOP3PInsts()) {
1005 MinNumMaxNum.customFor(FPTypesPK16)
1007 .clampMaxNumElements(0,
S16, 2)
1008 .clampScalar(0,
S16,
S64)
1010 }
else if (ST.has16BitInsts()) {
1011 MinNumMaxNum.customFor(FPTypes16)
1012 .clampScalar(0,
S16,
S64)
1015 MinNumMaxNum.customFor(FPTypesBase)
1016 .clampScalar(0,
S32,
S64)
1020 if (ST.hasVOP3PInsts())
1037 .
legalFor(ST.hasPackedFP32Ops(), {V2S32})
1039 if (ST.hasPackedFP32Ops())
1043 if (ST.has16BitInsts()) {
1077 if (ST.hasFractBug()) {
1111 if (ST.hasCvtPkF16F32Inst()) {
1113 .clampMaxNumElements(0,
S16, 2);
1117 FPTruncActions.scalarize(0).lower();
1125 if (ST.has16BitInsts()) {
1139 if (ST.hasPackedFP32Ops())
1149 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1150 FMad.customFor({
S32,
S16});
1151 else if (ST.hasMadMacF32Insts())
1152 FMad.customFor({
S32});
1153 else if (ST.hasMadF16())
1154 FMad.customFor({
S16});
1159 if (ST.has16BitInsts()) {
1162 FRem.minScalar(0,
S32)
1171 .clampMaxNumElements(0,
S16, 2)
1190 if (ST.has16BitInsts())
1201 if (ST.has16BitInsts())
1214 .legalFor(ST.has16BitInsts(),{{S16, S16}})
1218 if (
ST.has16BitInsts())
1228 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1229 .clampScalar(0,
S16,
S64)
1233 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1239 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1243 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1244 .clampScalar(0,
S16,
S64)
1248 if (
ST.has16BitInsts()) {
1249 getActionDefinitionsBuilder(
1250 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1252 .clampScalar(0,
S16,
S64)
1255 getActionDefinitionsBuilder(
1256 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1258 .clampScalar(0,
S32,
S64)
1261 getActionDefinitionsBuilder(
1262 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1265 .clampScalar(0,
S32,
S64)
1269 getActionDefinitionsBuilder(G_PTR_ADD)
1275 getActionDefinitionsBuilder(G_PTRMASK)
1277 .scalarSameSizeAs(1, 0)
1281 getActionDefinitionsBuilder(G_ICMP)
1293 {
S1}, {
S32,
S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1294 .legalForCartesianProduct(
1295 {
S32}, {
S32,
S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1296 if (
ST.has16BitInsts()) {
1297 CmpBuilder.legalFor({{
S1,
S16}});
1308 {
S1},
ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1310 if (
ST.hasSALUFloatInsts())
1319 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1320 if (
ST.has16BitInsts())
1321 ExpOps.customFor({{
S32}, {
S16}});
1323 ExpOps.customFor({
S32});
1324 ExpOps.clampScalar(0, MinScalarFPTy,
S32)
1327 getActionDefinitionsBuilder(G_FPOWI)
1328 .clampScalar(0, MinScalarFPTy,
S32)
1331 getActionDefinitionsBuilder(G_FLOG2)
1332 .legalFor(
ST.has16BitInsts(), {S16})
1337 getActionDefinitionsBuilder(G_FEXP2)
1338 .legalFor(
ST.has16BitInsts(), {S16})
1344 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1346 LogOps.clampScalar(0, MinScalarFPTy,
S32)
1350 getActionDefinitionsBuilder(G_CTPOP)
1352 .clampScalar(0,
S32,
S32)
1353 .widenScalarToNextPow2(1, 32)
1354 .clampScalar(1,
S32,
S64)
1356 .widenScalarToNextPow2(0, 32);
1359 if (
ST.has16BitInsts())
1360 getActionDefinitionsBuilder(G_IS_FPCLASS)
1361 .legalForCartesianProduct({
S1}, FPTypes16)
1362 .widenScalarToNextPow2(1)
1366 getActionDefinitionsBuilder(G_IS_FPCLASS)
1367 .legalForCartesianProduct({
S1}, FPTypesBase)
1368 .lowerFor({
S1,
S16})
1369 .widenScalarToNextPow2(1)
1376 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1378 .clampScalar(0,
S32,
S32)
1379 .clampScalar(1,
S32,
S64)
1380 .widenScalarToNextPow2(0, 32)
1381 .widenScalarToNextPow2(1, 32)
1385 getActionDefinitionsBuilder(G_CTLZ_ZERO_POISON)
1388 .clampScalar(0,
S32,
S32)
1389 .clampScalar(1,
S32,
S64)
1391 .widenScalarToNextPow2(0, 32)
1392 .widenScalarToNextPow2(1, 32);
1394 getActionDefinitionsBuilder(G_CTTZ_ZERO_POISON)
1396 .clampScalar(0,
S32,
S32)
1397 .clampScalar(1,
S32,
S64)
1399 .widenScalarToNextPow2(0, 32)
1400 .widenScalarToNextPow2(1, 32);
1402 getActionDefinitionsBuilder(G_CTLS)
1405 .clampScalar(0,
S32,
S32)
1406 .clampScalar(1,
S32,
S32);
1410 getActionDefinitionsBuilder(G_BITREVERSE)
1412 .clampScalar(0,
S32,
S64)
1414 .widenScalarToNextPow2(0);
1416 if (
ST.has16BitInsts()) {
1417 getActionDefinitionsBuilder(G_BSWAP)
1419 .clampMaxNumElementsStrict(0,
S16, 2)
1422 .widenScalarToNextPow2(0)
1423 .clampScalar(0,
S16,
S32)
1426 if (
ST.hasVOP3PInsts()) {
1427 getActionDefinitionsBuilder(G_ABS)
1429 .clampMaxNumElements(0,
S16, 2)
1431 .widenScalarToNextPow2(0)
1434 if (
ST.hasIntMinMax64()) {
1435 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1437 .clampMaxNumElements(0,
S16, 2)
1439 .widenScalarToNextPow2(0)
1443 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1445 .clampMaxNumElements(0,
S16, 2)
1447 .widenScalarToNextPow2(0)
1452 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1454 .widenScalarToNextPow2(0)
1461 getActionDefinitionsBuilder(G_BSWAP)
1466 .widenScalarToNextPow2(0)
1471 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1474 .widenScalarToNextPow2(0)
1479 getActionDefinitionsBuilder(G_INTTOPTR)
1481 .legalForCartesianProduct(AddrSpaces64, {
S64})
1482 .legalForCartesianProduct(AddrSpaces32, {
S32})
1495 getActionDefinitionsBuilder(G_PTRTOINT)
1497 .legalForCartesianProduct(AddrSpaces64, {
S64})
1498 .legalForCartesianProduct(AddrSpaces32, {
S32})
1511 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1515 const auto needToSplitMemOp = [=](
const LegalityQuery &Query,
1516 bool IsLoad) ->
bool {
1520 unsigned MemSize = Query.
MMODescrs[0].MemoryTy.getSizeInBits();
1534 unsigned NumRegs = (MemSize + 31) / 32;
1536 if (!
ST.hasDwordx3LoadStores())
1547 unsigned GlobalAlign32 =
ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1548 unsigned GlobalAlign16 =
ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1549 unsigned GlobalAlign8 =
ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1555 for (
unsigned Op : {G_LOAD, G_STORE}) {
1556 const bool IsStore =
Op == G_STORE;
1558 auto &Actions = getActionDefinitionsBuilder(
Op);
1561 Actions.legalForTypesWithMemDesc({{
S32, GlobalPtr,
S32, GlobalAlign32},
1564 {
S64, GlobalPtr,
S64, GlobalAlign32},
1567 {
S32, GlobalPtr,
S8, GlobalAlign8},
1568 {
S32, GlobalPtr,
S16, GlobalAlign16},
1570 {
S32, LocalPtr,
S32, 32},
1571 {
S64, LocalPtr,
S64, 32},
1573 {
S32, LocalPtr,
S8, 8},
1574 {
S32, LocalPtr,
S16, 16},
1577 {
S32, PrivatePtr,
S32, 32},
1578 {
S32, PrivatePtr,
S8, 8},
1579 {
S32, PrivatePtr,
S16, 16},
1582 {
S32, ConstantPtr,
S32, GlobalAlign32},
1585 {
S64, ConstantPtr,
S64, GlobalAlign32},
1586 {
V2S32, ConstantPtr,
V2S32, GlobalAlign32}});
1588 Actions.legalForTypesWithMemDesc(
ST.useRealTrue16Insts(),
1589 {{S16, GlobalPtr, S8, GlobalAlign8},
1590 {S16, GlobalPtr, S16, GlobalAlign16},
1591 {S16, LocalPtr, S8, 8},
1592 {S16, LocalPtr, S16, 16},
1593 {S16, PrivatePtr, S8, 8},
1594 {S16, PrivatePtr, S16, 16}});
1604 Actions.unsupportedIf(
1605 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1619 Actions.customIf(
typeIs(1, Constant32Ptr));
1645 return !Query.
Types[0].isVector() &&
1646 needToSplitMemOp(Query,
Op == G_LOAD);
1648 [=](
const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1653 unsigned MemSize = Query.
MMODescrs[0].MemoryTy.getSizeInBits();
1656 if (DstSize > MemSize)
1662 if (MemSize > MaxSize)
1670 return Query.
Types[0].isVector() &&
1671 needToSplitMemOp(Query,
Op == G_LOAD);
1673 [=](
const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1687 unsigned MemSize = Query.
MMODescrs[0].MemoryTy.getSizeInBits();
1688 if (MemSize > MaxSize) {
1692 if (MaxSize % EltSize == 0) {
1698 unsigned NumPieces = MemSize / MaxSize;
1702 if (NumPieces == 1 || NumPieces >= NumElts ||
1703 NumElts % NumPieces != 0)
1704 return std::pair(0, EltTy);
1712 return std::pair(0, EltTy);
1727 return std::pair(0, EltTy);
1732 .widenScalarToNextPow2(0)
1739 getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1740 .legalForTypesWithMemDesc({{
S32, GlobalPtr,
S8, 8},
1741 {
S32, GlobalPtr,
S16, 2 * 8},
1742 {
S32, LocalPtr,
S8, 8},
1743 {
S32, LocalPtr,
S16, 16},
1744 {
S32, PrivatePtr,
S8, 8},
1745 {
S32, PrivatePtr,
S16, 16},
1746 {
S32, ConstantPtr,
S8, 8},
1747 {
S32, ConstantPtr,
S16, 2 * 8}})
1748 .legalForTypesWithMemDesc(
ST.useRealTrue16Insts(),
1749 {{S16, GlobalPtr, S8, GlobalAlign8},
1750 {S16, LocalPtr, S8, GlobalAlign8},
1751 {S16, PrivatePtr, S8, GlobalAlign8},
1752 {S16, ConstantPtr, S8, GlobalAlign8}})
1757 if (
ST.hasFlatAddressSpace()) {
1758 ExtLoads.legalForTypesWithMemDesc(
1759 {{
S32, FlatPtr,
S8, 8}, {
S32, FlatPtr,
S16, 16}});
1761 ExtLoads.legalForTypesWithMemDesc(
ST.useRealTrue16Insts(),
1762 {{S16, FlatPtr, S8, GlobalAlign8}});
1770 ExtLoads.customIf(
typeIs(1, Constant32Ptr));
1772 ExtLoads.narrowScalarIf(
1779 ExtLoads.clampScalar(0,
S32,
S32)
1780 .widenScalarToNextPow2(0)
1783 auto &Atomics = getActionDefinitionsBuilder(
1784 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1785 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1786 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1787 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1788 .legalFor({{
S32, GlobalPtr}, {
S32, LocalPtr},
1789 {
S64, GlobalPtr}, {
S64, LocalPtr},
1790 {
S32, RegionPtr}, {
S64, RegionPtr}});
1791 if (
ST.hasFlatAddressSpace()) {
1792 Atomics.legalFor({{
S32, FlatPtr}, {
S64, FlatPtr}});
1796 getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
1797 .legalFor({{
S32, GlobalPtr}, {
S32, LocalPtr}, {
S32, RegionPtr}});
1798 if (
ST.hasFlatAddressSpace()) {
1799 Atomics32.legalFor({{
S32, FlatPtr}});
1803 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1804 if (
ST.hasLDSFPAtomicAddF32()) {
1805 Atomic.legalFor({{
S32, LocalPtr}, {
S32, RegionPtr}});
1806 if (
ST.hasLdsAtomicAddF64())
1807 Atomic.legalFor({{
S64, LocalPtr}});
1808 if (
ST.hasAtomicDsPkAdd16Insts())
1809 Atomic.legalFor({{
V2F16, LocalPtr}, {
V2BF16, LocalPtr}});
1811 if (
ST.hasAtomicFaddInsts())
1812 Atomic.legalFor({{
S32, GlobalPtr}});
1813 if (
ST.hasFlatAtomicFaddF32Inst())
1814 Atomic.legalFor({{
S32, FlatPtr}});
1816 if (
ST.hasGFX90AInsts() ||
ST.hasGFX1250Insts()) {
1827 if (
ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1828 ST.hasAtomicBufferGlobalPkAddF16Insts())
1829 Atomic.legalFor({{
V2F16, GlobalPtr}, {
V2F16, BufferFatPtr}});
1830 if (
ST.hasAtomicGlobalPkAddBF16Inst())
1831 Atomic.legalFor({{
V2BF16, GlobalPtr}});
1832 if (
ST.hasAtomicFlatPkAdd16Insts())
1833 Atomic.legalFor({{
V2F16, FlatPtr}, {
V2BF16, FlatPtr}});
1838 auto &AtomicFMinFMax =
1839 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1840 .legalFor({{
F32, LocalPtr}, {
F64, LocalPtr}});
1842 if (
ST.hasAtomicFMinFMaxF32GlobalInsts())
1843 AtomicFMinFMax.legalFor({{
F32, GlobalPtr},{
F32, BufferFatPtr}});
1844 if (
ST.hasAtomicFMinFMaxF64GlobalInsts())
1845 AtomicFMinFMax.legalFor({{
F64, GlobalPtr}, {
F64, BufferFatPtr}});
1846 if (
ST.hasAtomicFMinFMaxF32FlatInsts())
1847 AtomicFMinFMax.legalFor({
F32, FlatPtr});
1848 if (
ST.hasAtomicFMinFMaxF64FlatInsts())
1849 AtomicFMinFMax.legalFor({
F64, FlatPtr});
1853 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1854 .customFor({{
S32, GlobalPtr}, {
S64, GlobalPtr},
1855 {
S32, FlatPtr}, {
S64, FlatPtr}})
1856 .legalFor({{
S32, LocalPtr}, {
S64, LocalPtr},
1857 {
S32, RegionPtr}, {
S64, RegionPtr}});
1861 getActionDefinitionsBuilder(G_SELECT)
1863 LocalPtr, FlatPtr, PrivatePtr,
1867 .clampScalar(0,
S16,
S64)
1871 .clampMaxNumElements(0,
S32, 2)
1872 .clampMaxNumElements(0, LocalPtr, 2)
1873 .clampMaxNumElements(0, PrivatePtr, 2)
1875 .widenScalarToNextPow2(0)
1880 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1882 if (
ST.has16BitInsts()) {
1883 if (
ST.hasVOP3PInsts()) {
1885 .clampMaxNumElements(0,
S16, 2);
1887 Shifts.legalFor({{
S16,
S16}});
1890 Shifts.widenScalarIf(
1895 const LLT AmountTy = Query.
Types[1];
1900 Shifts.clampScalar(1,
S32,
S32);
1901 Shifts.widenScalarToNextPow2(0, 16);
1902 Shifts.clampScalar(0,
S16,
S64);
1904 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1912 Shifts.clampScalar(1,
S32,
S32);
1913 Shifts.widenScalarToNextPow2(0, 32);
1914 Shifts.clampScalar(0,
S32,
S64);
1916 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1921 Shifts.scalarize(0);
1923 for (
unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1924 unsigned VecTypeIdx =
Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1925 unsigned EltTypeIdx =
Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1926 unsigned IdxTypeIdx = 2;
1928 getActionDefinitionsBuilder(
Op)
1930 const LLT EltTy = Query.
Types[EltTypeIdx];
1931 const LLT VecTy = Query.
Types[VecTypeIdx];
1932 const LLT IdxTy = Query.
Types[IdxTypeIdx];
1934 const bool isLegalVecType =
1944 return (EltSize == 32 || EltSize == 64) &&
1960 const LLT EltTy = Query.
Types[EltTypeIdx];
1961 const LLT VecTy = Query.
Types[VecTypeIdx];
1965 const unsigned TargetEltSize =
1966 DstEltSize % 64 == 0 ? 64 : 32;
1967 return std::pair(VecTypeIdx,
1971 .clampScalar(EltTypeIdx,
S32,
S64)
1972 .clampScalar(VecTypeIdx,
S32,
S64)
1973 .clampScalar(IdxTypeIdx,
S32,
S32)
1974 .clampMaxNumElements(VecTypeIdx,
S32, 32)
1983 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1985 const LLT &EltTy = Query.
Types[1].getElementType();
1986 return Query.
Types[0] != EltTy;
1989 for (
unsigned Op : {G_EXTRACT, G_INSERT}) {
1990 unsigned BigTyIdx =
Op == G_EXTRACT ? 1 : 0;
1991 unsigned LitTyIdx =
Op == G_EXTRACT ? 0 : 1;
1992 getActionDefinitionsBuilder(
Op)
1995 const LLT BigTy = Query.
Types[BigTyIdx];
2001 const LLT LitTy = Query.
Types[LitTyIdx];
2006 .widenScalarToNextPow2(BigTyIdx, 32)
2014 const LLT BigTy = Query.
Types[BigTyIdx];
2015 const LLT LitTy = Query.
Types[LitTyIdx];
2023 getActionDefinitionsBuilder(G_BUILD_VECTOR)
2032 if (
ST.hasScalarPackInsts()) {
2035 .minScalarOrElt(0,
S16)
2038 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
2042 BuildVector.customFor({
V2S16,
S16});
2043 BuildVector.minScalarOrElt(0,
S32);
2045 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
2053 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
2055 .clampMaxNumElements(0,
S32, 32)
2056 .clampMaxNumElements(1,
S16, 2)
2057 .clampMaxNumElements(0,
S16, 64);
2059 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
2062 for (
unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
2063 unsigned BigTyIdx =
Op == G_MERGE_VALUES ? 0 : 1;
2064 unsigned LitTyIdx =
Op == G_MERGE_VALUES ? 1 : 0;
2066 auto notValidElt = [=](
const LegalityQuery &Query,
unsigned TypeIdx) {
2067 const LLT Ty = Query.
Types[TypeIdx];
2079 getActionDefinitionsBuilder(
Op)
2083 const LLT BigTy = Query.
Types[BigTyIdx];
2089 .widenScalarToNextPow2(LitTyIdx, 16)
2098 .clampScalar(LitTyIdx,
S32,
S512)
2099 .widenScalarToNextPow2(LitTyIdx, 32)
2103 return notValidElt(Query, LitTyIdx);
2108 return notValidElt(Query, BigTyIdx);
2113 if (
Op == G_MERGE_VALUES) {
2114 Builder.widenScalarIf(
2117 const LLT Ty = Query.
Types[LitTyIdx];
2123 Builder.widenScalarIf(
2125 const LLT Ty = Query.
Types[BigTyIdx];
2131 const LLT &Ty = Query.
Types[BigTyIdx];
2133 if (NewSizeInBits >= 256) {
2135 if (RoundedTo < NewSizeInBits)
2136 NewSizeInBits = RoundedTo;
2138 return std::pair(BigTyIdx,
LLT::scalar(NewSizeInBits));
2147 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
2148 .legalFor({{
S32}, {
S64}})
2149 .clampScalar(0,
S32,
S64);
2151 if (
ST.hasVOP3PInsts()) {
2152 SextInReg.lowerFor({{
V2S16}})
2156 .clampMaxNumElementsStrict(0,
S16, 2);
2157 }
else if (
ST.has16BitInsts()) {
2158 SextInReg.lowerFor({{
S32}, {
S64}, {
S16}});
2162 SextInReg.lowerFor({{
S32}, {
S64}});
2167 .clampScalar(0,
S32,
S64)
2170 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2174 auto &FSHRActionDefs = getActionDefinitionsBuilder(G_FSHR);
2175 FSHRActionDefs.legalFor({{
S32,
S32}})
2176 .clampMaxNumElementsStrict(0,
S16, 2);
2177 if (
ST.hasVOP3PInsts())
2179 FSHRActionDefs.scalarize(0).lower();
2181 if (
ST.hasVOP3PInsts()) {
2182 getActionDefinitionsBuilder(G_FSHL)
2184 .clampMaxNumElementsStrict(0,
S16, 2)
2188 getActionDefinitionsBuilder(G_FSHL)
2193 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2196 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({
S64});
2198 getActionDefinitionsBuilder(G_FENCE)
2201 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2206 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2208 .clampScalar(1,
S32,
S32)
2209 .clampScalar(0,
S32,
S64)
2210 .widenScalarToNextPow2(0)
2213 getActionDefinitionsBuilder(
2217 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2218 G_READ_REGISTER, G_WRITE_REGISTER,
2223 if (
ST.hasIEEEMinimumMaximumInsts()) {
2224 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2225 .legalFor(FPTypesPK16)
2226 .clampMaxNumElements(0,
S16, 2)
2228 }
else if (
ST.hasVOP3PInsts()) {
2229 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2231 .clampMaxNumElementsStrict(0,
S16, 2)
2235 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2237 .clampScalar(0,
S32,
S64)
2241 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2244 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2246 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2247 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2248 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2251 getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
2253 getActionDefinitionsBuilder(
2254 {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
2255 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
2256 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
2257 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
2262 getActionDefinitionsBuilder({G_INTRINSIC, G_INTRINSIC_W_SIDE_EFFECTS,
2263 G_INTRINSIC_CONVERGENT,
2264 G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS})
2267 getLegacyLegalizerInfo().computeTables();
2277 switch (
MI.getOpcode()) {
2278 case TargetOpcode::G_ADDRSPACE_CAST:
2280 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2282 case TargetOpcode::G_FCEIL:
2284 case TargetOpcode::G_FREM:
2286 case TargetOpcode::G_INTRINSIC_TRUNC:
2288 case TargetOpcode::G_SITOFP:
2290 case TargetOpcode::G_UITOFP:
2292 case TargetOpcode::G_FPTOSI:
2294 case TargetOpcode::G_FPTOUI:
2296 case TargetOpcode::G_FMINNUM:
2297 case TargetOpcode::G_FMAXNUM:
2298 case TargetOpcode::G_FMINIMUMNUM:
2299 case TargetOpcode::G_FMAXIMUMNUM:
2301 case TargetOpcode::G_EXTRACT:
2303 case TargetOpcode::G_INSERT:
2305 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2307 case TargetOpcode::G_INSERT_VECTOR_ELT:
2309 case TargetOpcode::G_FSIN:
2310 case TargetOpcode::G_FCOS:
2312 case TargetOpcode::G_GLOBAL_VALUE:
2314 case TargetOpcode::G_LOAD:
2315 case TargetOpcode::G_SEXTLOAD:
2316 case TargetOpcode::G_ZEXTLOAD:
2318 case TargetOpcode::G_STORE:
2320 case TargetOpcode::G_FMAD:
2322 case TargetOpcode::G_FDIV:
2324 case TargetOpcode::G_FFREXP:
2326 case TargetOpcode::G_FSQRT:
2328 case TargetOpcode::G_UDIV:
2329 case TargetOpcode::G_UREM:
2330 case TargetOpcode::G_UDIVREM:
2332 case TargetOpcode::G_SDIV:
2333 case TargetOpcode::G_SREM:
2334 case TargetOpcode::G_SDIVREM:
2336 case TargetOpcode::G_ATOMIC_CMPXCHG:
2338 case TargetOpcode::G_FLOG2:
2340 case TargetOpcode::G_FLOG:
2341 case TargetOpcode::G_FLOG10:
2343 case TargetOpcode::G_FEXP2:
2345 case TargetOpcode::G_FEXP:
2346 case TargetOpcode::G_FEXP10:
2348 case TargetOpcode::G_FPOW:
2350 case TargetOpcode::G_FFLOOR:
2352 case TargetOpcode::G_BUILD_VECTOR:
2353 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2355 case TargetOpcode::G_MUL:
2357 case TargetOpcode::G_CTLZ:
2358 case TargetOpcode::G_CTTZ:
2360 case TargetOpcode::G_CTLS:
2362 case TargetOpcode::G_CTLZ_ZERO_POISON:
2364 case TargetOpcode::G_STACKSAVE:
2366 case TargetOpcode::G_GET_FPENV:
2368 case TargetOpcode::G_SET_FPENV:
2370 case TargetOpcode::G_TRAP:
2372 case TargetOpcode::G_DEBUGTRAP:
2392 if (ST.hasApertureRegs()) {
2397 ? AMDGPU::SRC_SHARED_BASE
2398 : AMDGPU::SRC_PRIVATE_BASE;
2399 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
2400 !ST.hasGloballyAddressableScratch()) &&
2401 "Cannot use src_private_base with globally addressable scratch!");
2404 B.buildCopy({Dst}, {
Register(ApertureRegNo)});
2405 return B.buildUnmerge(
S32, Dst).getReg(1);
2420 ST.getTargetLowering()->getImplicitParameterOffset(
B.getMF(), Param);
2436 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
2439 return B.buildLoad(
S32, LoadAddr, *MMO).getReg(0);
2461 B.buildObjectPtrOffset(
2463 B.buildConstant(
LLT::scalar(64), StructOffset).getReg(0));
2464 return B.buildLoad(
S32, LoadAddr, *MMO).getReg(0);
2472 switch (Def->getOpcode()) {
2473 case AMDGPU::G_FRAME_INDEX:
2474 case AMDGPU::G_GLOBAL_VALUE:
2475 case AMDGPU::G_BLOCK_ADDR:
2477 case AMDGPU::G_CONSTANT: {
2478 const ConstantInt *CI = Def->getOperand(1).getCImm();
2495 assert(
MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2497 Intrinsic::amdgcn_addrspacecast_nonnull));
2502 :
MI.getOperand(1).getReg();
2506 unsigned SrcAS = SrcTy.getAddressSpace();
2516 MI.setDesc(
B.getTII().get(TargetOpcode::G_BITCAST));
2523 auto castFlatToLocalOrPrivate = [&](
const DstOp &Dst) ->
Register {
2525 ST.hasGloballyAddressableScratch()) {
2529 Register SrcLo =
B.buildExtract(
S32, Src, 0).getReg(0);
2531 B.buildInstr(AMDGPU::S_MOV_B32, {
S32},
2532 {
Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
2534 MRI.
setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
2536 return B.buildIntToPtr(Dst,
Sub).getReg(0);
2540 return B.buildExtract(Dst, Src, 0).getReg(0);
2546 castFlatToLocalOrPrivate(Dst);
2547 MI.eraseFromParent();
2553 auto SegmentNull =
B.buildConstant(DstTy, NullVal);
2554 auto FlatNull =
B.buildConstant(SrcTy, 0);
2557 auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
2561 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2563 MI.eraseFromParent();
2570 auto castLocalOrPrivateToFlat = [&](
const DstOp &Dst) ->
Register {
2573 Register SrcAsInt =
B.buildPtrToInt(
S32, Src).getReg(0);
2576 ST.hasGloballyAddressableScratch()) {
2581 ThreadID =
B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {
S32})
2585 if (ST.isWave64()) {
2586 ThreadID =
B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {
S32})
2592 B.buildConstant(
S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
2593 Register SrcHi =
B.buildShl(
S32, ThreadID, ShAmt).getReg(0);
2595 B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).
getReg(0);
2599 B.buildInstr(AMDGPU::S_MOV_B64, {
S64},
2600 {
Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
2602 MRI.
setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
2603 return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
2612 return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).
getReg(0);
2618 castLocalOrPrivateToFlat(Dst);
2619 MI.eraseFromParent();
2623 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2630 SegmentNull.getReg(0));
2632 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2634 MI.eraseFromParent();
2639 SrcTy.getSizeInBits() == 64) {
2641 B.buildExtract(Dst, Src, 0);
2642 MI.eraseFromParent();
2649 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2650 auto PtrLo =
B.buildPtrToInt(
S32, Src);
2651 if (AddrHiVal == 0) {
2653 B.buildIntToPtr(Dst, Zext);
2655 auto HighAddr =
B.buildConstant(
S32, AddrHiVal);
2656 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2659 MI.eraseFromParent();
2666 MI.eraseFromParent();
2675 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2680 auto C1 =
B.buildFConstant(Ty, C1Val);
2681 auto CopySign =
B.buildFCopysign(Ty, C1, Src);
2684 auto Tmp1 =
B.buildFAdd(Ty, Src, CopySign);
2685 auto Tmp2 =
B.buildFSub(Ty, Tmp1, CopySign);
2687 auto C2 =
B.buildFConstant(Ty, C2Val);
2688 auto Fabs =
B.buildFAbs(Ty, Src);
2691 B.buildSelect(
MI.getOperand(0).getReg(),
Cond, Src, Tmp2);
2692 MI.eraseFromParent();
2710 auto Trunc =
B.buildIntrinsicTrunc(
S64, Src);
2712 const auto Zero =
B.buildFConstant(
S64, 0.0);
2713 const auto One =
B.buildFConstant(
S64, 1.0);
2716 auto And =
B.buildAnd(
S1, Lt0, NeTrunc);
2717 auto Add =
B.buildSelect(
S64,
And, One, Zero);
2720 B.buildFAdd(
MI.getOperand(0).getReg(), Trunc,
Add);
2721 MI.eraseFromParent();
2729 Register Src0Reg =
MI.getOperand(1).getReg();
2730 Register Src1Reg =
MI.getOperand(2).getReg();
2731 auto Flags =
MI.getFlags();
2734 auto Div =
B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2735 auto Trunc =
B.buildIntrinsicTrunc(Ty, Div, Flags);
2736 auto Neg =
B.buildFNeg(Ty, Trunc, Flags);
2737 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2738 MI.eraseFromParent();
2744 const unsigned FractBits = 52;
2745 const unsigned ExpBits = 11;
2748 auto Const0 =
B.buildConstant(
S32, FractBits - 32);
2749 auto Const1 =
B.buildConstant(
S32, ExpBits);
2751 auto ExpPart =
B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {
S32})
2753 .addUse(Const0.getReg(0))
2754 .addUse(Const1.getReg(0));
2756 return B.buildSub(
S32, ExpPart,
B.buildConstant(
S32, 1023));
2770 auto Unmerge =
B.buildUnmerge({
S32,
S32}, Src);
2777 const unsigned FractBits = 52;
2780 const auto SignBitMask =
B.buildConstant(
S32, UINT32_C(1) << 31);
2781 auto SignBit =
B.buildAnd(
S32,
Hi, SignBitMask);
2783 const auto FractMask =
B.buildConstant(
S64, (UINT64_C(1) << FractBits) - 1);
2785 const auto Zero32 =
B.buildConstant(
S32, 0);
2788 auto SignBit64 =
B.buildMergeLikeInstr(
S64, {Zero32, SignBit});
2790 auto Shr =
B.buildAShr(
S64, FractMask, Exp);
2791 auto Not =
B.buildNot(
S64, Shr);
2792 auto Tmp0 =
B.buildAnd(
S64, Src, Not);
2793 auto FiftyOne =
B.buildConstant(
S32, FractBits - 1);
2798 auto Tmp1 =
B.buildSelect(
S64, ExpLt0, SignBit64, Tmp0);
2799 B.buildSelect(
MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2800 MI.eraseFromParent();
2816 auto Unmerge =
B.buildUnmerge({
S32,
S32}, Src);
2817 auto ThirtyTwo =
B.buildConstant(
S32, 32);
2820 auto CvtHi =
Signed ?
B.buildSITOFP(
S64, Unmerge.getReg(1))
2821 :
B.buildUITOFP(
S64, Unmerge.getReg(1));
2823 auto CvtLo =
B.buildUITOFP(
S64, Unmerge.getReg(0));
2824 auto LdExp =
B.buildFLdexp(
S64, CvtHi, ThirtyTwo);
2827 B.buildFAdd(Dst, LdExp, CvtLo);
2828 MI.eraseFromParent();
2834 auto One =
B.buildConstant(
S32, 1);
2838 auto ThirtyOne =
B.buildConstant(
S32, 31);
2839 auto X =
B.buildXor(
S32, Unmerge.getReg(0), Unmerge.getReg(1));
2840 auto OppositeSign =
B.buildAShr(
S32,
X, ThirtyOne);
2841 auto MaxShAmt =
B.buildAdd(
S32, ThirtyTwo, OppositeSign);
2842 auto LS =
B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {
S32})
2843 .addUse(Unmerge.getReg(1));
2844 auto LS2 =
B.buildSub(
S32, LS, One);
2845 ShAmt =
B.buildUMin(
S32, LS2, MaxShAmt);
2847 ShAmt =
B.buildCTLZ(
S32, Unmerge.getReg(1));
2848 auto Norm =
B.buildShl(
S64, Src, ShAmt);
2849 auto Unmerge2 =
B.buildUnmerge({
S32,
S32}, Norm);
2850 auto Adjust =
B.buildUMin(
S32, One, Unmerge2.getReg(0));
2851 auto Norm2 =
B.buildOr(
S32, Unmerge2.getReg(1), Adjust);
2852 auto FVal =
Signed ?
B.buildSITOFP(
S32, Norm2) :
B.buildUITOFP(
S32, Norm2);
2853 auto Scale =
B.buildSub(
S32, ThirtyTwo, ShAmt);
2854 B.buildFLdexp(Dst, FVal, Scale);
2855 MI.eraseFromParent();
2875 unsigned Flags =
MI.getFlags();
2886 auto Trunc =
B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2894 Sign =
B.buildAShr(
S32, Src,
B.buildConstant(
S32, 31));
2895 Trunc =
B.buildFAbs(
S32, Trunc, Flags);
2899 K0 =
B.buildFConstant(
2901 K1 =
B.buildFConstant(
2904 K0 =
B.buildFConstant(
2906 K1 =
B.buildFConstant(
2910 auto Mul =
B.buildFMul(SrcLT, Trunc, K0, Flags);
2911 auto FloorMul =
B.buildFFloor(SrcLT,
Mul, Flags);
2912 auto Fma =
B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2915 :
B.buildFPTOUI(
S32, FloorMul);
2916 auto Lo =
B.buildFPTOUI(
S32, Fma);
2920 Sign =
B.buildMergeLikeInstr(
S64, {Sign, Sign});
2922 B.buildSub(Dst,
B.buildXor(
S64,
B.buildMergeLikeInstr(
S64, {Lo, Hi}), Sign),
2925 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
2926 MI.eraseFromParent();
2958 unsigned StartIdx =
Offset / 32;
2960 auto Unmerge =
B.buildUnmerge(
LLT::scalar(32), SrcReg);
2962 if (DstCount == 1) {
2964 B.buildIntToPtr(DstReg, Unmerge.getReg(StartIdx));
2969 for (
unsigned I = 0;
I < DstCount; ++
I)
2970 MergeVec.
push_back(Unmerge.getReg(StartIdx +
I));
2971 B.buildMergeLikeInstr(DstReg, MergeVec);
2974 MI.eraseFromParent();
2984 Register InsertSrc =
MI.getOperand(2).getReg();
2993 if (
Offset % 32 != 0 || DstSize % 32 != 0 || InsertSize % 32 != 0)
2997 unsigned DstCount = DstSize / 32;
2998 unsigned InsertCount = InsertSize / 32;
2999 unsigned StartIdx =
Offset / 32;
3001 auto SrcUnmerge =
B.buildUnmerge(
S32, SrcReg);
3004 for (
unsigned I = 0;
I < StartIdx; ++
I)
3007 if (InsertCount == 1) {
3011 InsertSrc =
B.buildPtrToInt(
S32, InsertSrc).getReg(0);
3014 auto InsertUnmerge =
B.buildUnmerge(
S32, InsertSrc);
3015 for (
unsigned I = 0;
I < InsertCount; ++
I)
3019 for (
unsigned I = StartIdx + InsertCount;
I < DstCount; ++
I)
3022 B.buildMergeLikeInstr(DstReg, MergeVec);
3024 MI.eraseFromParent();
3051 auto IntVec =
B.buildPtrToInt(IntVecTy, Vec);
3052 auto IntElt =
B.buildExtractVectorElement(IntTy, IntVec,
MI.getOperand(2));
3053 B.buildIntToPtr(Dst, IntElt);
3055 MI.eraseFromParent();
3062 std::optional<ValueAndVReg> MaybeIdxVal =
3066 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3069 auto Unmerge =
B.buildUnmerge(EltTy, Vec);
3070 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
3075 MI.eraseFromParent();
3104 auto IntVecSource =
B.buildPtrToInt(IntVecTy, Vec);
3105 auto IntIns =
B.buildPtrToInt(IntTy, Ins);
3106 auto IntVecDest =
B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
3108 B.buildIntToPtr(Dst, IntVecDest);
3109 MI.eraseFromParent();
3116 std::optional<ValueAndVReg> MaybeIdxVal =
3121 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3124 if (IdxVal < NumElts) {
3126 for (
unsigned i = 0; i < NumElts; ++i)
3128 B.buildUnmerge(SrcRegs, Vec);
3130 SrcRegs[IdxVal] =
MI.getOperand(2).getReg();
3131 B.buildMergeLikeInstr(Dst, SrcRegs);
3136 MI.eraseFromParent();
3147 unsigned Flags =
MI.getFlags();
3151 if (ST.hasTrigReducedRange()) {
3152 auto MulVal =
B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
3153 TrigVal =
B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
3154 .addUse(MulVal.getReg(0))
3158 TrigVal =
B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
3161 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
3165 MI.eraseFromParent();
3173 unsigned GAFlags)
const {
3202 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
3204 if (ST.has64BitLiterals()) {
3208 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
3212 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);
3221 if (!
B.getMRI()->getRegClassOrNull(PCReg))
3222 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
3225 B.buildExtract(DstReg, PCReg, 0);
3235 if (RequiresHighHalf && ST.has64BitLiterals()) {
3237 MRI.
setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
3238 B.buildInstr(AMDGPU::S_MOV_B64)
3253 MRI.
setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
3256 B.buildInstr(AMDGPU::S_MOV_B32)
3261 if (RequiresHighHalf) {
3263 "Must provide a 64-bit pointer type!");
3266 MRI.
setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
3268 B.buildInstr(AMDGPU::S_MOV_B32)
3279 MRI.
setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
3281 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
3285 if (AddrDst != DstReg)
3286 B.buildCast(DstReg, AddrDst);
3287 }
else if (AddrLo != DstReg) {
3290 B.buildCast(DstReg, AddrLo);
3299 unsigned AS = Ty.getAddressSpace();
3307 GV->
getName() !=
"llvm.amdgcn.module.lds" &&
3311 Fn,
"local memory global used by non-kernel function",
3320 B.buildUndef(DstReg);
3321 MI.eraseFromParent();
3345 auto Sz =
B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {
S32});
3346 B.buildIntToPtr(DstReg, Sz);
3347 MI.eraseFromParent();
3353 MI.eraseFromParent();
3357 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3359 MI.eraseFromParent();
3367 MI.eraseFromParent();
3373 MI.eraseFromParent();
3389 if (Ty.getSizeInBits() == 32) {
3391 auto Load =
B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3392 B.buildExtract(DstReg, Load, 0);
3394 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3396 MI.eraseFromParent();
3419 auto Cast =
B.buildAddrSpaceCast(ConstPtr, PtrReg);
3421 MI.getOperand(1).setReg(Cast.getReg(0));
3426 if (
MI.getOpcode() != AMDGPU::G_LOAD)
3452 if (WideMemSize == ValSize) {
3458 MI.setMemRefs(MF, {WideMMO});
3464 if (ValSize > WideMemSize)
3471 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3472 B.buildTrunc(ValReg, WideLoad).getReg(0);
3479 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3480 B.buildExtract(ValReg, WideLoad, 0);
3484 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3485 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3489 MI.eraseFromParent();
3502 Register DataReg =
MI.getOperand(0).getReg();
3547 "this should not have been custom lowered");
3552 Register PackedVal =
B.buildBuildVector(VecTy, { NewVal, CmpVal }).
getReg(0);
3554 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3558 .setMemRefs(
MI.memoperands());
3560 MI.eraseFromParent();
3568 switch (
DefMI->getOpcode()) {
3569 case TargetOpcode::G_INTRINSIC: {
3571 case Intrinsic::amdgcn_frexp_mant:
3572 case Intrinsic::amdgcn_log:
3573 case Intrinsic::amdgcn_log_clamp:
3574 case Intrinsic::amdgcn_exp2:
3575 case Intrinsic::amdgcn_sqrt:
3583 case TargetOpcode::G_FSQRT:
3585 case TargetOpcode::G_FFREXP: {
3586 if (
DefMI->getOperand(0).getReg() == Src)
3590 case TargetOpcode::G_FPEXT: {
3611std::pair<Register, Register>
3613 unsigned Flags)
const {
3618 auto SmallestNormal =
B.buildFConstant(
3620 auto IsLtSmallestNormal =
3623 auto Scale32 =
B.buildFConstant(
F32, 0x1.0p+32);
3624 auto One =
B.buildFConstant(
F32, 1.0);
3626 B.buildSelect(
F32, IsLtSmallestNormal, Scale32, One, Flags);
3627 auto ScaledInput =
B.buildFMul(
F32, Src, ScaleFactor, Flags);
3629 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3642 LLT Ty =
B.getMRI()->getType(Dst);
3643 unsigned Flags =
MI.getFlags();
3648 auto Ext =
B.buildFPExt(
F32, Src, Flags);
3649 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_log, {
F32})
3650 .addUse(Ext.getReg(0))
3652 B.buildFPTrunc(Dst,
Log2, Flags);
3653 MI.eraseFromParent();
3661 B.buildIntrinsic(Intrinsic::amdgcn_log, {
MI.getOperand(0)})
3664 MI.eraseFromParent();
3668 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3669 .addUse(ScaledInput)
3672 auto ThirtyTwo =
B.buildFConstant(Ty, 32.0);
3673 auto Zero =
B.buildFConstant(Ty, 0.0);
3675 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3676 B.buildFSub(Dst,
Log2, ResultOffset, Flags);
3678 MI.eraseFromParent();
3684 auto FMul =
B.buildFMul(Ty,
X,
Y, Flags);
3685 return B.buildFAdd(Ty,
FMul, Z, Flags).getReg(0);
3690 const bool IsLog10 =
MI.getOpcode() == TargetOpcode::G_FLOG10;
3691 assert(IsLog10 ||
MI.getOpcode() == TargetOpcode::G_FLOG);
3696 unsigned Flags =
MI.getFlags();
3709 auto PromoteSrc =
B.buildFPExt(
F32,
X);
3711 B.buildFPTrunc(Dst, LogVal);
3716 MI.eraseFromParent();
3725 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(
X).setMIFlags(Flags);
3728 if (ST.hasFastFMAF32()) {
3730 const float c_log10 = 0x1.344134p-2f;
3731 const float cc_log10 = 0x1.09f79ep-26f;
3734 const float c_log = 0x1.62e42ep-1f;
3735 const float cc_log = 0x1.efa39ep-25f;
3737 auto C =
B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3738 auto CC =
B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3742 R =
B.buildFMul(Ty,
Y,
C, NewFlags).getReg(0);
3743 auto NegR =
B.buildFNeg(Ty, R, NewFlags);
3744 auto FMA0 =
B.buildFMA(Ty,
Y,
C, NegR, NewFlags);
3745 auto FMA1 =
B.buildFMA(Ty,
Y, CC, FMA0, NewFlags);
3746 R =
B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0);
3749 const float ch_log10 = 0x1.344000p-2f;
3750 const float ct_log10 = 0x1.3509f6p-18f;
3753 const float ch_log = 0x1.62e000p-1f;
3754 const float ct_log = 0x1.0bfbe8p-15f;
3756 auto CH =
B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3757 auto CT =
B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3759 auto MaskConst =
B.buildConstant(Ty, 0xfffff000);
3760 auto YH =
B.buildAnd(Ty,
Y, MaskConst);
3761 auto YT =
B.buildFSub(Ty,
Y, YH, Flags);
3765 auto YTCT =
B.buildFMul(Ty, YT, CT, NewFlags);
3768 getMad(
B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags);
3770 R =
getMad(
B, Ty, YH.getReg(0),
CH.getReg(0), Mad1, NewFlags);
3773 const bool IsFiniteOnly =
3776 if (!IsFiniteOnly) {
3779 auto Fabs =
B.buildFAbs(Ty,
Y);
3782 R =
B.buildSelect(Ty, IsFinite, R,
Y, Flags).getReg(0);
3786 auto Zero =
B.buildFConstant(Ty, 0.0);
3788 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3789 auto Shift =
B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3790 B.buildFSub(Dst, R, Shift, Flags);
3792 B.buildCopy(Dst, R);
3795 MI.eraseFromParent();
3801 unsigned Flags)
const {
3802 const double Log2BaseInverted =
3805 LLT Ty =
B.getMRI()->getType(Dst);
3810 auto LogSrc =
B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3813 auto ScaledResultOffset =
B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3814 auto Zero =
B.buildFConstant(Ty, 0.0);
3816 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3817 auto Log2Inv =
B.buildFConstant(Ty, Log2BaseInverted);
3819 if (ST.hasFastFMAF32())
3820 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3822 auto Mul =
B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3823 B.buildFAdd(Dst,
Mul, ResultOffset, Flags);
3831 ?
B.buildFLog2(Ty, Src, Flags)
3832 :
B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3835 auto Log2BaseInvertedOperand =
B.buildFConstant(Ty, Log2BaseInverted);
3836 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3847 unsigned Flags =
MI.getFlags();
3848 LLT Ty =
B.getMRI()->getType(Dst);
3858 auto Ext =
B.buildFPExt(
F32, Src, Flags);
3859 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {
F32})
3860 .addUse(Ext.getReg(0))
3862 B.buildFPTrunc(Dst,
Log2, Flags);
3863 MI.eraseFromParent();
3873 MI.eraseFromParent();
3881 auto RangeCheckConst =
B.buildFConstant(Ty, -0x1.f80000p+6f);
3883 RangeCheckConst, Flags);
3885 auto SixtyFour =
B.buildFConstant(Ty, 0x1.0p+6f);
3886 auto Zero =
B.buildFConstant(Ty, 0.0);
3887 auto AddOffset =
B.buildSelect(
F32, NeedsScaling, SixtyFour, Zero, Flags);
3888 auto AddInput =
B.buildFAdd(
F32, Src, AddOffset, Flags);
3890 auto Exp2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3891 .addUse(AddInput.getReg(0))
3894 auto TwoExpNeg64 =
B.buildFConstant(Ty, 0x1.0p-64f);
3895 auto One =
B.buildFConstant(Ty, 1.0);
3896 auto ResultScale =
B.buildSelect(
F32, NeedsScaling, TwoExpNeg64, One, Flags);
3897 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3898 MI.eraseFromParent();
3903 const SrcOp &Src,
unsigned Flags) {
3904 LLT Ty = Dst.getLLTTy(*
B.getMRI());
3907 return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst})
3908 .addUse(Src.getReg())
3911 return B.buildFExp2(Dst, Src, Flags);
3917 bool IsExp10)
const {
3918 LLT Ty =
B.getMRI()->getType(
X);
3922 auto Const =
B.buildFConstant(Ty, IsExp10 ? 0x1.a934f0p+1f :
numbers::log2e);
3923 auto Mul =
B.buildFMul(Ty,
X, Const, Flags);
3930 LLT Ty =
B.getMRI()->getType(Dst);
3937 auto Threshold =
B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3940 auto ScaleOffset =
B.buildFConstant(Ty, 0x1.0p+6f);
3941 auto ScaledX =
B.buildFAdd(Ty,
X, ScaleOffset, Flags);
3942 auto AdjustedX =
B.buildSelect(Ty, NeedsScaling, ScaledX,
X, Flags);
3945 auto ExpInput =
B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3947 auto Exp2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3948 .addUse(ExpInput.getReg(0))
3951 auto ResultScaleFactor =
B.buildFConstant(Ty, 0x1.969d48p-93f);
3952 auto AdjustedResult =
B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3953 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3959 unsigned Flags)
const {
3960 LLT Ty =
B.getMRI()->getType(Dst);
3965 auto K0 =
B.buildFConstant(Ty, 0x1.a92000p+1f);
3966 auto K1 =
B.buildFConstant(Ty, 0x1.4f0978p-11f);
3968 auto Mul1 =
B.buildFMul(Ty,
X, K1, Flags);
3969 auto Exp2_1 =
buildExp(
B, Ty, Mul1, Flags);
3970 auto Mul0 =
B.buildFMul(Ty,
X, K0, Flags);
3971 auto Exp2_0 =
buildExp(
B, Ty, Mul0, Flags);
3972 B.buildFMul(Dst, Exp2_0, Exp2_1, Flags);
3982 auto Threshold =
B.buildFConstant(Ty, -0x1.2f7030p+5f);
3986 auto ScaleOffset =
B.buildFConstant(Ty, 0x1.0p+5f);
3987 auto ScaledX =
B.buildFAdd(Ty,
X, ScaleOffset, Flags);
3988 auto AdjustedX =
B.buildSelect(Ty, NeedsScaling, ScaledX,
X);
3990 auto K0 =
B.buildFConstant(Ty, 0x1.a92000p+1f);
3991 auto K1 =
B.buildFConstant(Ty, 0x1.4f0978p-11f);
3993 auto Mul1 =
B.buildFMul(Ty, AdjustedX, K1, Flags);
3994 auto Exp2_1 =
buildExp(
B, Ty, Mul1, Flags);
3995 auto Mul0 =
B.buildFMul(Ty, AdjustedX, K0, Flags);
3996 auto Exp2_0 =
buildExp(
B, Ty, Mul0, Flags);
3998 auto MulExps =
B.buildFMul(Ty, Exp2_0, Exp2_1, Flags);
3999 auto ResultScaleFactor =
B.buildFConstant(Ty, 0x1.9f623ep-107f);
4000 auto AdjustedResult =
B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags);
4002 B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps);
4021 if (
MI.getOpcode() == TargetOpcode::G_FEXP2) {
4023 Dn =
B.buildFRint(
S64,
X, Flags).getReg(0);
4025 F =
B.buildFSub(
S64,
X, Dn, Flags).getReg(0);
4027 auto C1 =
B.buildFConstant(
S64,
APFloat(0x1.62e42fefa39efp-1));
4028 auto C2 =
B.buildFConstant(
S64,
APFloat(0x1.abc9e3b39803fp-56));
4029 auto Mul2 =
B.buildFMul(
S64,
F, C2, Flags).getReg(0);
4030 T =
B.buildFMA(
S64,
F, C1, Mul2, Flags).getReg(0);
4032 }
else if (
MI.getOpcode() == TargetOpcode::G_FEXP10) {
4033 auto C1 =
B.buildFConstant(
S64,
APFloat(0x1.a934f0979a371p+1));
4034 auto Mul =
B.buildFMul(
S64,
X, C1, Flags).getReg(0);
4035 Dn =
B.buildFRint(
S64,
Mul, Flags).getReg(0);
4037 auto NegDn =
B.buildFNeg(
S64, Dn, Flags).getReg(0);
4038 auto C2 =
B.buildFConstant(
S64,
APFloat(-0x1.9dc1da994fd21p-59));
4039 auto C3 =
B.buildFConstant(
S64,
APFloat(0x1.34413509f79ffp-2));
4040 auto Inner =
B.buildFMA(
S64, NegDn, C3,
X, Flags).getReg(0);
4041 F =
B.buildFMA(
S64, NegDn, C2, Inner, Flags).getReg(0);
4043 auto C4 =
B.buildFConstant(
S64,
APFloat(0x1.26bb1bbb55516p+1));
4044 auto C5 =
B.buildFConstant(
S64,
APFloat(-0x1.f48ad494ea3e9p-53));
4045 auto MulF =
B.buildFMul(
S64,
F, C5, Flags).getReg(0);
4046 T =
B.buildFMA(
S64,
F, C4, MulF, Flags).getReg(0);
4049 auto C1 =
B.buildFConstant(
S64,
APFloat(0x1.71547652b82fep+0));
4050 auto Mul =
B.buildFMul(
S64,
X, C1, Flags).getReg(0);
4051 Dn =
B.buildFRint(
S64,
Mul, Flags).getReg(0);
4053 auto NegDn =
B.buildFNeg(
S64, Dn, Flags).getReg(0);
4054 auto C2 =
B.buildFConstant(
S64,
APFloat(0x1.abc9e3b39803fp-56));
4055 auto C3 =
B.buildFConstant(
S64,
APFloat(0x1.62e42fefa39efp-1));
4056 auto Inner =
B.buildFMA(
S64, NegDn, C3,
X, Flags).getReg(0);
4057 T =
B.buildFMA(
S64, NegDn, C2, Inner, Flags).getReg(0);
4061 auto P =
B.buildFConstant(
S64, 0x1.ade156a5dcb37p-26);
4062 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.28af3fca7ab0cp-22),
4064 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.71dee623fde64p-19),
4066 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.a01997c89e6b0p-16),
4068 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.a01a014761f6ep-13),
4070 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.6c16c1852b7b0p-10),
4072 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.1111111122322p-7), Flags);
4073 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.55555555502a1p-5), Flags);
4074 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.5555555555511p-3), Flags);
4075 P =
B.buildFMA(
S64,
T,
P,
B.buildFConstant(
S64, 0x1.000000000000bp-1), Flags);
4077 auto One =
B.buildFConstant(
S64, 1.0);
4078 P =
B.buildFMA(
S64,
T,
P, One, Flags);
4079 P =
B.buildFMA(
S64,
T,
P, One, Flags);
4082 auto DnInt =
B.buildFPTOSI(
S32, Dn);
4083 auto Z =
B.buildFLdexp(
S64,
P, DnInt, Flags);
4090 Z =
B.buildSelect(
S64, CondHi, Z, PInf, Flags);
4097 B.buildSelect(
MI.getOperand(0).getReg(), CondLo, Z, Zero, Flags);
4099 MI.eraseFromParent();
4107 const unsigned Flags =
MI.getFlags();
4119 const bool IsExp10 =
MI.getOpcode() == TargetOpcode::G_FEXP10;
4127 MI.eraseFromParent();
4138 auto Ext =
B.buildFPExt(
F32,
X, Flags);
4141 B.buildFPTrunc(Dst, Lowered, Flags);
4142 MI.eraseFromParent();
4153 MI.eraseFromParent();
4181 const unsigned FlagsNoContract = Flags &
~MachineInstr::FmContract;
4184 if (ST.hasFastFMAF32()) {
4186 const float cc_exp = 0x1.4ae0bep-26f;
4187 const float c_exp10 = 0x1.a934f0p+1f;
4188 const float cc_exp10 = 0x1.2f346ep-24f;
4190 auto C =
B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
4191 PH =
B.buildFMul(Ty,
X,
C, Flags).getReg(0);
4192 auto NegPH =
B.buildFNeg(Ty, PH, Flags);
4193 auto FMA0 =
B.buildFMA(Ty,
X,
C, NegPH, Flags);
4195 auto CC =
B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
4196 PL =
B.buildFMA(Ty,
X, CC, FMA0, Flags).getReg(0);
4198 const float ch_exp = 0x1.714000p+0f;
4199 const float cl_exp = 0x1.47652ap-12f;
4201 const float ch_exp10 = 0x1.a92000p+1f;
4202 const float cl_exp10 = 0x1.4f0978p-11f;
4204 auto MaskConst =
B.buildConstant(Ty, 0xfffff000);
4205 auto XH =
B.buildAnd(Ty,
X, MaskConst);
4206 auto XL =
B.buildFSub(Ty,
X, XH, Flags);
4208 auto CH =
B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
4209 PH =
B.buildFMul(Ty, XH,
CH, Flags).getReg(0);
4211 auto CL =
B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
4212 auto XLCL =
B.buildFMul(Ty, XL, CL, Flags);
4215 getMad(
B, Ty, XL.getReg(0),
CH.getReg(0), XLCL.getReg(0), Flags);
4216 PL =
getMad(
B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
4219 auto E =
B.buildIntrinsicRoundeven(Ty, PH, Flags);
4222 auto PHSubE =
B.buildFSub(Ty, PH, E, FlagsNoContract);
4223 auto A =
B.buildFAdd(Ty, PHSubE, PL, Flags);
4226 auto Exp2 =
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
4227 .addUse(
A.getReg(0))
4229 auto R =
B.buildFLdexp(Ty, Exp2, IntE, Flags);
4231 auto UnderflowCheckConst =
4232 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
4233 auto Zero =
B.buildFConstant(Ty, 0.0);
4237 R =
B.buildSelect(Ty, Underflow, Zero, R);
4240 auto OverflowCheckConst =
4241 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
4246 R =
B.buildSelect(Ty, Overflow, Inf, R, Flags);
4249 B.buildCopy(Dst, R);
4250 MI.eraseFromParent();
4259 unsigned Flags =
MI.getFlags();
4260 LLT Ty =
B.getMRI()->getType(Dst);
4265 auto Log =
B.buildFLog2(
F32, Src0, Flags);
4266 auto Mul =
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {
F32})
4267 .addUse(Log.getReg(0))
4270 B.buildFExp2(Dst,
Mul, Flags);
4271 }
else if (Ty == F16) {
4273 auto Log =
B.buildFLog2(F16, Src0, Flags);
4274 auto Ext0 =
B.buildFPExt(
F32, Log, Flags);
4275 auto Ext1 =
B.buildFPExt(
F32, Src1, Flags);
4276 auto Mul =
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {
F32})
4277 .addUse(Ext0.getReg(0))
4278 .addUse(Ext1.getReg(0))
4280 B.buildFExp2(Dst,
B.buildFPTrunc(F16,
Mul), Flags);
4284 MI.eraseFromParent();
4292 ModSrc = SrcFNeg->getOperand(1).getReg();
4294 ModSrc = SrcFAbs->getOperand(1).getReg();
4296 ModSrc = SrcFAbs->getOperand(1).getReg();
4307 Register OrigSrc =
MI.getOperand(1).getReg();
4308 unsigned Flags =
MI.getFlags();
4310 "this should not have been custom lowered");
4320 auto Fract =
B.buildIntrinsic(Intrinsic::amdgcn_fract, {
F64})
4340 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
4342 B.buildFMinNum(Min, Fract, Const, Flags);
4347 CorrectedFract =
B.buildSelect(
F64, IsNan, ModSrc, Min, Flags).getReg(0);
4350 auto NegFract =
B.buildFNeg(
F64, CorrectedFract, Flags);
4351 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
4353 MI.eraseFromParent();
4369 if (
MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
4371 Src0 =
B.buildTrunc(
S16,
MI.getOperand(1).getReg()).getReg(0);
4372 Src1 =
B.buildTrunc(
S16,
MI.getOperand(2).getReg()).getReg(0);
4375 auto Merge =
B.buildMergeLikeInstr(
S32, {Src0, Src1});
4376 B.buildBitcast(Dst,
Merge);
4378 MI.eraseFromParent();
4395 bool UsePartialMad64_32,
4396 bool SeparateOddAlignedProducts)
const {
4411 auto getZero32 = [&]() ->
Register {
4413 Zero32 =
B.buildConstant(
S32, 0).getReg(0);
4416 auto getZero64 = [&]() ->
Register {
4418 Zero64 =
B.buildConstant(
S64, 0).getReg(0);
4423 for (
unsigned i = 0; i < Src0.
size(); ++i) {
4434 if (CarryIn.empty())
4437 bool HaveCarryOut =
true;
4439 if (CarryIn.size() == 1) {
4441 LocalAccum =
B.buildZExt(
S32, CarryIn[0]).getReg(0);
4445 CarryAccum = getZero32();
4447 CarryAccum =
B.buildZExt(
S32, CarryIn[0]).getReg(0);
4448 for (
unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
4450 B.buildUAdde(
S32,
S1, CarryAccum, getZero32(), CarryIn[i])
4455 LocalAccum = getZero32();
4456 HaveCarryOut =
false;
4461 B.buildUAdde(
S32,
S1, CarryAccum, LocalAccum, CarryIn.back());
4462 LocalAccum =
Add.getReg(0);
4476 auto buildMadChain =
4479 assert((DstIndex + 1 < Accum.
size() && LocalAccum.size() == 2) ||
4480 (DstIndex + 1 >= Accum.
size() && LocalAccum.size() == 1));
4487 if (LocalAccum.size() == 1 &&
4488 (!UsePartialMad64_32 || !CarryIn.empty())) {
4491 unsigned j1 = DstIndex - j0;
4492 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4496 auto Mul =
B.buildMul(
S32, Src0[j0], Src1[j1]);
4498 LocalAccum[0] =
Mul.getReg(0);
4500 if (CarryIn.empty()) {
4501 LocalAccum[0] =
B.buildAdd(
S32, LocalAccum[0],
Mul).getReg(0);
4504 B.buildUAdde(
S32,
S1, LocalAccum[0],
Mul, CarryIn.back())
4510 }
while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4514 if (j0 <= DstIndex) {
4515 bool HaveSmallAccum =
false;
4518 if (LocalAccum[0]) {
4519 if (LocalAccum.size() == 1) {
4520 Tmp =
B.buildAnyExt(
S64, LocalAccum[0]).getReg(0);
4521 HaveSmallAccum =
true;
4522 }
else if (LocalAccum[1]) {
4523 Tmp =
B.buildMergeLikeInstr(
S64, LocalAccum).getReg(0);
4524 HaveSmallAccum =
false;
4526 Tmp =
B.buildZExt(
S64, LocalAccum[0]).getReg(0);
4527 HaveSmallAccum =
true;
4530 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4532 HaveSmallAccum =
true;
4536 unsigned j1 = DstIndex - j0;
4537 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4541 auto Mad =
B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {
S64,
S1},
4542 {Src0[j0], Src1[j1], Tmp});
4543 Tmp = Mad.getReg(0);
4544 if (!HaveSmallAccum)
4545 CarryOut.push_back(Mad.getReg(1));
4546 HaveSmallAccum =
false;
4549 }
while (j0 <= DstIndex);
4551 auto Unmerge =
B.buildUnmerge(
S32, Tmp);
4552 LocalAccum[0] = Unmerge.getReg(0);
4553 if (LocalAccum.size() > 1)
4554 LocalAccum[1] = Unmerge.getReg(1);
4581 for (
unsigned i = 0; i <= Accum.
size() / 2; ++i) {
4582 Carry OddCarryIn = std::move(OddCarry);
4583 Carry EvenCarryIn = std::move(EvenCarry);
4588 if (2 * i < Accum.
size()) {
4589 auto LocalAccum = Accum.
drop_front(2 * i).take_front(2);
4590 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4595 if (!SeparateOddAlignedProducts) {
4596 auto LocalAccum = Accum.
drop_front(2 * i - 1).take_front(2);
4597 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4599 bool IsHighest = 2 * i >= Accum.
size();
4602 .take_front(IsHighest ? 1 : 2);
4603 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4609 Lo =
B.buildUAddo(
S32,
S1, Accum[2 * i - 1], SeparateOddOut[0]);
4611 Lo =
B.buildAdd(
S32, Accum[2 * i - 1], SeparateOddOut[0]);
4613 Lo =
B.buildUAdde(
S32,
S1, Accum[2 * i - 1], SeparateOddOut[0],
4616 Accum[2 * i - 1] =
Lo->getOperand(0).getReg();
4619 auto Hi =
B.buildUAdde(
S32,
S1, Accum[2 * i], SeparateOddOut[1],
4620 Lo->getOperand(1).getReg());
4621 Accum[2 * i] =
Hi.getReg(0);
4622 SeparateOddCarry =
Hi.getReg(1);
4629 if (
Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4630 EvenCarryIn.push_back(CarryOut);
4632 if (2 * i < Accum.
size()) {
4633 if (
Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4634 OddCarry.push_back(CarryOut);
4646 assert(ST.hasMad64_32());
4647 assert(
MI.getOpcode() == TargetOpcode::G_MUL);
4659 unsigned Size = Ty.getSizeInBits();
4660 if (ST.hasVMulU64Inst() &&
Size == 64)
4663 unsigned NumParts =
Size / 32;
4675 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4679 for (
unsigned i = 0; i < NumParts; ++i) {
4683 B.buildUnmerge(Src0Parts, Src0);
4684 B.buildUnmerge(Src1Parts, Src1);
4687 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4688 SeparateOddAlignedProducts);
4690 B.buildMergeLikeInstr(DstReg, AccumRegs);
4691 MI.eraseFromParent();
4706 unsigned NewOpc =
MI.getOpcode() == AMDGPU::G_CTLZ
4707 ? AMDGPU::G_AMDGPU_FFBH_U32
4708 : AMDGPU::G_AMDGPU_FFBL_B32;
4709 auto Tmp =
B.buildInstr(NewOpc, {DstTy}, {Src});
4712 MI.eraseFromParent();
4722 TypeSize NumBits = SrcTy.getSizeInBits();
4726 auto ShiftAmt =
B.buildConstant(
S32, 32u - NumBits);
4727 auto Extend =
B.buildAnyExt(
S32, {Src}).
getReg(0u);
4728 auto Shift =
B.buildShl(
S32, Extend, ShiftAmt);
4729 auto Ctlz =
B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {
S32}, {Shift});
4730 B.buildTrunc(Dst, Ctlz);
4731 MI.eraseFromParent();
4742 assert(SrcTy ==
S32 &&
"legalizeCTLS only supports s32");
4743 unsigned BitWidth = SrcTy.getSizeInBits();
4745 auto Sffbh =
B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {
S32}).addUse(Src);
4747 B.buildSub(Dst, Clamped,
B.buildConstant(
S32, 1));
4748 MI.eraseFromParent();
4754 if (
MI.getOpcode() != TargetOpcode::G_XOR)
4757 return ConstVal == -1;
4764 Register CondDef =
MI.getOperand(0).getReg();
4783 if (
UseMI->getParent() != Parent ||
UseMI->getOpcode() != AMDGPU::G_BRCOND)
4792 UncondBrTarget = &*NextMBB;
4794 if (
Next->getOpcode() != AMDGPU::G_BR)
4813 *ArgRC,
B.getDebugLoc(), ArgTy);
4817 const unsigned Mask = Arg->
getMask();
4825 auto ShiftAmt =
B.buildConstant(
S32, Shift);
4826 AndMaskSrc =
B.buildLShr(
S32, LiveIn, ShiftAmt).getReg(0);
4829 B.buildAnd(DstReg, AndMaskSrc,
B.buildConstant(
S32, Mask >> Shift));
4831 B.buildCopy(DstReg, LiveIn);
4841 if (!ST.hasClusters()) {
4844 MI.eraseFromParent();
4864 auto One =
B.buildConstant(
S32, 1);
4865 auto ClusterSizeXYZ =
B.buildAdd(
S32, ClusterMaxIdXYZ, One);
4866 auto GlobalIdXYZ =
B.buildAdd(
S32, ClusterWorkGroupIdXYZ,
4867 B.buildMul(
S32, ClusterIdXYZ, ClusterSizeXYZ));
4874 B.buildCopy(DstReg, GlobalIdXYZ);
4875 MI.eraseFromParent();
4879 B.buildCopy(DstReg, ClusterIdXYZ);
4880 MI.eraseFromParent();
4885 unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
4887 MRI.
setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
4888 B.buildInstr(AMDGPU::S_GETREG_B32_const)
4890 .addImm(ClusterIdField);
4891 auto Zero =
B.buildConstant(
S32, 0);
4894 B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
4895 MI.eraseFromParent();
4937 auto LoadConstant = [&](
unsigned N) {
4938 B.buildConstant(DstReg,
N);
4942 if (ST.hasArchitectedSGPRs() &&
4949 Arg = &WorkGroupIDX;
4950 ArgRC = &AMDGPU::SReg_32RegClass;
4954 Arg = &WorkGroupIDY;
4955 ArgRC = &AMDGPU::SReg_32RegClass;
4959 Arg = &WorkGroupIDZ;
4960 ArgRC = &AMDGPU::SReg_32RegClass;
4964 if (HasFixedDims && ClusterDims.
getDims()[0] == 1)
4965 return LoadConstant(0);
4966 Arg = &ClusterWorkGroupIDX;
4967 ArgRC = &AMDGPU::SReg_32RegClass;
4971 if (HasFixedDims && ClusterDims.
getDims()[1] == 1)
4972 return LoadConstant(0);
4973 Arg = &ClusterWorkGroupIDY;
4974 ArgRC = &AMDGPU::SReg_32RegClass;
4978 if (HasFixedDims && ClusterDims.
getDims()[2] == 1)
4979 return LoadConstant(0);
4980 Arg = &ClusterWorkGroupIDZ;
4981 ArgRC = &AMDGPU::SReg_32RegClass;
4986 return LoadConstant(ClusterDims.
getDims()[0] - 1);
4987 Arg = &ClusterWorkGroupMaxIDX;
4988 ArgRC = &AMDGPU::SReg_32RegClass;
4993 return LoadConstant(ClusterDims.
getDims()[1] - 1);
4994 Arg = &ClusterWorkGroupMaxIDY;
4995 ArgRC = &AMDGPU::SReg_32RegClass;
5000 return LoadConstant(ClusterDims.
getDims()[2] - 1);
5001 Arg = &ClusterWorkGroupMaxIDZ;
5002 ArgRC = &AMDGPU::SReg_32RegClass;
5006 Arg = &ClusterWorkGroupMaxFlatID;
5007 ArgRC = &AMDGPU::SReg_32RegClass;
5022 return LoadConstant(0);
5027 B.buildUndef(DstReg);
5031 if (!Arg->isRegister() || !Arg->getRegister().isValid())
5043 MI.eraseFromParent();
5049 B.buildConstant(
MI.getOperand(0).getReg(),
C);
5050 MI.eraseFromParent();
5057 unsigned MaxID = ST.getMaxWorkitemID(
B.getMF().getFunction(), Dim);
5071 B.buildUndef(DstReg);
5072 MI.eraseFromParent();
5076 if (Arg->isMasked()) {
5090 MI.eraseFromParent();
5105 Register KernArgReg =
B.getMRI()->createGenericVirtualRegister(PtrTy);
5114 return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);
5122 Align Alignment)
const {
5126 "unexpected kernarg parameter type");
5133 MI.eraseFromParent();
5168 auto FloatY =
B.buildUITOFP(
S32,
Y);
5169 auto RcpIFlag =
B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {
S32}, {FloatY});
5171 auto ScaledY =
B.buildFMul(
S32, RcpIFlag, Scale);
5172 auto Z =
B.buildFPTOUI(
S32, ScaledY);
5175 auto NegY =
B.buildSub(
S32,
B.buildConstant(
S32, 0),
Y);
5176 auto NegYZ =
B.buildMul(
S32, NegY, Z);
5177 Z =
B.buildAdd(
S32, Z,
B.buildUMulH(
S32, Z, NegYZ));
5180 auto Q =
B.buildUMulH(
S32,
X, Z);
5181 auto R =
B.buildSub(
S32,
X,
B.buildMul(
S32, Q,
Y));
5184 auto One =
B.buildConstant(
S32, 1);
5187 Q =
B.buildSelect(
S32,
Cond,
B.buildAdd(
S32, Q, One), Q);
5193 B.buildSelect(DstDivReg,
Cond,
B.buildAdd(
S32, Q, One), Q);
5196 B.buildSelect(DstRemReg,
Cond,
B.buildSub(
S32, R,
Y), R);
5215 auto Unmerge =
B.buildUnmerge(
S32, Val);
5217 auto CvtLo =
B.buildUITOFP(
S32, Unmerge.getReg(0));
5218 auto CvtHi =
B.buildUITOFP(
S32, Unmerge.getReg(1));
5220 auto Mad =
B.buildFMAD(
5224 auto Rcp =
B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {
S32}, {Mad});
5225 auto Mul1 =
B.buildFMul(
5229 auto Mul2 =
B.buildFMul(
5231 auto Trunc =
B.buildIntrinsicTrunc(
S32, Mul2);
5234 auto Mad2 =
B.buildFMAD(
5238 auto ResultLo =
B.buildFPTOUI(
S32, Mad2);
5239 auto ResultHi =
B.buildFPTOUI(
S32, Trunc);
5241 return {ResultLo.getReg(0), ResultHi.getReg(0)};
5256 auto Rcp =
B.buildMergeLikeInstr(
S64, {RcpLo, RcpHi});
5258 auto Zero64 =
B.buildConstant(
S64, 0);
5259 auto NegDenom =
B.buildSub(
S64, Zero64, Denom);
5261 auto MulLo1 =
B.buildMul(
S64, NegDenom, Rcp);
5262 auto MulHi1 =
B.buildUMulH(
S64, Rcp, MulLo1);
5264 auto UnmergeMulHi1 =
B.buildUnmerge(
S32, MulHi1);
5265 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
5266 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
5268 auto Add1_Lo =
B.buildUAddo(
S32,
S1, RcpLo, MulHi1_Lo);
5269 auto Add1_Hi =
B.buildUAdde(
S32,
S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
5270 auto Add1 =
B.buildMergeLikeInstr(
S64, {Add1_Lo, Add1_Hi});
5272 auto MulLo2 =
B.buildMul(
S64, NegDenom, Add1);
5273 auto MulHi2 =
B.buildUMulH(
S64, Add1, MulLo2);
5274 auto UnmergeMulHi2 =
B.buildUnmerge(
S32, MulHi2);
5275 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
5276 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
5278 auto Zero32 =
B.buildConstant(
S32, 0);
5279 auto Add2_Lo =
B.buildUAddo(
S32,
S1, Add1_Lo, MulHi2_Lo);
5280 auto Add2_Hi =
B.buildUAdde(
S32,
S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
5281 auto Add2 =
B.buildMergeLikeInstr(
S64, {Add2_Lo, Add2_Hi});
5283 auto UnmergeNumer =
B.buildUnmerge(
S32, Numer);
5284 Register NumerLo = UnmergeNumer.getReg(0);
5285 Register NumerHi = UnmergeNumer.getReg(1);
5287 auto MulHi3 =
B.buildUMulH(
S64, Numer, Add2);
5288 auto Mul3 =
B.buildMul(
S64, Denom, MulHi3);
5289 auto UnmergeMul3 =
B.buildUnmerge(
S32, Mul3);
5290 Register Mul3_Lo = UnmergeMul3.getReg(0);
5291 Register Mul3_Hi = UnmergeMul3.getReg(1);
5292 auto Sub1_Lo =
B.buildUSubo(
S32,
S1, NumerLo, Mul3_Lo);
5293 auto Sub1_Hi =
B.buildUSube(
S32,
S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
5294 auto Sub1_Mi =
B.buildSub(
S32, NumerHi, Mul3_Hi);
5295 auto Sub1 =
B.buildMergeLikeInstr(
S64, {Sub1_Lo, Sub1_Hi});
5297 auto UnmergeDenom =
B.buildUnmerge(
S32, Denom);
5298 Register DenomLo = UnmergeDenom.getReg(0);
5299 Register DenomHi = UnmergeDenom.getReg(1);
5302 auto C1 =
B.buildSExt(
S32, CmpHi);
5305 auto C2 =
B.buildSExt(
S32, CmpLo);
5308 auto C3 =
B.buildSelect(
S32, CmpEq, C2, C1);
5315 auto Sub2_Lo =
B.buildUSubo(
S32,
S1, Sub1_Lo, DenomLo);
5316 auto Sub2_Mi =
B.buildUSube(
S32,
S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
5317 auto Sub2_Hi =
B.buildUSube(
S32,
S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
5318 auto Sub2 =
B.buildMergeLikeInstr(
S64, {Sub2_Lo, Sub2_Hi});
5320 auto One64 =
B.buildConstant(
S64, 1);
5321 auto Add3 =
B.buildAdd(
S64, MulHi3, One64);
5327 auto C6 =
B.buildSelect(
5331 auto Add4 =
B.buildAdd(
S64, Add3, One64);
5332 auto Sub3_Lo =
B.buildUSubo(
S32,
S1, Sub2_Lo, DenomLo);
5334 auto Sub3_Mi =
B.buildUSube(
S32,
S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
5335 auto Sub3_Hi =
B.buildUSube(
S32,
S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
5336 auto Sub3 =
B.buildMergeLikeInstr(
S64, {Sub3_Lo, Sub3_Hi});
5342 auto Sel1 =
B.buildSelect(
5349 auto Sel2 =
B.buildSelect(
5360 switch (
MI.getOpcode()) {
5363 case AMDGPU::G_UDIV: {
5364 DstDivReg =
MI.getOperand(0).getReg();
5367 case AMDGPU::G_UREM: {
5368 DstRemReg =
MI.getOperand(0).getReg();
5371 case AMDGPU::G_UDIVREM: {
5372 DstDivReg =
MI.getOperand(0).getReg();
5373 DstRemReg =
MI.getOperand(1).getReg();
5380 const unsigned FirstSrcOpIdx =
MI.getNumExplicitDefs();
5381 Register Num =
MI.getOperand(FirstSrcOpIdx).getReg();
5382 Register Den =
MI.getOperand(FirstSrcOpIdx + 1).getReg();
5392 MI.eraseFromParent();
5403 if (Ty !=
S32 && Ty !=
S64)
5406 const unsigned FirstSrcOpIdx =
MI.getNumExplicitDefs();
5407 Register LHS =
MI.getOperand(FirstSrcOpIdx).getReg();
5408 Register RHS =
MI.getOperand(FirstSrcOpIdx + 1).getReg();
5410 auto SignBitOffset =
B.buildConstant(
S32, Ty.getSizeInBits() - 1);
5411 auto LHSign =
B.buildAShr(Ty, LHS, SignBitOffset);
5412 auto RHSign =
B.buildAShr(Ty, RHS, SignBitOffset);
5414 LHS =
B.buildAdd(Ty, LHS, LHSign).getReg(0);
5415 RHS =
B.buildAdd(Ty, RHS, RHSign).getReg(0);
5417 LHS =
B.buildXor(Ty, LHS, LHSign).getReg(0);
5418 RHS =
B.buildXor(Ty, RHS, RHSign).getReg(0);
5420 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
5421 switch (
MI.getOpcode()) {
5424 case AMDGPU::G_SDIV: {
5425 DstDivReg =
MI.getOperand(0).getReg();
5429 case AMDGPU::G_SREM: {
5430 DstRemReg =
MI.getOperand(0).getReg();
5434 case AMDGPU::G_SDIVREM: {
5435 DstDivReg =
MI.getOperand(0).getReg();
5436 DstRemReg =
MI.getOperand(1).getReg();
5449 auto Sign =
B.buildXor(Ty, LHSign, RHSign).getReg(0);
5450 auto SignXor =
B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
5451 B.buildSub(DstDivReg, SignXor, Sign);
5455 auto Sign = LHSign.getReg(0);
5456 auto SignXor =
B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
5457 B.buildSub(DstRemReg, SignXor, Sign);
5460 MI.eraseFromParent();
5476 if (!AllowInaccurateRcp && ResTy !=
LLT::scalar(16))
5487 if (CLHS->isExactlyValue(1.0)) {
5488 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5492 MI.eraseFromParent();
5497 if (CLHS->isExactlyValue(-1.0)) {
5498 auto FNeg =
B.buildFNeg(ResTy, RHS, Flags);
5499 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5500 .addUse(FNeg.getReg(0))
5503 MI.eraseFromParent();
5510 if (!AllowInaccurateRcp && (ResTy !=
LLT::scalar(16) ||
5515 auto RCP =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5518 B.buildFMul(Res, LHS, RCP, Flags);
5520 MI.eraseFromParent();
5535 if (!AllowInaccurateRcp)
5543 X =
B.buildFConstant(ResTy, 1.0).getReg(0);
5545 Register NegY = IsNegRcp ?
Y :
B.buildFNeg(ResTy,
Y).getReg(0);
5546 auto One =
B.buildFConstant(ResTy, 1.0);
5548 auto R =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5552 R =
B.buildFNeg(ResTy, R);
5554 auto Tmp0 =
B.buildFMA(ResTy, NegY, R, One);
5555 R =
B.buildFMA(ResTy, Tmp0, R, R);
5557 auto Tmp1 =
B.buildFMA(ResTy, NegY, R, One);
5558 R =
B.buildFMA(ResTy, Tmp1, R, R);
5562 B.buildCopy(Res, R);
5563 MI.eraseFromParent();
5567 auto Ret =
B.buildFMul(ResTy,
X, R);
5568 auto Tmp2 =
B.buildFMA(ResTy, NegY, Ret,
X);
5570 B.buildFMA(Res, Tmp2, R, Ret);
5571 MI.eraseFromParent();
5603 auto LHSExt =
B.buildFPExt(
S32, LHS, Flags);
5604 auto RHSExt =
B.buildFPExt(
S32, RHS, Flags);
5605 auto NegRHSExt =
B.buildFNeg(
S32, RHSExt);
5606 auto Rcp =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {
S32})
5607 .addUse(RHSExt.getReg(0))
5609 auto Quot =
B.buildFMul(
S32, LHSExt, Rcp, Flags);
5611 if (ST.hasMadMacF32Insts()) {
5612 Err =
B.buildFMAD(
S32, NegRHSExt, Quot, LHSExt, Flags);
5613 Quot =
B.buildFMAD(
S32, Err, Rcp, Quot, Flags);
5614 Err =
B.buildFMAD(
S32, NegRHSExt, Quot, LHSExt, Flags);
5616 Err =
B.buildFMA(
S32, NegRHSExt, Quot, LHSExt, Flags);
5617 Quot =
B.buildFMA(
S32, Err, Rcp, Quot, Flags);
5618 Err =
B.buildFMA(
S32, NegRHSExt, Quot, LHSExt, Flags);
5620 auto Tmp =
B.buildFMul(
S32, Err, Rcp, Flags);
5621 Tmp =
B.buildAnd(
S32, Tmp,
B.buildConstant(
S32, 0xff800000));
5622 Quot =
B.buildFAdd(
S32, Tmp, Quot, Flags);
5623 auto RDst =
B.buildFPTrunc(
S16, Quot, Flags);
5624 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5625 .addUse(RDst.getReg(0))
5630 MI.eraseFromParent();
5643 unsigned SPDenormMode =
5646 if (ST.hasDenormModeInst()) {
5648 uint32_t DPDenormModeDefault =
Mode.fpDenormModeDPValue();
5650 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
5651 B.buildInstr(AMDGPU::S_DENORM_MODE)
5652 .addImm(NewDenormModeValue);
5655 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
5656 .addImm(SPDenormMode)
5678 auto One =
B.buildFConstant(
S32, 1.0f);
5680 auto DenominatorScaled =
5681 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {
S32,
S1})
5686 auto NumeratorScaled =
5687 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {
S32,
S1})
5693 auto ApproxRcp =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {
S32})
5694 .addUse(DenominatorScaled.getReg(0))
5696 auto NegDivScale0 =
B.buildFNeg(
S32, DenominatorScaled, Flags);
5699 const bool HasDynamicDenormals =
5704 if (!PreservesDenormals) {
5705 if (HasDynamicDenormals) {
5707 B.buildInstr(AMDGPU::S_GETREG_B32)
5708 .addDef(SavedSPDenormMode)
5714 auto Fma0 =
B.buildFMA(
S32, NegDivScale0, ApproxRcp, One, Flags);
5715 auto Fma1 =
B.buildFMA(
S32, Fma0, ApproxRcp, ApproxRcp, Flags);
5716 auto Mul =
B.buildFMul(
S32, NumeratorScaled, Fma1, Flags);
5717 auto Fma2 =
B.buildFMA(
S32, NegDivScale0,
Mul, NumeratorScaled, Flags);
5718 auto Fma3 =
B.buildFMA(
S32, Fma2, Fma1,
Mul, Flags);
5719 auto Fma4 =
B.buildFMA(
S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
5721 if (!PreservesDenormals) {
5722 if (HasDynamicDenormals) {
5723 assert(SavedSPDenormMode);
5724 B.buildInstr(AMDGPU::S_SETREG_B32)
5725 .addReg(SavedSPDenormMode)
5731 auto Fmas =
B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {
S32})
5732 .addUse(Fma4.getReg(0))
5733 .addUse(Fma1.getReg(0))
5734 .addUse(Fma3.getReg(0))
5735 .addUse(NumeratorScaled.getReg(1))
5738 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5739 .addUse(Fmas.getReg(0))
5744 MI.eraseFromParent();
5763 auto One =
B.buildFConstant(
S64, 1.0);
5765 auto DivScale0 =
B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {
S64,
S1})
5771 auto NegDivScale0 =
B.buildFNeg(
S64, DivScale0.getReg(0), Flags);
5773 auto Rcp =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {
S64})
5774 .addUse(DivScale0.getReg(0))
5777 auto Fma0 =
B.buildFMA(
S64, NegDivScale0, Rcp, One, Flags);
5778 auto Fma1 =
B.buildFMA(
S64, Rcp, Fma0, Rcp, Flags);
5779 auto Fma2 =
B.buildFMA(
S64, NegDivScale0, Fma1, One, Flags);
5781 auto DivScale1 =
B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {
S64,
S1})
5787 auto Fma3 =
B.buildFMA(
S64, Fma1, Fma2, Fma1, Flags);
5788 auto Mul =
B.buildFMul(
S64, DivScale1.getReg(0), Fma3, Flags);
5789 auto Fma4 =
B.buildFMA(
S64, NegDivScale0,
Mul, DivScale1.getReg(0), Flags);
5792 if (!ST.hasUsableDivScaleConditionOutput()) {
5798 auto NumUnmerge =
B.buildUnmerge(
S32, LHS);
5799 auto DenUnmerge =
B.buildUnmerge(
S32, RHS);
5800 auto Scale0Unmerge =
B.buildUnmerge(
S32, DivScale0);
5801 auto Scale1Unmerge =
B.buildUnmerge(
S32, DivScale1);
5804 Scale1Unmerge.getReg(1));
5806 Scale0Unmerge.getReg(1));
5807 Scale =
B.buildXor(
S1, CmpNum, CmpDen).getReg(0);
5809 Scale = DivScale1.getReg(1);
5812 auto Fmas =
B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {
S64})
5813 .addUse(Fma4.getReg(0))
5814 .addUse(Fma3.getReg(0))
5815 .addUse(
Mul.getReg(0))
5819 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup,
ArrayRef(Res))
5820 .addUse(Fmas.getReg(0))
5825 MI.eraseFromParent();
5840 auto Mant =
B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5843 auto Exp =
B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5847 if (ST.hasFractBug()) {
5848 auto Fabs =
B.buildFAbs(Ty, Val);
5852 auto Zero =
B.buildConstant(InstrExpTy, 0);
5853 Exp =
B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5854 Mant =
B.buildSelect(Ty, IsFinite, Mant, Val);
5857 B.buildCopy(Res0, Mant);
5858 B.buildSExtOrTrunc(Res1, Exp);
5860 MI.eraseFromParent();
5875 auto Abs =
B.buildFAbs(
S32, RHS, Flags);
5878 auto C0 =
B.buildFConstant(
S32, 0x1p+96f);
5879 auto C1 =
B.buildFConstant(
S32, 0x1p-32f);
5880 auto C2 =
B.buildFConstant(
S32, 1.0f);
5883 auto Sel =
B.buildSelect(
S32, CmpRes, C1, C2, Flags);
5885 auto Mul0 =
B.buildFMul(
S32, RHS, Sel, Flags);
5887 auto RCP =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {
S32})
5888 .addUse(Mul0.getReg(0))
5891 auto Mul1 =
B.buildFMul(
S32, LHS, RCP, Flags);
5893 B.buildFMul(Res, Sel, Mul1, Flags);
5895 MI.eraseFromParent();
5904 unsigned Flags =
MI.getFlags();
5905 assert(!ST.has16BitInsts());
5907 auto Ext =
B.buildFPExt(
F32,
MI.getOperand(1), Flags);
5908 auto Log2 =
B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {
F32})
5909 .addUse(Ext.getReg(0))
5911 B.buildFPTrunc(
MI.getOperand(0),
Log2, Flags);
5912 MI.eraseFromParent();
5922 const unsigned Flags =
MI.getFlags();
5931 MI.eraseFromParent();
5935 auto ScaleThreshold =
B.buildFConstant(
F32, 0x1.0p-96f);
5937 auto ScaleUpFactor =
B.buildFConstant(
F32, 0x1.0p+32f);
5938 auto ScaledX =
B.buildFMul(
F32,
X, ScaleUpFactor, Flags);
5939 auto SqrtX =
B.buildSelect(
F32, NeedScale, ScaledX,
X, Flags);
5944 .addUse(SqrtX.getReg(0))
5947 auto NegOne =
B.buildConstant(I32, -1);
5948 auto SqrtSNextDown =
B.buildAdd(I32, SqrtS, NegOne);
5950 auto NegSqrtSNextDown =
B.buildFNeg(
F32, SqrtSNextDown, Flags);
5951 auto SqrtVP =
B.buildFMA(
F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5953 auto PosOne =
B.buildConstant(I32, 1);
5954 auto SqrtSNextUp =
B.buildAdd(I32, SqrtS, PosOne);
5956 auto NegSqrtSNextUp =
B.buildFNeg(
F32, SqrtSNextUp, Flags);
5957 auto SqrtVS =
B.buildFMA(
F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5959 auto Zero =
B.buildFConstant(
F32, 0.0f);
5963 B.buildSelect(
F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5967 B.buildSelect(
F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5970 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {
F32}).addReg(SqrtX.getReg(0));
5971 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5973 auto Half =
B.buildFConstant(
F32, 0.5f);
5974 auto SqrtH =
B.buildFMul(
F32, SqrtR, Half, Flags);
5975 auto NegSqrtH =
B.buildFNeg(
F32, SqrtH, Flags);
5976 auto SqrtE =
B.buildFMA(
F32, NegSqrtH, SqrtS, Half, Flags);
5977 SqrtH =
B.buildFMA(
F32, SqrtH, SqrtE, SqrtH, Flags);
5978 SqrtS =
B.buildFMA(
F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5979 auto NegSqrtS =
B.buildFNeg(
F32, SqrtS, Flags);
5980 auto SqrtD =
B.buildFMA(
F32, NegSqrtS, SqrtS, SqrtX, Flags);
5981 SqrtS =
B.buildFMA(
F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5984 auto ScaleDownFactor =
B.buildFConstant(
F32, 0x1.0p-16f);
5986 auto ScaledDown =
B.buildFMul(
F32, SqrtS, ScaleDownFactor, Flags);
5988 SqrtS =
B.buildSelect(
F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5991 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5993 MI.eraseFromParent();
6028 unsigned Flags =
MI.getFlags();
6033 auto ScaleConstant =
B.buildFConstant(
F64, 0x1.0p-767);
6035 ZeroInt =
B.buildConstant(
S32, 0).getReg(0);
6039 auto ScaleUpFactor =
B.buildConstant(
S32, 256);
6040 auto ScaleUp =
B.buildSelect(
S32, Scaling, ScaleUpFactor, ZeroInt);
6041 SqrtX =
B.buildFLdexp(
F64,
X, ScaleUp, Flags).getReg(0);
6044 auto SqrtY =
B.buildIntrinsic(Intrinsic::amdgcn_rsq, {
F64}).addReg(SqrtX);
6046 auto Half =
B.buildFConstant(
F64, 0.5);
6047 auto SqrtH0 =
B.buildFMul(
F64, SqrtY, Half);
6048 auto SqrtS0 =
B.buildFMul(
F64, SqrtX, SqrtY);
6050 auto NegSqrtH0 =
B.buildFNeg(
F64, SqrtH0);
6051 auto SqrtR0 =
B.buildFMA(
F64, NegSqrtH0, SqrtS0, Half);
6053 auto SqrtS1 =
B.buildFMA(
F64, SqrtS0, SqrtR0, SqrtS0);
6054 auto SqrtH1 =
B.buildFMA(
F64, SqrtH0, SqrtR0, SqrtH0);
6056 auto NegSqrtS1 =
B.buildFNeg(
F64, SqrtS1);
6057 auto SqrtD0 =
B.buildFMA(
F64, NegSqrtS1, SqrtS1, SqrtX);
6059 auto SqrtS2 =
B.buildFMA(
F64, SqrtD0, SqrtH1, SqrtS1);
6061 Register SqrtRet = SqrtS2.getReg(0);
6063 auto NegSqrtS2 =
B.buildFNeg(
F64, SqrtS2);
6064 auto SqrtD1 =
B.buildFMA(
F64, NegSqrtS2, SqrtS2, SqrtX);
6065 auto SqrtD2 =
B.buildFMA(
F64, SqrtD1, SqrtH1, SqrtS2);
6068 auto ScaleDownFactor =
B.buildConstant(
S32, -128);
6069 auto ScaleDown =
B.buildSelect(
S32, Scaling, ScaleDownFactor, ZeroInt);
6070 SqrtRet =
B.buildFLdexp(
F64, SqrtD2, ScaleDown, Flags).getReg(0);
6075 auto ZeroFP =
B.buildFConstant(
F64, 0.0);
6084 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
6086 MI.eraseFromParent();
6117 auto Flags =
MI.getFlags();
6129 auto Rsq =
B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
6139 auto ClampMax = UseIEEE ?
B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
6140 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
6145 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
6147 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
6148 MI.eraseFromParent();
6160 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6161 IID == Intrinsic::amdgcn_permlanex16;
6162 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6163 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6164 bool IsPermlaneShuffle = IID == Intrinsic::amdgcn_permlane_bcast ||
6165 IID == Intrinsic::amdgcn_permlane_up ||
6166 IID == Intrinsic::amdgcn_permlane_down ||
6167 IID == Intrinsic::amdgcn_permlane_xor;
6171 auto LaneOp =
B.buildIntrinsic(IID, {VT}).addUse(Src0);
6173 case Intrinsic::amdgcn_readfirstlane:
6174 case Intrinsic::amdgcn_permlane64:
6175 return LaneOp.getReg(0);
6176 case Intrinsic::amdgcn_readlane:
6177 case Intrinsic::amdgcn_set_inactive:
6178 case Intrinsic::amdgcn_set_inactive_chain_arg:
6179 return LaneOp.addUse(Src1).getReg(0);
6180 case Intrinsic::amdgcn_writelane:
6181 case Intrinsic::amdgcn_permlane_bcast:
6182 case Intrinsic::amdgcn_permlane_up:
6183 case Intrinsic::amdgcn_permlane_down:
6184 case Intrinsic::amdgcn_permlane_xor:
6185 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
6186 case Intrinsic::amdgcn_permlane16:
6187 case Intrinsic::amdgcn_permlanex16: {
6189 int64_t Src4 =
MI.getOperand(6).getImm();
6190 int64_t Src5 =
MI.getOperand(7).getImm();
6191 return LaneOp.addUse(Src1)
6198 case Intrinsic::amdgcn_mov_dpp8:
6199 return LaneOp.addImm(
MI.getOperand(3).getImm()).getReg(0);
6200 case Intrinsic::amdgcn_update_dpp:
6201 return LaneOp.addUse(Src1)
6202 .addImm(
MI.getOperand(4).getImm())
6203 .addImm(
MI.getOperand(5).getImm())
6204 .addImm(
MI.getOperand(6).getImm())
6205 .addImm(
MI.getOperand(7).getImm())
6215 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6216 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16 ||
6217 IsPermlaneShuffle) {
6218 Src1 =
MI.getOperand(3).getReg();
6219 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16 ||
6220 IsPermlaneShuffle) {
6221 Src2 =
MI.getOperand(4).getReg();
6226 unsigned Size = Ty.getSizeInBits();
6228 unsigned SplitSize = 32;
6229 if (IID == Intrinsic::amdgcn_update_dpp && (
Size % 64 == 0) &&
6230 ST.hasDPALU_DPP() &&
6234 if (
Size == SplitSize) {
6240 Src0 =
B.buildAnyExt(
S32, Src0).getReg(0);
6242 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6245 if (IID == Intrinsic::amdgcn_writelane)
6248 Register LaneOpDst = createLaneOp(Src0, Src1, Src2,
S32);
6249 B.buildTrunc(DstReg, LaneOpDst);
6250 MI.eraseFromParent();
6254 if (
Size % SplitSize != 0)
6258 bool NeedsBitcast =
false;
6259 if (Ty.isVector()) {
6262 if (EltSize == SplitSize) {
6263 PartialResTy = EltTy;
6264 }
else if (EltSize == 16 || EltSize == 32) {
6265 unsigned NElem = SplitSize / EltSize;
6269 NeedsBitcast =
true;
6274 unsigned NumParts =
Size / SplitSize;
6278 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6279 Src1Parts =
B.buildUnmerge(PartialResTy, Src1);
6281 if (IID == Intrinsic::amdgcn_writelane)
6282 Src2Parts =
B.buildUnmerge(PartialResTy, Src2);
6284 for (
unsigned i = 0; i < NumParts; ++i) {
6285 Src0 = Src0Parts.
getReg(i);
6287 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6288 Src1 = Src1Parts.
getReg(i);
6290 if (IID == Intrinsic::amdgcn_writelane)
6291 Src2 = Src2Parts.
getReg(i);
6293 PartialRes.
push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
6297 B.buildBitcast(DstReg,
B.buildMergeLikeInstr(
6300 B.buildMergeLikeInstr(DstReg, PartialRes);
6302 MI.eraseFromParent();
6310 ST.getTargetLowering()->getImplicitParameterOffset(
6320 B.buildObjectPtrOffset(DstReg, KernargPtrReg,
6321 B.buildConstant(IdxTy,
Offset).getReg(0));
6332 Register Pointer =
MI.getOperand(2).getReg();
6334 Register NumRecords =
MI.getOperand(4).getReg();
6340 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6342 auto ExtStride =
B.buildAnyExt(
S32, Stride);
6344 if (ST.has45BitNumRecordsBufferResource()) {
6349 auto PointerInt =
B.buildPtrToInt(PtrIntTy, Pointer);
6350 auto ExtPointer =
B.buildAnyExtOrTrunc(
S64, PointerInt);
6351 auto NumRecordsLHS =
B.buildShl(
S64, NumRecords,
B.buildConstant(
S32, 57));
6352 Register LowHalf =
B.buildOr(
S64, ExtPointer, NumRecordsLHS).getReg(0);
6356 auto NumRecordsRHS =
B.buildLShr(
S64, NumRecords,
B.buildConstant(
S32, 7));
6357 auto ShiftedStride =
B.buildShl(
S32, ExtStride,
B.buildConstant(
S32, 12));
6358 auto ExtShiftedStride =
6359 B.buildMergeValues(
S64, {Zero, ShiftedStride.getReg(0)});
6360 auto ShiftedFlags =
B.buildShl(
S32, Flags,
B.buildConstant(
S32, 28));
6361 auto ExtShiftedFlags =
6362 B.buildMergeValues(
S64, {Zero, ShiftedFlags.getReg(0)});
6363 auto CombinedFields =
B.buildOr(
S64, NumRecordsRHS, ExtShiftedStride);
6365 B.buildOr(
S64, CombinedFields, ExtShiftedFlags).getReg(0);
6366 B.buildMergeValues(Result, {LowHalf, HighHalf});
6368 NumRecords =
B.buildTrunc(
S32, NumRecords).getReg(0);
6369 auto Unmerge =
B.buildUnmerge(
S32, Pointer);
6370 auto LowHalf = Unmerge.getReg(0);
6371 auto HighHalf = Unmerge.getReg(1);
6373 auto AndMask =
B.buildConstant(
S32, 0x0000ffff);
6374 auto Masked =
B.buildAnd(
S32, HighHalf, AndMask);
6375 auto ShiftConst =
B.buildConstant(
S32, 16);
6376 auto ShiftedStride =
B.buildShl(
S32, ExtStride, ShiftConst);
6377 auto NewHighHalf =
B.buildOr(
S32,
Masked, ShiftedStride);
6378 Register NewHighHalfReg = NewHighHalf.getReg(0);
6379 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
6382 MI.eraseFromParent();
6399 MI.eraseFromParent();
6407 std::optional<uint32_t> KnownSize =
6409 if (KnownSize.has_value())
6410 B.buildConstant(DstReg, *KnownSize);
6428 MI.eraseFromParent();
6435 unsigned AddrSpace)
const {
6437 auto Unmerge =
B.buildUnmerge(
S32,
MI.getOperand(2).getReg());
6441 ST.hasGloballyAddressableScratch()) {
6443 B.buildInstr(AMDGPU::S_MOV_B32, {
S32},
6444 {
Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
6446 MRI.
setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
6448 Register XOR =
B.buildXor(
S32, Hi32, FlatScratchBaseHi).getReg(0);
6450 B.buildConstant(
S32, 1u << 26));
6455 MI.eraseFromParent();
6465std::pair<Register, unsigned>
6477 bool CheckNUW = ST.hasGFX1250Insts();
6479 MRI, OrigOffset,
nullptr, CheckNUW);
6483 BaseReg =
B.buildPtrToInt(MRI.
getType(OrigOffset), BaseReg).getReg(0);
6493 unsigned Overflow = ImmOffset & ~MaxImm;
6494 ImmOffset -= Overflow;
6495 if ((int32_t)Overflow < 0) {
6496 Overflow += ImmOffset;
6500 if (Overflow != 0) {
6502 BaseReg =
B.buildConstant(
S32, Overflow).getReg(0);
6504 auto OverflowVal =
B.buildConstant(
S32, Overflow);
6505 BaseReg =
B.buildAdd(
S32, BaseReg, OverflowVal).getReg(0);
6510 BaseReg =
B.buildConstant(
S32, 0).getReg(0);
6512 return std::pair(BaseReg, ImmOffset);
6519 bool ImageStore)
const {
6525 if (ST.hasUnpackedD16VMem()) {
6526 auto Unmerge =
B.buildUnmerge(
S16, Reg);
6529 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
6530 WideRegs.
push_back(
B.buildAnyExt(
S32, Unmerge.getReg(
I)).getReg(0));
6538 if (ImageStore && ST.hasImageStoreD16Bug()) {
6541 Reg =
B.buildBitcast(
S32, Reg).getReg(0);
6543 PackedRegs.
resize(2,
B.buildUndef(
S32).getReg(0));
6550 auto Unmerge =
B.buildUnmerge(
S16, Reg);
6551 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
6553 PackedRegs.
resize(6,
B.buildUndef(
S16).getReg(0));
6561 auto Unmerge =
B.buildUnmerge(
S32, Reg);
6562 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
6564 PackedRegs.
resize(4,
B.buildUndef(
S32).getReg(0));
6581 bool IsFormat)
const {
6593 VData =
B.buildBitcast(Ty, VData).getReg(0);
6601 if (Ty.isVector()) {
6602 if (Ty.getElementType() ==
S16 && Ty.getNumElements() <= 4) {
6614 bool IsFormat)
const {
6621 const bool IsD16 = IsFormat && (EltTy.
getSizeInBits() == 16);
6636 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6639 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps;
6643 VIndex =
MI.getOperand(3).getReg();
6646 VIndex =
B.buildConstant(
S32, 0).getReg(0);
6649 Register VOffset =
MI.getOperand(3 + OpOffset).getReg();
6650 Register SOffset =
MI.getOperand(4 + OpOffset).getReg();
6654 Format =
MI.getOperand(5 + OpOffset).getImm();
6658 unsigned AuxiliaryData =
MI.getOperand(5 + OpOffset).getImm();
6664 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
6665 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
6666 }
else if (IsFormat) {
6667 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
6668 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
6672 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
6675 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
6678 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
6683 auto MIB =
B.buildInstr(
Opc)
6694 MIB.addImm(AuxiliaryData)
6695 .addImm(HasVIndex ? -1 : 0)
6696 .addMemOperand(MMO);
6698 MI.eraseFromParent();
6704 unsigned ImmOffset,
unsigned Format,
6707 auto MIB =
B.buildInstr(
Opc)
6718 MIB.addImm(AuxiliaryData)
6719 .addImm(HasVIndex ? -1 : 0)
6720 .addMemOperand(MMO);
6726 bool IsTyped)
const {
6740 assert(
MI.getNumExplicitDefs() == 1 ||
MI.getNumExplicitDefs() == 2);
6741 bool IsTFE =
MI.getNumExplicitDefs() == 2;
6743 StatusDst =
MI.getOperand(1).getReg();
6748 Register RSrc =
MI.getOperand(2 + OpOffset).getReg();
6751 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6754 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps + OpOffset;
6757 VIndex =
MI.getOperand(3 + OpOffset).getReg();
6760 VIndex =
B.buildConstant(
S32, 0).getReg(0);
6763 Register VOffset =
MI.getOperand(3 + OpOffset).getReg();
6764 Register SOffset =
MI.getOperand(4 + OpOffset).getReg();
6768 Format =
MI.getOperand(5 + OpOffset).getImm();
6772 unsigned AuxiliaryData =
MI.getOperand(5 + OpOffset).getImm();
6782 Dst =
MI.getOperand(0).getReg();
6783 B.setInsertPt(
B.getMBB(),
MI);
6790 Dst =
MI.getOperand(0).getReg();
6791 B.setInsertPt(
B.getMBB(),
MI);
6795 const bool IsD16 = IsFormat && (EltTy.
getSizeInBits() == 16);
6796 const bool Unpacked = ST.hasUnpackedD16VMem();
6806 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6807 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6808 }
else if (IsFormat) {
6812 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6814 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6815 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6820 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6821 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6824 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6825 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6828 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6829 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6835 unsigned NumValueDWords =
divideCeil(Ty.getSizeInBits(), 32);
6836 unsigned NumLoadDWords = NumValueDWords + 1;
6838 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(LoadTy);
6840 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6842 Register ExtDst =
B.getMRI()->createGenericVirtualRegister(
S32);
6843 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6844 B.buildTrunc(Dst, ExtDst);
6845 }
else if (NumValueDWords == 1) {
6846 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6849 for (
unsigned I = 0;
I != NumValueDWords; ++
I)
6850 LoadElts.
push_back(
B.getMRI()->createGenericVirtualRegister(
S32));
6852 B.buildUnmerge(LoadElts, LoadDstReg);
6854 B.buildMergeLikeInstr(Dst, LoadElts);
6857 (IsD16 && !Ty.isVector())) {
6858 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(
S32);
6860 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6861 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6862 B.buildTrunc(Dst, LoadDstReg);
6863 }
else if (Unpacked && IsD16 && Ty.isVector()) {
6865 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6867 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6868 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6870 auto Unmerge =
B.buildUnmerge(
S32, LoadDstReg);
6872 for (
unsigned I = 0,
N = Unmerge->getNumOperands() - 1;
I !=
N; ++
I)
6873 Repack.
push_back(
B.buildTrunc(EltTy, Unmerge.getReg(
I)).getReg(0));
6874 B.buildMergeLikeInstr(Dst, Repack);
6877 AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6880 MI.eraseFromParent();
6886 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6887 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6888 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6889 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6890 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6891 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6892 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6893 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6894 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6895 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6896 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6897 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6898 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6899 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6900 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6901 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6902 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6903 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6904 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6905 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6906 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6907 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6908 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6909 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6910 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6911 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6912 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6913 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6914 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6915 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6916 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6917 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6918 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6919 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6920 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6921 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6922 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6923 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6924 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6925 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6926 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6927 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6928 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6929 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6930 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6931 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6932 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6933 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6934 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6935 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6936 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6937 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6938 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6939 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6940 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6941 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6942 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6943 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6944 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6945 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6946 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6947 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6948 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6949 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6950 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6951 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6952 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6953 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6954 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6955 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6956 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6957 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6958 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6959 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6960 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6961 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6962 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6963 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6964 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6965 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6966 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
6967 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
6968 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
6969 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
6970 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
6971 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6972 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
6973 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6974 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
6975 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6984 const bool IsCmpSwap =
6985 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6986 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6987 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6988 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6999 CmpVal =
MI.getOperand(3).getReg();
7004 Register RSrc =
MI.getOperand(3 + OpOffset).getReg();
7005 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
7008 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps;
7011 VIndex =
MI.getOperand(4 + OpOffset).getReg();
7014 VIndex =
B.buildConstant(
LLT::scalar(32), 0).getReg(0);
7017 Register VOffset =
MI.getOperand(4 + OpOffset).getReg();
7018 Register SOffset =
MI.getOperand(5 + OpOffset).getReg();
7019 unsigned AuxiliaryData =
MI.getOperand(6 + OpOffset).getImm();
7038 .addImm(AuxiliaryData)
7039 .addImm(HasVIndex ? -1 : 0)
7040 .addMemOperand(MMO);
7042 MI.eraseFromParent();
7052 bool IsA16,
bool IsG16) {
7068 (
B.getMRI()->getType(AddrReg) ==
S16)) {
7073 B.buildBuildVector(
V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7077 "Bias needs to be converted to 16 bit in A16 mode");
7079 AddrReg =
B.buildBitcast(
V2S16, AddrReg).getReg(0);
7085 if (((
I + 1) >= EndIdx) ||
7092 !
MI.getOperand(ArgOffset +
I + 1).isReg()) {
7094 B.buildBuildVector(
V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7099 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
7110 int DimIdx,
int NumVAddrs) {
7114 for (
int I = 0;
I != NumVAddrs; ++
I) {
7116 if (
SrcOp.isReg()) {
7122 int NumAddrRegs = AddrRegs.
size();
7123 if (NumAddrRegs != 1) {
7126 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
7129 for (
int I = 1;
I != NumVAddrs; ++
I) {
7132 MI.getOperand(DimIdx +
I).setReg(AMDGPU::NoRegister);
7154 const unsigned NumDefs =
MI.getNumExplicitDefs();
7155 const unsigned ArgOffset = NumDefs + 1;
7156 bool IsTFE = NumDefs == 2;
7174 VData =
MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
7178 const bool IsAtomicPacked16Bit =
7179 (BaseOpcode->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7180 BaseOpcode->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7188 ST.hasG16() ? (BaseOpcode->
Gradients && GradTy ==
S16) : GradTy ==
S16;
7189 const bool IsA16 = AddrTy ==
S16;
7190 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() ==
S16;
7193 if (!BaseOpcode->
Atomic) {
7194 DMask =
MI.getOperand(ArgOffset + Intr->
DMaskIndex).getImm();
7197 }
else if (DMask != 0) {
7199 }
else if (!IsTFE && !BaseOpcode->
Store) {
7201 B.buildUndef(
MI.getOperand(0));
7202 MI.eraseFromParent();
7210 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
7211 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
7212 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
7213 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
7214 unsigned NewOpcode = LoadOpcode;
7215 if (BaseOpcode->
Store)
7216 NewOpcode = StoreOpcode;
7218 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
7221 MI.setDesc(
B.getTII().get(NewOpcode));
7225 if (IsTFE && DMask == 0) {
7228 MI.getOperand(ArgOffset + Intr->
DMaskIndex).setImm(DMask);
7231 if (BaseOpcode->
Atomic) {
7236 if (Ty.isVector() && !IsAtomicPacked16Bit)
7243 auto Concat =
B.buildBuildVector(PackedTy, {VData0, VData1});
7244 MI.getOperand(2).setReg(
Concat.getReg(0));
7245 MI.getOperand(3).setReg(AMDGPU::NoRegister);
7249 unsigned CorrectedNumVAddrs = Intr->
NumVAddrs;
7252 if (BaseOpcode->
Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
7258 if (IsA16 && !ST.hasA16()) {
7263 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->
Sampler);
7264 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
7266 if (IsA16 || IsG16) {
7274 const bool UseNSA = ST.hasNSAEncoding() &&
7275 PackedRegs.
size() >= ST.getNSAThreshold(MF) &&
7276 (PackedRegs.
size() <= NSAMaxSize || HasPartialNSA);
7277 const bool UsePartialNSA =
7278 UseNSA && HasPartialNSA && PackedRegs.
size() > NSAMaxSize;
7280 if (UsePartialNSA) {
7284 auto Concat =
B.buildConcatVectors(
7285 PackedAddrTy,
ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
7286 PackedRegs[NSAMaxSize - 1] =
Concat.getReg(0);
7287 PackedRegs.
resize(NSAMaxSize);
7288 }
else if (!UseNSA && PackedRegs.
size() > 1) {
7290 auto Concat =
B.buildConcatVectors(PackedAddrTy, PackedRegs);
7291 PackedRegs[0] =
Concat.getReg(0);
7295 const unsigned NumPacked = PackedRegs.
size();
7298 if (!
SrcOp.isReg()) {
7308 SrcOp.setReg(AMDGPU::NoRegister);
7325 const bool UseNSA = ST.hasNSAEncoding() &&
7326 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
7327 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
7328 const bool UsePartialNSA =
7329 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
7331 if (UsePartialNSA) {
7333 ArgOffset + Intr->
VAddrStart + NSAMaxSize - 1,
7335 }
else if (!UseNSA && Intr->
NumVAddrs > 1) {
7350 if (!Ty.isVector() || !IsD16)
7354 if (RepackedReg != VData) {
7355 MI.getOperand(1).setReg(RepackedReg);
7363 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
7366 if (NumElts < DMaskLanes)
7369 if (NumElts > 4 || DMaskLanes > 4)
7379 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
7380 const LLT AdjustedTy =
7396 if (IsD16 && ST.hasUnpackedD16VMem()) {
7403 unsigned RoundedElts = (AdjustedTy.
getSizeInBits() + 31) / 32;
7404 unsigned RoundedSize = 32 * RoundedElts;
7408 RegTy = !IsTFE && EltSize == 16 ?
V2S16 :
S32;
7413 if (!IsTFE && (RoundedTy == Ty || !Ty.
isVector()))
7419 B.setInsertPt(*
MI.getParent(), ++
MI.getIterator());
7423 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
7424 const int ResultNumRegs = LoadResultTy.
getSizeInBits() / 32;
7428 MI.getOperand(0).setReg(NewResultReg);
7436 Dst1Reg =
MI.getOperand(1).getReg();
7441 MI.removeOperand(1);
7445 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
7454 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
7456 if (ResultNumRegs == 1) {
7458 ResultRegs[0] = NewResultReg;
7461 for (
int I = 0;
I != NumDataRegs; ++
I)
7463 B.buildUnmerge(ResultRegs, NewResultReg);
7468 ResultRegs.
resize(NumDataRegs);
7473 if (IsD16 && !Ty.isVector()) {
7474 B.buildTrunc(DstReg, ResultRegs[0]);
7479 if (Ty ==
V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
7480 B.buildBitcast(DstReg, ResultRegs[0]);
7492 if (RegTy !=
V2S16 && !ST.hasUnpackedD16VMem()) {
7494 Reg =
B.buildBitcast(
V2S16, Reg).getReg(0);
7495 }
else if (ST.hasUnpackedD16VMem()) {
7497 Reg =
B.buildTrunc(
S16, Reg).getReg(0);
7501 auto padWithUndef = [&](
LLT Ty,
int NumElts) {
7505 for (
int I = 0;
I != NumElts; ++
I)
7512 padWithUndef(ResTy, NumElts - ResultRegs.
size());
7513 B.buildBuildVector(DstReg, ResultRegs);
7517 assert(!ST.hasUnpackedD16VMem() && ResTy ==
V2S16);
7518 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
7524 if (ResultRegs.
size() == 1) {
7525 NewResultReg = ResultRegs[0];
7526 }
else if (ResultRegs.
size() == 2) {
7528 NewResultReg =
B.buildConcatVectors(
V4S16, ResultRegs).getReg(0);
7536 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
7538 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
7543 padWithUndef(ResTy, RegsToCover - ResultRegs.
size());
7544 B.buildConcatVectors(DstReg, ResultRegs);
7553 Register OrigDst =
MI.getOperand(0).getReg();
7555 LLT Ty =
B.getMRI()->getType(OrigDst);
7556 unsigned Size = Ty.getSizeInBits();
7559 if (
Size < 32 && ST.hasScalarSubwordLoads()) {
7561 Opc =
Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
7562 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
7565 Dst =
B.getMRI()->createGenericVirtualRegister(
LLT::scalar(32));
7567 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
7576 B.setInsertPt(
B.getMBB(),
MI);
7581 B.setInsertPt(
B.getMBB(),
MI);
7587 MI.setDesc(
B.getTII().get(
Opc));
7588 MI.removeOperand(1);
7591 const unsigned MemSize = (
Size + 7) / 8;
7592 const Align MemAlign =
B.getDataLayout().getABITypeAlign(
7599 MI.addMemOperand(MF, MMO);
7600 if (Dst != OrigDst) {
7601 MI.getOperand(0).setReg(Dst);
7602 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
7603 B.buildTrunc(OrigDst, Dst);
7625 MI.setDesc(
B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7626 MI.removeOperand(0);
7636 if (!ST.hasTrapHandler() ||
7640 return ST.supportsGetDoorbellID() ?
7653 MI.eraseFromParent();
7663 BuildMI(*TrapBB, TrapBB->
end(),
DL,
B.getTII().get(AMDGPU::S_ENDPGM))
7665 BuildMI(BB, &
MI,
DL,
B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
7669 MI.eraseFromParent();
7678 Register SGPR01(AMDGPU::SGPR0_SGPR1);
7685 ST.getTargetLowering()->getImplicitParameterOffset(
B.getMF(), Param);
7705 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
7708 Register Temp =
B.buildLoad(
S64, LoadAddr, *MMO).getReg(0);
7709 B.buildCopy(SGPR01, Temp);
7710 B.buildInstr(AMDGPU::S_TRAP)
7713 MI.eraseFromParent();
7724 B.buildCopy(SGPR01, LiveIn);
7725 B.buildInstr(AMDGPU::S_TRAP)
7729 MI.eraseFromParent();
7738 if (ST.hasPrivEnabledTrap2NopBug()) {
7739 ST.getInstrInfo()->insertSimulatedTrap(MRI,
B.getMBB(),
MI,
7741 MI.eraseFromParent();
7745 B.buildInstr(AMDGPU::S_TRAP)
7747 MI.eraseFromParent();
7756 if (!ST.hasTrapHandler() ||
7760 Fn,
"debugtrap handler not supported",
MI.getDebugLoc(),
DS_Warning));
7763 B.buildInstr(AMDGPU::S_TRAP)
7767 MI.eraseFromParent();
7780 Register NodePtr =
MI.getOperand(2).getReg();
7781 Register RayExtent =
MI.getOperand(3).getReg();
7782 Register RayOrigin =
MI.getOperand(4).getReg();
7784 Register RayInvDir =
MI.getOperand(6).getReg();
7787 if (!ST.hasGFX10_AEncoding()) {
7790 Fn,
"intrinsic not supported on subtarget",
MI.getDebugLoc()));
7799 const unsigned NumVDataDwords = 4;
7800 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7801 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7803 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7805 const unsigned BaseOpcodes[2][2] = {
7806 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7807 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7808 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7812 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7813 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7814 : AMDGPU::MIMGEncGfx10NSA,
7815 NumVDataDwords, NumVAddrDwords);
7819 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7820 : AMDGPU::MIMGEncGfx10Default,
7821 NumVDataDwords, NumVAddrDwords);
7826 if (UseNSA && IsGFX11Plus) {
7828 auto Unmerge =
B.buildUnmerge({
S32,
S32,
S32}, Src);
7829 auto Merged =
B.buildMergeLikeInstr(
7830 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7831 Ops.push_back(Merged.getReg(0));
7834 Ops.push_back(NodePtr);
7835 Ops.push_back(RayExtent);
7836 packLanes(RayOrigin);
7839 auto UnmergeRayDir =
B.buildUnmerge({
S16,
S16,
S16}, RayDir);
7840 auto UnmergeRayInvDir =
B.buildUnmerge({
S16,
S16,
S16}, RayInvDir);
7841 auto MergedDir =
B.buildMergeLikeInstr(
7844 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(0),
7845 UnmergeRayDir.getReg(0)}))
7848 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(1),
7849 UnmergeRayDir.getReg(1)}))
7852 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(2),
7853 UnmergeRayDir.getReg(2)}))
7855 Ops.push_back(MergedDir.getReg(0));
7858 packLanes(RayInvDir);
7862 auto Unmerge =
B.buildUnmerge({
S32,
S32}, NodePtr);
7863 Ops.push_back(Unmerge.getReg(0));
7864 Ops.push_back(Unmerge.getReg(1));
7866 Ops.push_back(NodePtr);
7868 Ops.push_back(RayExtent);
7871 auto Unmerge =
B.buildUnmerge({
S32,
S32,
S32}, Src);
7872 Ops.push_back(Unmerge.getReg(0));
7873 Ops.push_back(Unmerge.getReg(1));
7874 Ops.push_back(Unmerge.getReg(2));
7877 packLanes(RayOrigin);
7879 auto UnmergeRayDir =
B.buildUnmerge({
S16,
S16,
S16}, RayDir);
7880 auto UnmergeRayInvDir =
B.buildUnmerge({
S16,
S16,
S16}, RayInvDir);
7884 B.buildMergeLikeInstr(R1,
7885 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7886 B.buildMergeLikeInstr(
7887 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7888 B.buildMergeLikeInstr(
7889 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7895 packLanes(RayInvDir);
7902 Register MergedOps =
B.buildMergeLikeInstr(OpTy,
Ops).getReg(0);
7904 Ops.push_back(MergedOps);
7907 auto MIB =
B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7916 .addImm(IsA16 ? 1 : 0)
7919 MI.eraseFromParent();
7929 Register DstOrigin =
MI.getOperand(1).getReg();
7931 Register NodePtr =
MI.getOperand(4).getReg();
7932 Register RayExtent =
MI.getOperand(5).getReg();
7933 Register InstanceMask =
MI.getOperand(6).getReg();
7934 Register RayOrigin =
MI.getOperand(7).getReg();
7936 Register Offsets =
MI.getOperand(9).getReg();
7937 Register TDescr =
MI.getOperand(10).getReg();
7939 if (!ST.hasBVHDualAndBVH8Insts()) {
7942 Fn,
"intrinsic not supported on subtarget",
MI.getDebugLoc()));
7947 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7948 const unsigned NumVDataDwords = 10;
7949 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7951 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7952 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7953 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
7956 auto RayExtentInstanceMaskVec =
B.buildMergeLikeInstr(
7957 V2S32, {RayExtent,
B.buildAnyExt(
S32, InstanceMask)});
7959 B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7960 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7966 .addUse(RayExtentInstanceMaskVec.getReg(0))
7973 MI.eraseFromParent();
7982 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7983 MI.eraseFromParent();
7990 if (!ST.hasArchitectedSGPRs())
7994 auto TTMP8 =
B.buildCopy(
S32,
Register(AMDGPU::TTMP8));
7995 auto LSB =
B.buildConstant(
S32, 25);
7996 auto Width =
B.buildConstant(
S32, 5);
7997 B.buildUbfx(DstReg, TTMP8, LSB, Width);
7998 MI.eraseFromParent();
8006 unsigned Width)
const {
8010 MRI.
setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
8011 B.buildInstr(AMDGPU::S_GETREG_B32_const)
8014 MI.eraseFromParent();
8032 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {
S32},
8036 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {
S32},
8039 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
8040 MI.eraseFromParent();
8051 auto Unmerge =
B.buildUnmerge({
S32,
S32},
MI.getOperand(0));
8055 .addReg(Unmerge.getReg(0));
8059 .addReg(Unmerge.getReg(1));
8060 MI.eraseFromParent();
8072 case Intrinsic::sponentry:
8078 B.buildInstr(AMDGPU::G_AMDGPU_SPONENTRY).addDef(TmpReg);
8081 B.buildIntToPtr(DstReg, TmpReg);
8082 MI.eraseFromParent();
8084 int FI =
B.getMF().getFrameInfo().CreateFixedObject(
8086 B.buildFrameIndex(
MI.getOperand(0), FI);
8087 MI.eraseFromParent();
8090 case Intrinsic::amdgcn_if:
8091 case Intrinsic::amdgcn_else: {
8094 bool Negated =
false;
8106 std::swap(CondBrTarget, UncondBrTarget);
8108 B.setInsertPt(
B.getMBB(), BrCond->getIterator());
8109 if (IntrID == Intrinsic::amdgcn_if) {
8110 B.buildInstr(AMDGPU::SI_IF)
8113 .addMBB(UncondBrTarget);
8115 B.buildInstr(AMDGPU::SI_ELSE)
8118 .addMBB(UncondBrTarget);
8127 B.buildBr(*CondBrTarget);
8132 MI.eraseFromParent();
8133 BrCond->eraseFromParent();
8139 case Intrinsic::amdgcn_loop: {
8142 bool Negated =
false;
8152 std::swap(CondBrTarget, UncondBrTarget);
8154 B.setInsertPt(
B.getMBB(), BrCond->getIterator());
8155 B.buildInstr(AMDGPU::SI_LOOP)
8157 .addMBB(UncondBrTarget);
8162 B.buildBr(*CondBrTarget);
8164 MI.eraseFromParent();
8165 BrCond->eraseFromParent();
8172 case Intrinsic::amdgcn_addrspacecast_nonnull:
8174 case Intrinsic::amdgcn_make_buffer_rsrc:
8176 case Intrinsic::amdgcn_kernarg_segment_ptr:
8179 B.buildConstant(
MI.getOperand(0).getReg(), 0);
8180 MI.eraseFromParent();
8186 case Intrinsic::amdgcn_implicitarg_ptr:
8188 case Intrinsic::amdgcn_workitem_id_x:
8191 case Intrinsic::amdgcn_workitem_id_y:
8194 case Intrinsic::amdgcn_workitem_id_z:
8197 case Intrinsic::amdgcn_workgroup_id_x:
8202 case Intrinsic::amdgcn_workgroup_id_y:
8207 case Intrinsic::amdgcn_workgroup_id_z:
8212 case Intrinsic::amdgcn_cluster_id_x:
8213 return ST.hasClusters() &&
8216 case Intrinsic::amdgcn_cluster_id_y:
8217 return ST.hasClusters() &&
8220 case Intrinsic::amdgcn_cluster_id_z:
8221 return ST.hasClusters() &&
8224 case Intrinsic::amdgcn_cluster_workgroup_id_x:
8225 return ST.hasClusters() &&
8228 case Intrinsic::amdgcn_cluster_workgroup_id_y:
8229 return ST.hasClusters() &&
8232 case Intrinsic::amdgcn_cluster_workgroup_id_z:
8233 return ST.hasClusters() &&
8236 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
8237 return ST.hasClusters() &&
8239 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
8240 return ST.hasClusters() &&
8243 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
8244 return ST.hasClusters() &&
8247 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
8248 return ST.hasClusters() &&
8251 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
8252 return ST.hasClusters() &&
8256 case Intrinsic::amdgcn_wave_id:
8258 case Intrinsic::amdgcn_lds_kernel_id:
8261 case Intrinsic::amdgcn_dispatch_ptr:
8264 case Intrinsic::amdgcn_queue_ptr:
8267 case Intrinsic::amdgcn_implicit_buffer_ptr:
8270 case Intrinsic::amdgcn_dispatch_id:
8273 case Intrinsic::r600_read_ngroups_x:
8277 case Intrinsic::r600_read_ngroups_y:
8280 case Intrinsic::r600_read_ngroups_z:
8283 case Intrinsic::r600_read_local_size_x:
8286 case Intrinsic::r600_read_local_size_y:
8290 case Intrinsic::r600_read_local_size_z:
8293 case Intrinsic::amdgcn_fdiv_fast:
8295 case Intrinsic::amdgcn_is_shared:
8297 case Intrinsic::amdgcn_is_private:
8299 case Intrinsic::amdgcn_wavefrontsize: {
8300 B.buildConstant(
MI.getOperand(0), ST.getWavefrontSize());
8301 MI.eraseFromParent();
8304 case Intrinsic::amdgcn_s_buffer_load:
8306 case Intrinsic::amdgcn_raw_buffer_store:
8307 case Intrinsic::amdgcn_raw_ptr_buffer_store:
8308 case Intrinsic::amdgcn_struct_buffer_store:
8309 case Intrinsic::amdgcn_struct_ptr_buffer_store:
8311 case Intrinsic::amdgcn_raw_buffer_store_format:
8312 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
8313 case Intrinsic::amdgcn_struct_buffer_store_format:
8314 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
8316 case Intrinsic::amdgcn_raw_tbuffer_store:
8317 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
8318 case Intrinsic::amdgcn_struct_tbuffer_store:
8319 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
8321 case Intrinsic::amdgcn_raw_buffer_load:
8322 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8323 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8324 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8325 case Intrinsic::amdgcn_struct_buffer_load:
8326 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8327 case Intrinsic::amdgcn_struct_atomic_buffer_load:
8328 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
8330 case Intrinsic::amdgcn_raw_buffer_load_format:
8331 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
8332 case Intrinsic::amdgcn_struct_buffer_load_format:
8333 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
8335 case Intrinsic::amdgcn_raw_tbuffer_load:
8336 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
8337 case Intrinsic::amdgcn_struct_tbuffer_load:
8338 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
8340 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8341 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8342 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8343 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8344 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8345 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8346 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8347 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8348 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8349 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8350 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8351 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8352 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8353 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8354 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8355 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8356 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8357 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8358 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8359 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8360 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8361 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8362 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8363 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8364 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8365 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8366 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8367 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8368 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8369 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8370 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8371 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8372 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8373 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8374 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8375 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8376 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8377 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8378 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8379 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8380 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8381 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8382 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8383 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8384 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8385 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8386 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8387 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8388 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
8389 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
8390 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
8391 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
8392 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8393 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8394 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8395 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8396 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8397 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8398 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8399 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8400 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
8401 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
8402 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
8403 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
8404 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8405 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
8406 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8407 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
8408 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8409 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8410 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8411 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8413 case Intrinsic::amdgcn_rsq_clamp:
8415 case Intrinsic::amdgcn_image_bvh_intersect_ray:
8417 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
8418 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
8420 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
8421 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
8422 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
8423 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
8424 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
8425 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
8426 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
8427 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
8431 if (IndexArgTy !=
S64) {
8432 auto NewIndex = IndexArgTy.
isVector() ?
B.buildBitcast(
S64, Index)
8433 :
B.buildAnyExt(
S64, Index);
8434 MI.getOperand(5).setReg(NewIndex.getReg(0));
8438 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8439 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8440 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8441 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8442 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8443 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8444 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8445 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8449 MI.getOperand(5).setReg(
B.buildAnyExt(
S32, Index).getReg(0));
8452 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
8453 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
8454 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
8455 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
8456 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
8457 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
8458 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8459 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8460 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8462 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
8466 if (IndexArgTy != IdxTy) {
8467 auto NewIndex = IndexArgTy.
isVector() ?
B.buildBitcast(IdxTy, Index)
8468 :
B.buildAnyExt(IdxTy, Index);
8469 MI.getOperand(7).setReg(NewIndex.getReg(0));
8474 case Intrinsic::amdgcn_fmed3: {
8480 MI.setDesc(
B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
8481 MI.removeOperand(1);
8485 case Intrinsic::amdgcn_readlane:
8486 case Intrinsic::amdgcn_writelane:
8487 case Intrinsic::amdgcn_readfirstlane:
8488 case Intrinsic::amdgcn_permlane16:
8489 case Intrinsic::amdgcn_permlanex16:
8490 case Intrinsic::amdgcn_permlane64:
8491 case Intrinsic::amdgcn_set_inactive:
8492 case Intrinsic::amdgcn_set_inactive_chain_arg:
8493 case Intrinsic::amdgcn_mov_dpp8:
8494 case Intrinsic::amdgcn_update_dpp:
8495 case Intrinsic::amdgcn_permlane_bcast:
8496 case Intrinsic::amdgcn_permlane_up:
8497 case Intrinsic::amdgcn_permlane_down:
8498 case Intrinsic::amdgcn_permlane_xor:
8500 case Intrinsic::amdgcn_s_buffer_prefetch_data:
8502 case Intrinsic::amdgcn_dead: {
8506 MI.eraseFromParent();
8509 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
8510 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
8511 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
8512 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8513 B.buildLoad(
MI.getOperand(0),
MI.getOperand(2), **
MI.memoperands_begin());
8514 MI.eraseFromParent();
8516 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
8517 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
8518 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
8519 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8520 B.buildStore(
MI.getOperand(2),
MI.getOperand(1), **
MI.memoperands_begin());
8521 MI.eraseFromParent();
8523 case Intrinsic::amdgcn_av_load_b128:
8524 case Intrinsic::amdgcn_av_store_b128: {
8526 if (!ST.hasFlatGlobalInsts()) {
8527 const char *Name = IntrID == Intrinsic::amdgcn_av_load_b128
8528 ?
"llvm.amdgcn.av.load.b128"
8529 :
"llvm.amdgcn.av.store.b128";
8532 Fn,
Twine(Name) +
" not supported on subtarget",
MI.getDebugLoc()));
8535 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8536 if (IntrID == Intrinsic::amdgcn_av_load_b128)
8537 B.buildLoad(
MI.getOperand(0),
MI.getOperand(2), **
MI.memoperands_begin());
8539 B.buildStore(
MI.getOperand(2),
MI.getOperand(1),
8540 **
MI.memoperands_begin());
8541 MI.eraseFromParent();
8544 case Intrinsic::amdgcn_flat_load_monitor_b32:
8545 case Intrinsic::amdgcn_flat_load_monitor_b64:
8546 case Intrinsic::amdgcn_flat_load_monitor_b128:
8547 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8548 B.buildInstr(AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR)
8549 .add(
MI.getOperand(0))
8550 .add(
MI.getOperand(2))
8551 .addMemOperand(*
MI.memoperands_begin());
8552 MI.eraseFromParent();
8554 case Intrinsic::amdgcn_global_load_monitor_b32:
8555 case Intrinsic::amdgcn_global_load_monitor_b64:
8556 case Intrinsic::amdgcn_global_load_monitor_b128:
8557 assert(
MI.hasOneMemOperand() &&
"Expected IRTranslator to set MemOp!");
8558 B.buildInstr(AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR)
8559 .add(
MI.getOperand(0))
8560 .add(
MI.getOperand(2))
8561 .addMemOperand(*
MI.memoperands_begin());
8562 MI.eraseFromParent();
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID)
static LLT getBufferRsrcScalarType(const LLT Ty)
static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST, unsigned TypeIdx)
static cl::opt< bool > EnableNewLegality("amdgpu-global-isel-new-legality", cl::desc("Use GlobalISel desired legality, rather than try to use" "rules compatible with selection patterns"), cl::init(false), cl::ReallyHidden)
static MachineInstrBuilder buildExp(MachineIRBuilder &B, const DstOp &Dst, const SrcOp &Src, unsigned Flags)
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, unsigned Flags)
constexpr std::initializer_list< LLT > AllVectors
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)
static LegalityPredicate isSmallOddVector(unsigned TypeIdx)
static LegalizeMutation oneMoreElement(unsigned TypeIdx)
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size)
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags)
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, const LLT MemTy)
Return true if a load or store of the type should be lowered with a bitcast to a different type.
static constexpr unsigned FPEnvModeBitField
static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx)
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size)
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
static bool isRegisterVectorElementType(LLT EltTy)
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)
static LegalityPredicate isWideVec16(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllScalarTypes
static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllS32Vectors
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx)
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B)
Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is the form in which the valu...
static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty)
static std::pair< Register, Register > emitReciprocalU64(MachineIRBuilder &B, Register Val)
static LLT getBitcastRegisterType(const LLT Ty)
static LLT getBufferRsrcRegisterType(const LLT Ty)
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx)
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI)
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, MachineRegisterInfo &MRI, unsigned Idx)
Mutates IR (typically a load instruction) to use a <4 x s32> as the initial type of the operand idx an...
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C)
static constexpr unsigned SPDenormModeBitField
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, bool IsLoad, bool IsAtomic)
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllS64Vectors
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
static constexpr unsigned FPEnvTrapBitField
static constexpr unsigned MaxRegisterSize
static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size)
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static bool hasBufferRsrcWorkaround(const LLT Ty)
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
constexpr std::initializer_list< LLT > AllS16Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
static bool isRegisterVectorType(LLT Ty)
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
static bool isRegisterType(const GCNSubtarget &ST, LLT Ty)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
This file declares the targeting of the MachineLegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Interface for Targets to specify which operations they can successfully select and how the others sho...
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
#define FP_DENORM_FLUSH_NONE
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static constexpr int Concat[]
bool legalizeConstHwRegRead(MachineInstr &MI, MachineIRBuilder &B, AMDGPU::Hwreg::Id HwReg, unsigned LowBit, unsigned Width) const
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeInsert(LegalizerHelper &Helper, MachineInstr &MI) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeBVHIntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeCTLZ_ZERO_POISON(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferStore(MachineInstr &MI, LegalizerHelper &Helper, bool IsTyped, bool IsFormat) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSBufferPrefetch(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFExp10Unsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafeImpl(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags, bool IsExp10) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBVHDualOrBVH8IntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFEXPF64(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtract(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeBufferLoad(MachineInstr &MI, LegalizerHelper &Helper, bool IsFormat, bool IsTyped) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeCTLS(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkGroupId(MachineInstr &MI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy, bool IsFormat) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
void buildLoadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
bool isModuleEntryFunction() const
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool isBottomOfStack() const
bool isEntryFunction() const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const fltSemantics & IEEEsingle()
static const fltSemantics & IEEEdouble()
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
Get the array size.
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
@ ICMP_SLT
signed less than
@ FCMP_OLT
0 1 0 0 True if ordered and less than
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
@ ICMP_UGE
unsigned greater or equal
@ ICMP_SGT
signed greater than
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
@ ICMP_ULT
unsigned less than
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
ConstantFP - Floating Point Values [float, double].
LLVM_ABI bool isExactlyValue(const APFloat &V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
This is the shared class of boolean and integer constants.
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
Simple wrapper observer that takes several observers, and calls each one for each event.
KnownBits getKnownBits(Register R)
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
LLT getScalarType() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr bool isAnyScalar() const
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & unsupported()
The instruction is unsupported.
LegalizeRuleSet & scalarSameSizeAs(unsigned TypeIdx, unsigned SameSizeIdx)
Change the type TypeIdx to have the same scalar size as type SameSizeIdx.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & minScalarOrElt(unsigned TypeIdx, const LLT Ty)
Ensure the scalar or element is at least as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & unsupportedFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & lowerFor(std::initializer_list< LLT > Types)
The instruction is lowered when type index 0 is any type in the given list.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & maxScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Conditionally limit the maximum size of the scalar.
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalForCartesianProduct(std::initializer_list< LLT > Types)
The instruction is legal when type indexes 0 and 1 are both in the given list.
LegalizeRuleSet & minScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty if condition is met.
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
LLVM_ABI LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
LLVM_ABI void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
LLVM_ABI LegalizeResult lowerInsert(MachineInstr &MI)
LLVM_ABI LegalizeResult lowerExtract(MachineInstr &MI)
GISelValueTracking * getValueTracking() const
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
LLVM_ABI void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LLVM_ABI LegalizeResult lowerFMad(MachineInstr &MI)
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
LLVM_ABI void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
use_instr_nodbg_iterator use_instr_nodbg_begin(Register RegNo) const
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
LLVM_ABI Register createGenericVirtualRegister(LLT Ty, StringRef Name="")
Create and return a new generic virtual register with low-level type Ty.
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
Represent a mutable reference to an array (0 or more elements consecutively in memory),...
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
constexpr bool isValid() const
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool hasWorkGroupIDZ() const
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void truncate(size_type N)
Like resize, but requires that N is less than size().
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
unsigned getPointerSizeInBits(unsigned AS) const
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
A Use represents the edge between a Value definition and its users.
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isGFX11Plus(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
LLVM_ABI LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the ...
LLVM_ABI LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LLVM_ABI LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LLVM_ABI LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LLVM_ABI LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than second type index.
LLVM_ABI LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than second type index.
LLVM_ABI LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LLVM_ABI LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LLVM_ABI LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the...
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LLVM_ABI LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LLVM_ABI LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LLVM_ABI LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LLVM_ABI LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LLVM_ABI LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
Invariant opcodes: All instruction sets have these as their low opcodes.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
LLVM_ABI Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Undef
Value of the register doesn't matter.
LLVM_ABI const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT whose value fits in int64_t, returns it.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
constexpr bool has_single_bit(T Value) noexcept
std::function< bool(const LegalityQuery &)> LegalityPredicate
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
To bit_cast(const From &from) noexcept
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
FunctionAddr VTableAddr Next
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned BitWidth
LLVM_ABI void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
unsigned Log2(Align A)
Returns the log2 of the alignment.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
@ CLUSTER_WORKGROUP_MAX_ID_X
@ CLUSTER_WORKGROUP_MAX_ID_Z
@ CLUSTER_WORKGROUP_MAX_FLAT_ID
@ CLUSTER_WORKGROUP_MAX_ID_Y
static constexpr uint64_t encode(Fields... Values)
MIMGBaseOpcode BaseOpcode
This struct is a compact representation of a valid (non-zero power of two) alignment.
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
This class contains a discriminated union of information about pointers in memory operands,...
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.