#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;
    "amdgpu-global-isel-new-legality",
    cl::desc("Use GlobalISel desired legality, rather than try to use "
             "rules compatible with selection patterns"),
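// The lambdas below implement the file-local legality predicates and type
// mutations used by the rules in the constructor: they inspect one type index
// of a LegalityQuery and return either a boolean or a (type index, new LLT)
// pair describing how to mutate the instruction.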
    const LLT Ty = Query.Types[TypeIdx];
    EltSize > 1 && EltSize < 32 &&
    const LLT Ty = Query.Types[TypeIdx];
    const LLT Ty = Query.Types[TypeIdx];
    const LLT Ty = Query.Types[TypeIdx];
    return std::pair(TypeIdx,
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Pieces = (Size + 63) / 64;
    const LLT Ty = Query.Types[TypeIdx];
    const int NextMul32 = (Size + 31) / 32;
    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    const LLT Ty = Query.Types[TypeIdx];
    assert(EltSize == 32 || EltSize == 64);
    for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT Ty = Query.Types[TypeIdx];
    const LLT QueryTy = Query.Types[TypeIdx];
    const LLT QueryTy = Query.Types[TypeIdx];
    const LLT QueryTy = Query.Types[TypeIdx];
    return EltSize == 16 || EltSize % 32 == 0;
    return EltSize == 32 || EltSize == 64 ||
           EltSize == 128 || EltSize == 256;
    LLT Ty = Query.Types[TypeIdx];
    const LLT QueryTy = Query.Types[TypeIdx];
    const LLT Ty = Query.Types[TypeIdx];
    Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
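// Maximum legal memory access width, in bits, for a given address space:
// private/scratch is limited to 32 bits unless flat scratch (or multi-dword
// scratch addressing) is available, LDS/region allows 64 or 128 bits
// depending on useDS128(), and global-like loads may be up to 512 bits wide.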
                                    bool IsLoad, bool IsAtomic) {
    return ST.enableFlatScratch() ? 128 : 32;
    return ST.useDS128() ? 128 : 64;
    return IsLoad ? 512 : 128;
    return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
  unsigned AS = Query.Types[1].getAddressSpace();
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
                         AtomicOrdering::NotAtomic))
  if (!ST.hasDwordx3LoadStores())
  if (AlignBits < MemSize) {
                                                 Align(AlignBits / 8)))
  return EltSize != 32 && EltSize != 64;
  if (Size != MemSizeInBits)
                               uint64_t AlignInBits, unsigned AddrSpace,
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
  if (AlignInBits < RoundedSize)
      RoundedSize, AddrSpace, Align(AlignInBits / 8),
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
  const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
    std::array<Register, 4> VectorElems;
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    for (unsigned I = 0; I < NumParts; ++I)
          B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
    B.buildMergeValues(MO, VectorElems);
  Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
  B.buildIntToPtr(MO, Scalar);

  const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
    for (unsigned I = 0; I < NumParts; ++I)
    return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
  Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
  return B.buildBitcast(VectorTy, Scalar).getReg(0);
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {

  std::initializer_list<LLT> AllS32Vectors =
      {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
       V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
      {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
      GlobalPtr, ConstantPtr, FlatPtr
  const std::initializer_list<LLT> AddrSpaces32 = {
      LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};

  const std::initializer_list<LLT> FPTypesBase = {
  const std::initializer_list<LLT> FPTypes16 = {
  const std::initializer_list<LLT> FPTypesPK16 = {
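  // The remainder of the constructor is one getActionDefinitionsBuilder call
  // per group of generic opcodes, declaring which types are natively legal on
  // this subtarget and how everything else is widened, clamped, scalarized,
  // or routed to custom lowering.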
      .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
      .legalFor(AllS32Vectors)
      .legalFor({S32, S16, V2S16})
      .clampMaxNumElementsStrict(0, S16, 2)
      .clampMaxNumElementsStrict(0, S16, 2)
      .legalFor({S32, S16, V2S16})
      .minScalarOrElt(0, S16)
      .legalFor({S32, S16})
      .widenScalarToNextMultipleOf(0, 32)
      .legalFor({S32, S16})
      .widenScalarToNextMultipleOf(0, 32)
      .widenScalarToNextMultipleOf(0, 32);
  Mul.maxScalar(0, S32);
      .minScalarOrElt(0, S32)
      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
      .customFor({S32, S64})
      .clampScalar(0, S32, S64)
      .clampMaxNumElements(0, S8, 2)
      .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
      .clampScalar(0, S32, S64)
      {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
      .legalFor({{S32, S1}, {S32, S32}})
      .clampScalar(0, S32, S32)
      .legalFor({S1, S32, S64, S16, GlobalPtr,
                 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
      .clampScalar(0, S16, S64);

      {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
       G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
      .legalFor({S32, S64});
      .customFor({S32, S64});
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
      .legalFor(FPTypesPK16)
      .customFor({S32, S64})
      .clampScalar(0, S16, S64);
      .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
      .maxScalarIf(typeIs(0, S16), 1, S16)
      .clampScalar(1, S32, S32)
      .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64);
      .clampScalar(0, S32, S64);
      .legalFor({{S32, S32}, {S64, S32}})
      .clampScalar(0, S32, S64)
      .clampScalar(1, S32, S32)
      .clampScalar(1, S32, S32)
      .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
      .lowerFor({S64, V2S16});
      .lowerFor({S64, S16, V2S16});
  FMad.customFor({S32, S16});
  FMad.customFor({S32});
  FMad.customFor({S16});
  FRem.customFor({S16, S32, S64});
  FRem.minScalar(0, S32)
      .customFor({S32, S64});
      .clampMaxNumElements(0, S16, 2)
      .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
                 {S32, S1}, {S64, S1}, {S16, S1}})
      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(1, 32);
      .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
      .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
      .customFor({{S64, S32}, {S64, S64}})
      .narrowScalarFor({{S64, S16}}, changeTo(0, S32));

  getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)

  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .legalFor({S16, S32, S64})
        .clampScalar(0, S16, S64)
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .legalFor({S32, S64})
        .clampScalar(0, S32, S64)
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S32, S64)

  getActionDefinitionsBuilder(G_PTR_ADD)
      .unsupportedFor({BufferFatPtr, RsrcPtr})
      .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
      .scalarSameSizeAs(1, 0)

  getActionDefinitionsBuilder(G_ICMP)
      .legalForCartesianProduct(
          {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
      .legalForCartesianProduct(
          {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
      .widenScalarToNextPow2(1)
      .clampScalar(1, S32, S64)

  getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
      {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);

  if (ST.hasSALUFloatInsts())
    FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
      .widenScalarToNextPow2(1)
      .clampScalar(1, S32, S64)

  auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)

  getActionDefinitionsBuilder(G_FPOWI)
      .clampScalar(0, MinScalarFPTy, S32)

  auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
  Log2Ops.customFor({S32});
  if (ST.has16BitInsts())
    Log2Ops.legalFor({S16});
    Log2Ops.customFor({S16});
  Log2Ops.scalarize(0)

      getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
  LogOps.customFor({S32, S16});
  LogOps.clampScalar(0, MinScalarFPTy, S32)

  getActionDefinitionsBuilder(G_CTPOP)
      .legalFor({{S32, S32}, {S32, S64}})
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(1, 32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32);

  if (ST.has16BitInsts())
    getActionDefinitionsBuilder(G_IS_FPCLASS)
        .legalForCartesianProduct({S1}, FPTypes16)
        .widenScalarToNextPow2(1)
    getActionDefinitionsBuilder(G_IS_FPCLASS)
        .legalForCartesianProduct({S1}, FPTypesBase)
        .lowerFor({S1, S16})
        .widenScalarToNextPow2(1)

  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
      .clampScalar(0, S32, S32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32)
      .widenScalarToNextPow2(1, 32)

  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
      .legalFor({{S32, S32}, {S32, S64}})
      .clampScalar(0, S32, S32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32)
      .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
        .legalFor({S16, S32, V2S16})
        .clampMaxNumElementsStrict(0, S16, 2)
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
          .legalFor({S32, S16, V2S16})
          .clampMaxNumElements(0, S16, 2)
          .widenScalarToNextPow2(0)
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
          .legalFor({S32, S16})
          .widenScalarToNextPow2(0)
    getActionDefinitionsBuilder(G_BSWAP)
        .widenScalarToNextPow2(0)
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .widenScalarToNextPow2(0)

  getActionDefinitionsBuilder(G_INTTOPTR)
      .legalForCartesianProduct(AddrSpaces64, {S64})
      .legalForCartesianProduct(AddrSpaces32, {S32})

  getActionDefinitionsBuilder(G_PTRTOINT)
      .legalForCartesianProduct(AddrSpaces64, {S64})
      .legalForCartesianProduct(AddrSpaces32, {S32})

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
    unsigned NumRegs = (MemSize + 31) / 32;
    if (!ST.hasDwordx3LoadStores())

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
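  // Load/store legalization is the most involved set of rules: each
  // (value type, pointer type, memory type, alignment) combination is either
  // listed as legal below, bitcast to a 32/64-bit element form, split into
  // legal-sized pieces, or widened to the next power of two.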
  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {V2S32, GlobalPtr, V2S32, GlobalAlign32},
                                      {V4S32, GlobalPtr, V4S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {V2S64, GlobalPtr, V2S64, GlobalAlign32},
                                      {V2S16, GlobalPtr, V2S16, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},

                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {V2S32, LocalPtr, V2S32, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {V2S16, LocalPtr, S32, 32},

                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {V2S16, PrivatePtr, S32, 32},

                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32},
                                      {V4S32, ConstantPtr, V4S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});

    Actions.unsupportedIf(typeInSet(1, {BufferFatPtr, RsrcPtr}));

    Actions.customIf(typeIs(1, Constant32Ptr));

          return !Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
          if (DstSize > MemSize)
          if (MemSize > MaxSize)
          return Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
          if (MemSize > MaxSize) {
            if (MaxSize % EltSize == 0) {
            unsigned NumPieces = MemSize / MaxSize;
            if (NumPieces == 1 || NumPieces >= NumElts ||
                NumElts % NumPieces != 0)
              return std::pair(0, EltTy);
            return std::pair(0, EltTy);
          return std::pair(0, EltTy);
        .widenScalarToNextPow2(0)

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                                  {S32, GlobalPtr, S16, 2 * 8},
                                                  {S32, LocalPtr, S8, 8},
                                                  {S32, LocalPtr, S16, 16},
                                                  {S32, PrivatePtr, S8, 8},
                                                  {S32, PrivatePtr, S16, 16},
                                                  {S32, ConstantPtr, S8, 8},
                                                  {S32, ConstantPtr, S16, 2 * 8}})

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});

  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)

  auto &Atomics = getActionDefinitionsBuilder(
      {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
       G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
       G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
       G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
      .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
                 {S64, GlobalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAdd()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasAtomicDsPkAdd16Insts())
      Atomic.legalFor({{V2S16, LocalPtr}});
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});
  if (ST.hasFlatAtomicFaddF32Inst())
    Atomic.legalFor({{S32, FlatPtr}});

  if (ST.hasGFX90AInsts()) {

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
      .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                  {S32, FlatPtr}, {S64, FlatPtr}})
      .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});

  getActionDefinitionsBuilder(G_SELECT)
      .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
                                 LocalPtr, FlatPtr, PrivatePtr,
      .clampScalar(0, S16, S64)
      .clampMaxNumElements(0, S32, 2)
      .clampMaxNumElements(0, LocalPtr, 2)
      .clampMaxNumElements(0, PrivatePtr, 2)
      .widenScalarToNextPow2(0)

  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
                     .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
      Shifts.legalFor({{S16, S16}});

    Shifts.widenScalarIf(
          const LLT AmountTy = Query.Types[1];
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
  Shifts.scalarize(0);
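  // G_EXTRACT_VECTOR_ELT and G_INSERT_VECTOR_ELT share one set of rules; only
  // the type indices for the vector and the element differ, so both are
  // configured from the same loop below.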
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const bool isLegalVecType =
          return (EltSize == 32 || EltSize == 64) &&
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
        .clampScalar(EltTypeIdx, S32, S64)
        .clampScalar(VecTypeIdx, S32, S64)
        .clampScalar(IdxTypeIdx, S32, S32)
        .clampMaxNumElements(VecTypeIdx, S32, 32)

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    getActionDefinitionsBuilder(Op)
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
        .widenScalarToNextPow2(BigTyIdx, 32);

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
                          .legalForCartesianProduct(AllS32Vectors, {S32})
                          .legalForCartesianProduct(AllS64Vectors, {S64})
                          .clampNumElements(0, V16S32, V32S32)
                          .clampNumElements(0, V2S64, V16S64)

  if (ST.hasScalarPackInsts()) {
        .minScalarOrElt(0, S16)

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
        .legalFor({V2S16, S32})
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
        .customFor({V2S16, S32})

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
      .clampMaxNumElements(0, S32, 32)
      .clampMaxNumElements(1, S16, 2)
      .clampMaxNumElements(0, S16, 64);
  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];

    auto &Builder = getActionDefinitionsBuilder(Op)
        .lowerFor({{S16, V2S16}})
          const LLT BigTy = Query.Types[BigTyIdx];
        .widenScalarToNextPow2(LitTyIdx, 16)
        .clampScalar(LitTyIdx, S32, S512)
        .widenScalarToNextPow2(LitTyIdx, 32)
            [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
            [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
            const LLT Ty = Query.Types[LitTyIdx];

    Builder.widenScalarIf(
          const LLT Ty = Query.Types[BigTyIdx];
          const LLT &Ty = Query.Types[BigTyIdx];
          if (NewSizeInBits >= 256) {
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));

  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
                        .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
        .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
    SextInReg.lowerFor({{S32}, {S64}});
      .clampScalar(0, S32, S64)

  getActionDefinitionsBuilder({G_ROTR, G_ROTL})

  getActionDefinitionsBuilder(G_FSHR)
      .legalFor({{S32, S32}})
      .lowerFor({{V2S16, V2S16}})
      .clampMaxNumElementsStrict(0, S16, 2)

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
        .lowerFor({{V2S16, V2S16}})
        .clampMaxNumElementsStrict(0, S16, 2)
    getActionDefinitionsBuilder(G_FSHL)

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)

  getActionDefinitionsBuilder(G_FENCE)

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
      .legalFor({{S32, S32}, {S64, S32}})
      .clampScalar(1, S32, S32)
      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(0)

  getActionDefinitionsBuilder({
      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                               G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                               G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})

  getLegacyLegalizerInfo().computeTables();
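// legalizeCustom() dispatches everything marked customFor/customIf above: it
// switches on the generic opcode and forwards to the matching legalize*
// helper, which rewrites the instruction in place.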
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP:
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
  case TargetOpcode::G_INSERT_VECTOR_ELT:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_GLOBAL_VALUE:
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_STORE:
  case TargetOpcode::G_FMAD:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FFREXP:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP10:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
  case TargetOpcode::G_STACKSAVE:
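// getSegmentAperture returns the high 32 bits of the flat address range that
// corresponds to the given LDS/private segment: read from the aperture
// registers when the subtarget has them, otherwise loaded from the implicit
// kernarg segment or from the queue pointer.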
  if (ST.hasApertureRegs()) {
                                 ? AMDGPU::SRC_SHARED_BASE
                                 : AMDGPU::SRC_PRIVATE_BASE;
    Register Dst = MRI.createGenericVirtualRegister(S64);
    MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
    return B.buildUnmerge(S32, Dst).getReg(1);

  Register LoadAddr = MRI.createGenericVirtualRegister(
        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
    Register KernargPtrReg = MRI.createGenericVirtualRegister(
    B.buildPtrAdd(LoadAddr, KernargPtrReg,
    return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);

  Register QueuePtr = MRI.createGenericVirtualRegister(
  B.buildPtrAdd(LoadAddr, QueuePtr,
                B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);

  switch (Def->getOpcode()) {
  case AMDGPU::G_FRAME_INDEX:
  case AMDGPU::G_GLOBAL_VALUE:
  case AMDGPU::G_BLOCK_ADDR:
  case AMDGPU::G_CONSTANT: {
    const ConstantInt *CI = Def->getOperand(1).getCImm();
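// Address-space cast lowering: a no-op cast becomes a bitcast, flat->local/
// private extracts the low 32 bits, and local/private->flat merges the low
// bits with the segment aperture, inserting null checks unless the source is
// known to be non-null.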
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));

      B.buildExtract(Dst, Src, 0);
      MI.eraseFromParent();

    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();

    Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

    auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});

      B.buildCopy(Dst, BuildPtr);
      MI.eraseFromParent();

    auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
    auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

                              SegmentNull.getReg(0));

    B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

    MI.eraseFromParent();

    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();

    auto PtrLo = B.buildPtrToInt(S32, Src);
    auto HighAddr = B.buildConstant(S32, AddrHiVal);
    B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
    MI.eraseFromParent();

      MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
  Ctx.diagnose(InvalidAddrSpaceCast);

  MI.eraseFromParent();
  LLT Ty = MRI.getType(Src);

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);

  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
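// G_FREM is expanded as fma(-trunc(x / y), y, x), i.e. x - trunc(x / y) * y.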
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();

  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
                     .addUse(Const0.getReg(0))
                     .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  const unsigned FractBits = 52;

  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);

  if (MRI.getType(Dst) == S64) {
    auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                        : B.buildUITOFP(S64, Unmerge.getReg(1));

    auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
    auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);

    B.buildFAdd(Dst, LdExp, CvtLo);
    MI.eraseFromParent();

  auto One = B.buildConstant(S32, 1);

    auto ThirtyOne = B.buildConstant(S32, 31);
    auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
    auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
    auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
    auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
                  .addUse(Unmerge.getReg(1));
    auto LS2 = B.buildSub(S32, LS, One);
    ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
    ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
  B.buildFLdexp(Dst, FVal, Scale);
  MI.eraseFromParent();
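// 64-bit FP-to-int: scale the truncated input by 2^-32, floor it to get the
// high word, then fma the high word back (times -2^32) to recover the low
// word; the signed f32 case handles the sign separately via xor/sub.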
  const LLT SrcLT = MRI.getType(Src);
  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);

  if (Signed && SrcLT == S32) {
    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);

    K0 = B.buildFConstant(
        S64, llvm::bit_cast<double>(UINT64_C(0x3df0000000000000)));
    K1 = B.buildFConstant(
        S64, llvm::bit_cast<double>(UINT64_C(0xc1f0000000000000)));
    K0 = B.buildFConstant(
        S32, llvm::bit_cast<float>(UINT32_C(0x2f800000)));
    K1 = B.buildFConstant(
        S32, llvm::bit_cast<float>(UINT32_C(0xcf800000)));

  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);

  auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
                                     : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

  if (Signed && SrcLT == S32) {
    Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
    B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
  LLT VecTy = MRI.getType(Vec);

    auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
    auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
    B.buildIntToPtr(Dst, IntElt);

    MI.eraseFromParent();

  std::optional<ValueAndVReg> MaybeIdxVal =
  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

    auto Unmerge = B.buildUnmerge(EltTy, Vec);
    B.buildCopy(Dst, Unmerge.getReg(IdxVal));

  MI.eraseFromParent();

  LLT VecTy = MRI.getType(Vec);

    auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
    auto IntIns = B.buildPtrToInt(IntTy, Ins);
    auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
    B.buildIntToPtr(Dst, IntVecDest);
    MI.eraseFromParent();

  std::optional<ValueAndVReg> MaybeIdxVal =
  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

  if (IdxVal < NumElts) {
    for (unsigned i = 0; i < NumElts; ++i)
      SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
    B.buildUnmerge(SrcRegs, Vec);

    SrcRegs[IdxVal] = MI.getOperand(2).getReg();
    B.buildMergeLikeInstr(Dst, SrcRegs);

  MI.eraseFromParent();
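// sin/cos lowering: scale the input by 1/2pi (through amdgcn.fract when the
// subtarget needs the reduced range) and feed it to amdgcn.sin / amdgcn.cos.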
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
                  .addUse(MulVal.getReg(0))
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

      Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  MI.eraseFromParent();

                                                 unsigned GAFlags) const {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");

      B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  if (!B.getMRI()->getRegClassOrNull(PCReg))
    B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

    B.buildExtract(DstReg, PCReg, 0);

  Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
                        : MRI.createGenericVirtualRegister(S32);

  if (!MRI.getRegClassOrNull(AddrLo))
    MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);

  B.buildInstr(AMDGPU::S_MOV_B32)

  if (RequiresHighHalf) {
           "Must provide a 64-bit pointer type!");

    Register AddrHi = MRI.createGenericVirtualRegister(S32);
    MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_MOV_B32)

    if (!MRI.getRegClassOrNull(AddrDst))
      MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);

    B.buildMergeValues(AddrDst, {AddrLo, AddrHi});

    if (AddrDst != DstReg)
      B.buildCast(DstReg, AddrDst);
  } else if (AddrLo != DstReg) {
    B.buildCast(DstReg, AddrLo);
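// G_GLOBAL_VALUE lowering: LDS globals become offsets into the kernel's LDS
// allocation, while other globals are materialized PC-relative, as absolute
// 32/64-bit addresses, or via a GOT load, depending on how the target
// classifies the symbol.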
  LLT Ty = MRI.getType(DstReg);

        Fn, "local memory global used by non-kernel function",
        MI.getDebugLoc(),

      B.buildUndef(DstReg);
      MI.eraseFromParent();

    if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {

      auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
      B.buildIntToPtr(DstReg, Sz);
      MI.eraseFromParent();

                               *cast<GlobalVariable>(GV)));
    MI.eraseFromParent();

    MI.eraseFromParent();

    MI.eraseFromParent();

    MI.eraseFromParent();

  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();

  LLT PtrTy = MRI.getType(PtrReg);

    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    MI.getOperand(1).setReg(Cast.getReg(0));

  if (MI.getOpcode() != AMDGPU::G_LOAD)

  LLT ValTy = MRI.getType(ValReg);

  if (WideMemSize == ValSize) {
    MI.setMemRefs(MF, {WideMMO});

  if (ValSize > WideMemSize)

    WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
    B.buildTrunc(ValReg, WideLoad).getReg(0);

      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildExtract(ValReg, WideLoad, 0);

      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildDeleteTrailingVectorElements(ValReg, WideLoad);

  MI.eraseFromParent();

  Register DataReg = MI.getOperand(0).getReg();
  LLT DataTy = MRI.getType(DataReg);

  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);

  Register PackedVal = B.buildBuildVector(VecTy, {NewVal, CmpVal}).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
      .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  case TargetOpcode::G_INTRINSIC: {
    case Intrinsic::amdgcn_frexp_mant:
  case TargetOpcode::G_FFREXP: {
  case TargetOpcode::G_FPEXT: {

std::pair<Register, Register>
                                                  unsigned Flags) const {
  auto SmallestNormal = B.buildFConstant(
  auto IsLtSmallestNormal =

  auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
  auto One = B.buildFConstant(F32, 1.0);
      B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
  auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);

  return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
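// f32 log2 lowering: denormal inputs are pre-scaled by 2^32 (see the helper
// above), the hardware amdgcn.log intrinsic is applied, and 32.0 is
// subtracted from the result when the input was scaled.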
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();

    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
                    .addUse(Ext.getReg(0))
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();

    B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
    MI.eraseFromParent();

  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
                  .addUse(ScaledInput)

  auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
  auto Zero = B.buildFConstant(Ty, 0.0);
      B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
  B.buildFSub(Dst, Log2, ResultOffset, Flags);

  MI.eraseFromParent();

  auto FMul = B.buildFMul(Ty, X, Y, Flags);
  return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
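// G_FLOG / G_FLOG10 lowering: y = log2(x) is computed by the hardware
// intrinsic and then multiplied by log(2) or log10(2). In the precise path
// the constant is split into a head and a correction term (c/cc, or ch/ct
// when fast fma is unavailable) so the product keeps extra bits of accuracy;
// non-finite inputs and scaled denormal inputs are patched up at the end.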
  const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
  assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);

  unsigned Flags = MI.getFlags();
  const LLT Ty = MRI.getType(X);

      TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {

      Register LogVal = MRI.createGenericVirtualRegister(F32);
      auto PromoteSrc = B.buildFPExt(F32, X);
      B.buildFPTrunc(Dst, LogVal);

    MI.eraseFromParent();

      B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);

    const float c_log10 = 0x1.344134p-2f;
    const float cc_log10 = 0x1.09f79ep-26f;

    const float c_log = 0x1.62e42ep-1f;
    const float cc_log = 0x1.efa39ep-25f;

    auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
    auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);

    R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
    auto NegR = B.buildFNeg(Ty, R, Flags);
    auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
    auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
    R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);

    const float ch_log10 = 0x1.344000p-2f;
    const float ct_log10 = 0x1.3509f6p-18f;

    const float ch_log = 0x1.62e000p-1f;
    const float ct_log = 0x1.0bfbe8p-15f;

    auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
    auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);

    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto YH = B.buildAnd(Ty, Y, MaskConst);
    auto YT = B.buildFSub(Ty, Y, YH, Flags);
    auto YTCT = B.buildFMul(Ty, YT, CT, Flags);

        getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
    R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);

  const bool IsFiniteOnly =

  if (!IsFiniteOnly) {
    auto Fabs = B.buildFAbs(Ty, Y);
    R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);

    auto Zero = B.buildFConstant(Ty, 0.0);
        B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
    auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
    B.buildFSub(Dst, R, Shift, Flags);
    B.buildCopy(Dst, R);

  MI.eraseFromParent();
                                                       unsigned Flags) const {
  const double Log2BaseInverted =

  LLT Ty = B.getMRI()->getType(Dst);

    auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
    auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
    auto Zero = B.buildFConstant(Ty, 0.0);
        B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
    auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);

      B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
      auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
      B.buildFAdd(Dst, Mul, ResultOffset, Flags);

          ? B.buildFLog2(Ty, Src, Flags)
          : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})

  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
                    .addUse(Ext.getReg(0))
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();

    MI.eraseFromParent();

  auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
                                   RangeCheckConst, Flags);

  auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);
  auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
  auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(AddInput.getReg(0))

  auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
  auto One = B.buildFConstant(Ty, 1.0);
  auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
  B.buildFMul(Dst, Exp2, ResultScale, Flags);
  MI.eraseFromParent();

  LLT Ty = B.getMRI()->getType(Dst);

    auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
          .addUse(Mul.getReg(0))
      B.buildFExp2(Dst, Mul.getReg(0), Flags);

  auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);

  auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
  auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
  auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);

  auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(ExpInput.getReg(0))

  auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
  auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
  B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
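// Full G_FEXP / G_FEXP10 lowering: exp(x) is rewritten as exp2(x * log2(e))
// (or x * log2(10)) with the product split into high/low parts PH and PL,
// exp2 evaluated on the fractional part plus an ldexp by the integer part,
// and explicit clamping of results that would underflow to 0 or overflow to
// +inf.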
  const unsigned Flags = MI.getFlags();
  LLT Ty = MRI.getType(Dst);
  const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;

    MI.eraseFromParent();

    auto Ext = B.buildFPExt(F32, X, Flags);
    Register Lowered = MRI.createGenericVirtualRegister(F32);
    B.buildFPTrunc(Dst, Lowered, Flags);
    MI.eraseFromParent();

    MI.eraseFromParent();

  const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;

    const float cc_exp = 0x1.4ae0bep-26f;
    const float c_exp10 = 0x1.a934f0p+1f;
    const float cc_exp10 = 0x1.2f346ep-24f;

    auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
    PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
    auto NegPH = B.buildFNeg(Ty, PH, Flags);
    auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);

    auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
    PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);

    const float ch_exp = 0x1.714000p+0f;
    const float cl_exp = 0x1.47652ap-12f;

    const float ch_exp10 = 0x1.a92000p+1f;
    const float cl_exp10 = 0x1.4f0978p-11f;

    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto XH = B.buildAnd(Ty, X, MaskConst);
    auto XL = B.buildFSub(Ty, X, XH, Flags);

    auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
    PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);

    auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
    auto XLCL = B.buildFMul(Ty, XL, CL, Flags);

        getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
    PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);

  auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);

  auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
  auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(A.getReg(0))

  auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);

  auto UnderflowCheckConst =
      B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);

  R = B.buildSelect(Ty, Underflow, Zero, R);

  auto OverflowCheckConst =
      B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);

  R = B.buildSelect(Ty, Overflow, Inf, R, Flags);

  B.buildCopy(Dst, R);
  MI.eraseFromParent();
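// G_FPOW is expanded as exp2(y * log2(x)); the multiply uses the legacy
// semantics (0 * anything == 0), and the f16 variant does the multiply in
// f32 before truncating back.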
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Log = B.buildFLog2(F32, Src0, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Log.getReg(0))
    B.buildFExp2(Dst, Mul, Flags);
  } else if (Ty == F16) {
    auto Log = B.buildFLog2(F16, Src0, Flags);
    auto Ext0 = B.buildFPExt(F32, Log, Flags);
    auto Ext1 = B.buildFPExt(F32, Src1, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Ext0.getReg(0))
                   .addUse(Ext1.getReg(0))
    B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);

  MI.eraseFromParent();

    ModSrc = SrcFNeg->getOperand(1).getReg();
      ModSrc = SrcFAbs->getOperand(1).getReg();
    ModSrc = SrcFAbs->getOperand(1).getReg();

  Register OrigSrc = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
         "this should not have been custom lowered");

  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})

      B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));

  Register Min = MRI.createGenericVirtualRegister(F64);

    B.buildFMinNumIEEE(Min, Fract, Const, Flags);
    B.buildFMinNum(Min, Fract, Const, Flags);

    CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);

  auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

  MI.eraseFromParent();

  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
    Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
    Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);

  auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
  B.buildBitcast(Dst, Merge);

  MI.eraseFromParent();
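// buildMultiply expands a wide integer multiply into 32-bit words: each
// result word accumulates the partial products Src0[j0] * Src1[j1] with
// j0 + j1 == DstIndex, either with plain 32-bit mul/add-with-carry chains or
// with MAD_U64_U32 to fold the 64-bit accumulate, threading carries between
// the even- and odd-aligned halves.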
                                        bool UsePartialMad64_32,
                                        bool SeparateOddAlignedProducts) const {
  auto getZero32 = [&]() -> Register {
      Zero32 = B.buildConstant(S32, 0).getReg(0);
  auto getZero64 = [&]() -> Register {
      Zero64 = B.buildConstant(S64, 0).getReg(0);

  for (unsigned i = 0; i < Src0.size(); ++i) {

        if (CarryIn.empty())

        bool HaveCarryOut = true;
        if (CarryIn.size() == 1) {
            LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
            CarryAccum = getZero32();
          CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
          for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
                B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
            LocalAccum = getZero32();
            HaveCarryOut = false;

            B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
        LocalAccum = Add.getReg(0);

  auto buildMadChain =
        assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
               (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));

        if (LocalAccum.size() == 1 &&
            (!UsePartialMad64_32 || !CarryIn.empty())) {
            unsigned j1 = DstIndex - j0;
            if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
            auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
              LocalAccum[0] = Mul.getReg(0);
              if (CarryIn.empty()) {
                LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
                    B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
          } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));

        if (j0 <= DstIndex) {
          bool HaveSmallAccum = false;

          if (LocalAccum[0]) {
            if (LocalAccum.size() == 1) {
              Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
              HaveSmallAccum = true;
            } else if (LocalAccum[1]) {
              Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
              HaveSmallAccum = false;
              Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
              HaveSmallAccum = true;
            assert(LocalAccum.size() == 1 || !LocalAccum[1]);
            HaveSmallAccum = true;

            unsigned j1 = DstIndex - j0;
            if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
            auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
                                    {Src0[j0], Src1[j1], Tmp});
            Tmp = Mad.getReg(0);
            if (!HaveSmallAccum)
              CarryOut.push_back(Mad.getReg(1));
            HaveSmallAccum = false;
          } while (j0 <= DstIndex);

          auto Unmerge = B.buildUnmerge(S32, Tmp);
          LocalAccum[0] = Unmerge.getReg(0);
          if (LocalAccum.size() > 1)
            LocalAccum[1] = Unmerge.getReg(1);

    for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
      Carry OddCarryIn = std::move(OddCarry);
      Carry EvenCarryIn = std::move(EvenCarry);

      if (2 * i < Accum.size()) {
        auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
        EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);

        if (!SeparateOddAlignedProducts) {
          auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
          OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
          bool IsHighest = 2 * i >= Accum.size();
          OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);

            Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
            Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
            Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
          Accum[2 * i - 1] = Lo->getOperand(0).getReg();

            auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
                                   Lo->getOperand(1).getReg());
            Accum[2 * i] = Hi.getReg(0);
            SeparateOddCarry = Hi.getReg(1);

      if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
        EvenCarryIn.push_back(CarryOut);

      if (2 * i < Accum.size()) {
        if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
          OddCarry.push_back(CarryOut);

  assert(MI.getOpcode() == TargetOpcode::G_MUL);

  LLT Ty = MRI.getType(DstReg);

  unsigned NumParts = Size / 32;

  for (unsigned i = 0; i < NumParts; ++i) {
    Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
    Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
  B.buildUnmerge(Src0Parts, Src0);
  B.buildUnmerge(Src1Parts, Src1);

  buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
                SeparateOddAlignedProducts);

  B.buildMergeLikeInstr(DstReg, AccumRegs);
  MI.eraseFromParent();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
                        ? AMDGPU::G_AMDGPU_FFBH_U32
                        : AMDGPU::G_AMDGPU_FFBL_B32;
  auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
  MI.eraseFromParent();

  if (MI.getOpcode() != TargetOpcode::G_XOR)
  return ConstVal && *ConstVal == -1;

  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))

  if (!MRI.hasOneNonDBGUse(NegatedCond))

  UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);

  if (Next == Parent->end()) {
    UncondBrTarget = &*NextMBB;
    if (Next->getOpcode() != AMDGPU::G_BR)

                                  *ArgRC, B.getDebugLoc(), ArgTy);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = llvm::countr_zero<unsigned>(Mask);

      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
    B.buildCopy(DstReg, LiveIn);

      B.buildConstant(DstReg, 0);

    B.buildUndef(DstReg);

  if (!Arg->isRegister() || !Arg->getRegister().isValid())

  MI.eraseFromParent();

  B.buildConstant(MI.getOperand(0).getReg(), C);
  MI.eraseFromParent();

    B.buildUndef(DstReg);
    MI.eraseFromParent();

  if (Arg->isMasked()) {

  MI.eraseFromParent();

  Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);

  return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);

                                                    Align Alignment) const {
         "unexpected kernarg parameter type");

  B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
  MI.eraseFromParent();
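// 32-bit unsigned div/rem: take an approximate reciprocal of the denominator
// via RCP_IFLAG scaled to fixed point, refine it with one Newton-Raphson
// style correction, form a candidate quotient and remainder, then apply the
// conditional +1 / -Y fix-up steps.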
  LLT DstTy = MRI.getType(Dst);

  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  auto One = B.buildConstant(S32, 1);

  Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);

    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);

    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
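// emitReciprocalU64: convert the 64-bit denominator to a single float, take
// its reciprocal with RCP_IFLAG, scale by roughly 2^64, and split the
// fixed-point result back into low and high 32-bit words.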
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(
      B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 = B.buildFMul(
      S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));

  auto Mul2 = B.buildFMul(
      S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  auto Mad2 = B.buildFMAD(
      S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
  auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});

  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
  auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  auto C1 = B.buildSExt(S32, CmpHi);

  auto C2 = B.buildSExt(S32, CmpLo);

  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  auto C6 = B.buildSelect(

  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});

  auto Sel1 = B.buildSelect(

  auto Sel2 = B.buildSelect(
  switch (MI.getOpcode()) {
  case AMDGPU::G_UDIV: {
    DstDivReg = MI.getOperand(0).getReg();
  case AMDGPU::G_UREM: {
    DstRemReg = MI.getOperand(0).getReg();
  case AMDGPU::G_UDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
  Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

  MI.eraseFromParent();
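// Signed div/rem reduces to the unsigned expansion: make both operands
// non-negative using their sign bits (add then xor), run the unsigned
// algorithm, and restore the quotient/remainder signs with a final xor/sub.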
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty != S32 && Ty != S64)

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();

  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);

  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
  switch (MI.getOpcode()) {
  case AMDGPU::G_SDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
  case AMDGPU::G_SREM: {
    DstRemReg = MI.getOperand(0).getReg();
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
  case AMDGPU::G_SDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);

    auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
    auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
    B.buildSub(DstDivReg, SignXor, Sign);

    auto Sign = LHSign.getReg(0);
    auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
    B.buildSub(DstRemReg, SignXor, Sign);

  MI.eraseFromParent();
  LLT ResTy = MRI.getType(Res);

  if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))

    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)

      MI.eraseFromParent();

    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
          .addUse(FNeg.getReg(0))

      MI.eraseFromParent();

  if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
  B.buildFMul(Res, LHS, RCP, Flags);

  MI.eraseFromParent();
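// Fast f64 division: start from amdgcn.rcp(y) and apply two Newton-Raphson
// refinements, each written as r = fma(fma(-y, r, 1), r, r), followed by one
// fma-based correction of x * r.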
  LLT ResTy = MRI.getType(Res);

  if (!AllowInaccurateRcp)

  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);

  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})

  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);

  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);

  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();
  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(RHSExt.getReg(0))

  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(RDst.getReg(0))

  MI.eraseFromParent();

  unsigned SPDenormMode =

  if (ST.hasDenormModeInst()) {
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
        .addImm(NewDenormModeValue);
    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
        .addImm(SPDenormMode)
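// f32 division uses the div_scale / div_fmas / div_fixup sequence: scale the
// numerator and denominator, refine an rcp estimate with a chain of fmas
// (temporarily enabling f32 denormals if the current mode flushes them), and
// let div_fmas plus div_fixup produce the final result.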
  auto One = B.buildFConstant(S32, 1.0f);

  auto DenominatorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})

  auto NumeratorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                       .addUse(DenominatorScaled.getReg(0))
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  const bool HasDynamicDenormals =

  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      B.buildInstr(AMDGPU::S_GETREG_B32)
          .addDef(SavedSPDenormMode)

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      assert(SavedSPDenormMode);
      B.buildInstr(AMDGPU::S_SETREG_B32)
          .addReg(SavedSPDenormMode)

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma1.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(NumeratorScaled.getReg(1))

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(Fmas.getReg(0))

  MI.eraseFromParent();
  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
                 .addUse(DivScale0.getReg(0))

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge =