#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;
41 "amdgpu-global-isel-new-legality",
42 cl::desc(
"Use GlobalISel desired legality, rather than try to use"
43 "rules compatible with selection patterns"),
    const LLT Ty = Query.Types[TypeIdx];
           EltSize > 1 && EltSize < 32 &&
    const LLT Ty = Query.Types[TypeIdx];
    const LLT Ty = Query.Types[TypeIdx];
    const LLT Ty = Query.Types[TypeIdx];
    return std::pair(TypeIdx,
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Pieces = (Size + 63) / 64;
    const LLT Ty = Query.Types[TypeIdx];
    const int NextMul32 = (Size + 31) / 32;
    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    const LLT Ty = Query.Types[TypeIdx];
    assert(EltSize == 32 || EltSize == 64);
    for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT Ty = Query.Types[TypeIdx];
    const LLT QueryTy = Query.Types[TypeIdx];
    const LLT QueryTy = Query.Types[TypeIdx];
    const LLT QueryTy = Query.Types[TypeIdx];
    return EltSize == 16 || EltSize % 32 == 0;
    return EltSize == 32 || EltSize == 64 ||
           EltSize == 128 || EltSize == 256;
    LLT Ty = Query.Types[TypeIdx];
    const LLT QueryTy = Query.Types[TypeIdx];
    const LLT Ty = Query.Types[TypeIdx];
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
                                 bool IsLoad, bool IsAtomic) {
    return ST.enableFlatScratch() ? 128 : 32;
    return ST.useDS128() ? 128 : 64;
    return IsLoad ? 512 : 128;
    return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
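// These returns cap the widest single memory access, in bits, per address
// space: DS goes to 128 with DS128, while private (scratch) accesses are
// limited to 32 bits unless flat-scratch / multi-dword scratch addressing
// is available on the subtarget.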
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
  unsigned AS = Query.Types[1].getAddressSpace();
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
                             AtomicOrdering::NotAtomic))
  if (!ST.hasDwordx3LoadStores())
  if (AlignBits < MemSize) {
                                   Align(AlignBits / 8)))
  return EltSize != 32 && EltSize != 64;
  if (Size != MemSizeInBits)
                               uint64_t AlignInBits, unsigned AddrSpace,
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
  if (AlignInBits < RoundedSize)
      RoundedSize, AddrSpace, Align(AlignInBits / 8),
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
      Query.Types[1].getAddressSpace(), Opcode);
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {

  std::initializer_list<LLT> AllS32Vectors =
      {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
       V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
      {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
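  // AllS32Vectors / AllS64Vectors enumerate the whole-register vector types
  // the rule sets below accept wholesale, e.g. via .legalFor(AllS32Vectors).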
  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
      GlobalPtr, ConstantPtr, FlatPtr};

  const std::initializer_list<LLT> AddrSpaces32 = {
      LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr};

  const std::initializer_list<LLT> FPTypesBase = {
  const std::initializer_list<LLT> FPTypes16 = {
  const std::initializer_list<LLT> FPTypesPK16 = {

      .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
      .legalFor(AllS32Vectors)
567 .legalFor({S32, S16, V2S16})
568 .clampMaxNumElementsStrict(0, S16, 2)
576 .clampMaxNumElementsStrict(0, S16, 2)
584 .legalFor({S32, S16, V2S16})
585 .minScalarOrElt(0, S16)
592 .legalFor({S32, S16})
602 .widenScalarToNextMultipleOf(0, 32)
609 .legalFor({S32, S16})
624 .widenScalarToNextMultipleOf(0, 32)
632 .widenScalarToNextMultipleOf(0, 32);
637 Mul.maxScalar(0, S32);
643 .minScalarOrElt(0, S32)
662 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
663 .customFor({S32, S64})
664 .clampScalar(0, S32, S64)
674 .clampMaxNumElements(0, S8, 2)
685 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
686 .clampScalar(0, S32, S64)
693 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
694 .legalFor({{S32, S1}, {S32, S32}})
695 .clampScalar(0, S32, S32)
705 .
legalFor({S1, S32, S64, S16, GlobalPtr,
706 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
713 .clampScalar(0, S16, S64);
738 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
739 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
740 .legalFor({S32, S64});
742 .customFor({S32, S64});
757 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
791 .legalFor(FPTypesPK16)
798 .legalFor({S32, S64, S16})
800 .clampScalar(0, S16, S64);
805 .clampScalar(0, S32, S64);
810 .legalFor({S32, S64})
812 .clampScalar(0, S32, S64);
817 .clampScalar(0, S32, S64);
      .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
837 .lowerFor({S64, V2S16});
843 .lowerFor({S64, S16, V2S16});
853 FMad.customFor({S32, S16});
855 FMad.customFor({S32});
857 FMad.customFor({S16});
863 FRem.customFor({S16, S32, S64});
865 FRem.minScalar(0, S32)
866 .customFor({S32, S64});
874 .clampMaxNumElements(0, S16, 2)
882 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
883 {S32, S1}, {S64, S1}, {S16, S1}})
885 .clampScalar(0, S32, S64)
886 .widenScalarToNextPow2(1, 32);
890 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
901 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
902 .customFor({{S64, S32}, {S64, S64}})
      .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
914 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
920 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
  if (ST.has16BitInsts()) {
925 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
926 .legalFor({S16, S32, S64})
927 .clampScalar(0, S16, S64)
930 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
931 .legalFor({S32, S64})
932 .clampScalar(0, S32, S64)
935 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
938 .clampScalar(0, S32, S64)
942 getActionDefinitionsBuilder(G_PTR_ADD)
945 .scalarSameSizeAs(1, 0);
947 getActionDefinitionsBuilder(G_PTRMASK)
949 .scalarSameSizeAs(1, 0)
  getActionDefinitionsBuilder(G_ICMP)
      .legalForCartesianProduct(
          {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
      .legalForCartesianProduct(
          {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
      .widenScalarToNextPow2(1)
      .clampScalar(1, S32, S64)

  getActionDefinitionsBuilder(G_FCMP)
      .legalForCartesianProduct({S1},
                                ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
      .widenScalarToNextPow2(1)
      .clampScalar(1, S32, S64)
985 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
986 if (
ST.has16BitInsts())
987 Exp2Ops.legalFor({S32, S16});
989 Exp2Ops.legalFor({S32});
990 Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
991 Exp2Ops.scalarize(0);
993 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
994 if (
ST.has16BitInsts())
995 ExpOps.customFor({{S32}, {S16}});
997 ExpOps.customFor({S32});
998 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1001 getActionDefinitionsBuilder(G_FPOWI)
1002 .clampScalar(0, MinScalarFPTy, S32)
1006 getActionDefinitionsBuilder(G_CTPOP)
1007 .legalFor({{S32, S32}, {S32, S64}})
1008 .clampScalar(0, S32, S32)
1009 .widenScalarToNextPow2(1, 32)
1010 .clampScalar(1, S32, S64)
1012 .widenScalarToNextPow2(0, 32);
  if (ST.has16BitInsts())
1016 getActionDefinitionsBuilder(G_IS_FPCLASS)
1017 .legalForCartesianProduct({S1}, FPTypes16)
1018 .widenScalarToNextPow2(1)
1022 getActionDefinitionsBuilder(G_IS_FPCLASS)
1023 .legalForCartesianProduct({S1}, FPTypesBase)
1024 .lowerFor({S1, S16})
1025 .widenScalarToNextPow2(1)
1032 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1034 .clampScalar(0, S32, S32)
1035 .clampScalar(1, S32, S64)
1036 .widenScalarToNextPow2(0, 32)
1037 .widenScalarToNextPow2(1, 32)
1041 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
1042 .legalFor({{S32, S32}, {S32, S64}})
1043 .clampScalar(0, S32, S32)
1044 .clampScalar(1, S32, S64)
1046 .widenScalarToNextPow2(0, 32)
1047 .widenScalarToNextPow2(1, 32);
1051 getActionDefinitionsBuilder(G_BITREVERSE)
1052 .legalFor({S32, S64})
1053 .clampScalar(0, S32, S64)
1055 .widenScalarToNextPow2(0);
  if (ST.has16BitInsts()) {
1058 getActionDefinitionsBuilder(G_BSWAP)
1059 .legalFor({S16, S32, V2S16})
1060 .clampMaxNumElementsStrict(0, S16, 2)
1063 .widenScalarToNextPow2(0)
1064 .clampScalar(0, S16, S32)
    if (ST.hasVOP3PInsts()) {
1068 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1069 .legalFor({S32, S16, V2S16})
1071 .clampMaxNumElements(0, S16, 2)
1073 .widenScalarToNextPow2(0)
1077 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1078 .legalFor({S32, S16})
1079 .widenScalarToNextPow2(0)
1086 getActionDefinitionsBuilder(G_BSWAP)
1091 .widenScalarToNextPow2(0)
1096 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1099 .widenScalarToNextPow2(0)
1104 getActionDefinitionsBuilder(G_INTTOPTR)
1106 .legalForCartesianProduct(AddrSpaces64, {S64})
1107 .legalForCartesianProduct(AddrSpaces32, {S32})
1120 getActionDefinitionsBuilder(G_PTRTOINT)
1122 .legalForCartesianProduct(AddrSpaces64, {S64})
1123 .legalForCartesianProduct(AddrSpaces32, {S32})
  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
    unsigned NumRegs = (MemSize + 31) / 32;
    if (!ST.hasDwordx3LoadStores())

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;
1183 auto &Actions = getActionDefinitionsBuilder(Op);
1186 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1187 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1188 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1189 {S64, GlobalPtr, S64, GlobalAlign32},
1190 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1191 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1192 {S32, GlobalPtr, S8, GlobalAlign8},
1193 {S32, GlobalPtr, S16, GlobalAlign16},
1195 {S32, LocalPtr, S32, 32},
1196 {S64, LocalPtr, S64, 32},
1197 {V2S32, LocalPtr, V2S32, 32},
1198 {S32, LocalPtr, S8, 8},
1199 {S32, LocalPtr, S16, 16},
1200 {V2S16, LocalPtr, S32, 32},
1202 {S32, PrivatePtr, S32, 32},
1203 {S32, PrivatePtr, S8, 8},
1204 {S32, PrivatePtr, S16, 16},
1205 {V2S16, PrivatePtr, S32, 32},
1207 {S32, ConstantPtr, S32, GlobalAlign32},
1208 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1209 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1210 {S64, ConstantPtr, S64, GlobalAlign32},
1211 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
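    // Each tuple above is {result type, pointer type, memory type, minimum
    // alignment in bits}; the GlobalAlign* values collapse to 0 (no alignment
    // requirement) when unaligned buffer access is enabled on the subtarget.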
    Actions.customIf(typeIs(1, Constant32Ptr));

          return !Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
          if (DstSize > MemSize)
          if (MemSize > MaxSize)

          return Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
          if (MemSize > MaxSize) {
            if (MaxSize % EltSize == 0) {
              unsigned NumPieces = MemSize / MaxSize;
              if (NumPieces == 1 || NumPieces >= NumElts ||
                  NumElts % NumPieces != 0)
                return std::pair(0, EltTy);
            return std::pair(0, EltTy);
          return std::pair(0, EltTy);
        .widenScalarToNextPow2(0)
1340 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1341 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1342 {S32, GlobalPtr, S16, 2 * 8},
1343 {S32, LocalPtr, S8, 8},
1344 {S32, LocalPtr, S16, 16},
1345 {S32, PrivatePtr, S8, 8},
1346 {S32, PrivatePtr, S16, 16},
1347 {S32, ConstantPtr, S8, 8},
1348 {S32, ConstantPtr, S16, 2 * 8}})
1354 if (
ST.hasFlatAddressSpace()) {
1355 ExtLoads.legalForTypesWithMemDesc(
1356 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1364 ExtLoads.customIf(
typeIs(1, Constant32Ptr));
1366 ExtLoads.clampScalar(0, S32, S32)
1367 .widenScalarToNextPow2(0)
1370 auto &Atomics = getActionDefinitionsBuilder(
1371 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1372 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1373 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1374 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1375 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1376 {S64, GlobalPtr}, {S64, LocalPtr},
1377 {S32, RegionPtr}, {S64, RegionPtr}});
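  // Integer read-modify-write atomics are legal on 32- and 64-bit values for
  // global, LDS (local) and GDS (region) pointers; flat pointers are added
  // below only when the subtarget has a flat address space.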
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAdd()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasAtomicDsPkAdd16Insts())
      Atomic.legalFor({{V2S16, LocalPtr}});

  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});
  if (ST.hasFlatAtomicFaddF32Inst())
    Atomic.legalFor({{S32, FlatPtr}});

  if (ST.hasGFX90AInsts()) {

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
      .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                  {S32, FlatPtr}, {S64, FlatPtr}})
      .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});
  getActionDefinitionsBuilder(G_SELECT)
      .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
                                 LocalPtr, FlatPtr, PrivatePtr,
      .clampScalar(0, S16, S64)
      .clampMaxNumElements(0, S32, 2)
      .clampMaxNumElements(0, LocalPtr, 2)
      .clampMaxNumElements(0, PrivatePtr, 2)
      .widenScalarToNextPow2(0)

  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
                     .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
          .clampMaxNumElements(0, S16, 2);
      Shifts.legalFor({{S16, S16}});

    Shifts.widenScalarIf(
          const LLT AmountTy = Query.Types[1];
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})

    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})

  Shifts.scalarize(0);
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const bool isLegalVecType =
          return (EltSize == 32 || EltSize == 64) &&
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
        .clampScalar(EltTypeIdx, S32, S64)
        .clampScalar(VecTypeIdx, S32, S64)
        .clampScalar(IdxTypeIdx, S32, S32)
        .clampMaxNumElements(VecTypeIdx, S32, 32)

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    getActionDefinitionsBuilder(Op)
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
        .widenScalarToNextPow2(BigTyIdx, 32);

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
                          .legalForCartesianProduct(AllS32Vectors, {S32})
                          .legalForCartesianProduct(AllS64Vectors, {S64})
                          .clampNumElements(0, V16S32, V32S32)
                          .clampNumElements(0, V2S64, V16S64)

  if (ST.hasScalarPackInsts()) {
        .minScalarOrElt(0, S16)
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
        .legalFor({V2S16, S32})
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
        .customFor({V2S16, S32})

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
      .clampMaxNumElements(0, S32, 32)
      .clampMaxNumElements(1, S16, 2)
      .clampMaxNumElements(0, S16, 64);

  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];

    auto &Builder = getActionDefinitionsBuilder(Op)
                        .lowerFor({{S16, V2S16}})
                          const LLT BigTy = Query.Types[BigTyIdx];
                        .widenScalarToNextPow2(LitTyIdx, 16)
                        .clampScalar(LitTyIdx, S32, S512)
                        .widenScalarToNextPow2(LitTyIdx, 32)
                            [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
                            [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
                        .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
            const LLT Ty = Query.Types[LitTyIdx];

          const LLT Ty = Query.Types[BigTyIdx];
          const LLT &Ty = Query.Types[BigTyIdx];
          if (NewSizeInBits >= 256) {
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1692 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1693 .legalFor({{S32}, {S64}});
1695 if (
ST.hasVOP3PInsts()) {
1696 SextInReg.lowerFor({{V2S16}})
1700 .clampMaxNumElementsStrict(0, S16, 2);
1701 }
else if (
ST.has16BitInsts()) {
1702 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1706 SextInReg.lowerFor({{S32}, {S64}});
1711 .clampScalar(0, S32, S64)
1714 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1719 getActionDefinitionsBuilder(G_FSHR)
1720 .legalFor({{S32, S32}})
1721 .lowerFor({{V2S16, V2S16}})
1722 .clampMaxNumElementsStrict(0, S16, 2)
1726 if (
ST.hasVOP3PInsts()) {
1727 getActionDefinitionsBuilder(G_FSHL)
1728 .lowerFor({{V2S16, V2S16}})
1729 .clampMaxNumElementsStrict(0, S16, 2)
1733 getActionDefinitionsBuilder(G_FSHL)
1738 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1741 getActionDefinitionsBuilder(G_FENCE)
1744 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1749 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1750 .legalFor({{S32, S32}, {S64, S32}})
1751 .clampScalar(1, S32, S32)
1752 .clampScalar(0, S32, S64)
1753 .widenScalarToNextPow2(0)
1756 getActionDefinitionsBuilder({
1760 G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1769 G_FMINIMUM, G_FMAXIMUM}).lower();
1771 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1774 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1775 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1776 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1779 getLegacyLegalizerInfo().computeTables();
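  // computeTables() freezes the legacy rule tables; no further
  // getActionDefinitionsBuilder() calls may follow for this legalizer.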
  switch (MI.getOpcode()) {
1789 case TargetOpcode::G_ADDRSPACE_CAST:
1791 case TargetOpcode::G_FRINT:
1793 case TargetOpcode::G_FCEIL:
1795 case TargetOpcode::G_FREM:
1797 case TargetOpcode::G_INTRINSIC_TRUNC:
1799 case TargetOpcode::G_SITOFP:
1801 case TargetOpcode::G_UITOFP:
1803 case TargetOpcode::G_FPTOSI:
1805 case TargetOpcode::G_FPTOUI:
1807 case TargetOpcode::G_FMINNUM:
1808 case TargetOpcode::G_FMAXNUM:
1809 case TargetOpcode::G_FMINNUM_IEEE:
1810 case TargetOpcode::G_FMAXNUM_IEEE:
1812 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1814 case TargetOpcode::G_INSERT_VECTOR_ELT:
1816 case TargetOpcode::G_FSIN:
1817 case TargetOpcode::G_FCOS:
1819 case TargetOpcode::G_GLOBAL_VALUE:
1821 case TargetOpcode::G_LOAD:
1822 case TargetOpcode::G_SEXTLOAD:
1823 case TargetOpcode::G_ZEXTLOAD:
1825 case TargetOpcode::G_FMAD:
1827 case TargetOpcode::G_FDIV:
1829 case TargetOpcode::G_UDIV:
1830 case TargetOpcode::G_UREM:
1831 case TargetOpcode::G_UDIVREM:
1833 case TargetOpcode::G_SDIV:
1834 case TargetOpcode::G_SREM:
1835 case TargetOpcode::G_SDIVREM:
1837 case TargetOpcode::G_ATOMIC_CMPXCHG:
1839 case TargetOpcode::G_FLOG:
1841 case TargetOpcode::G_FLOG10:
1843 case TargetOpcode::G_FEXP:
1845 case TargetOpcode::G_FPOW:
1847 case TargetOpcode::G_FFLOOR:
1849 case TargetOpcode::G_BUILD_VECTOR:
1850 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
1852 case TargetOpcode::G_MUL:
1854 case TargetOpcode::G_CTLZ:
1855 case TargetOpcode::G_CTTZ:
1857 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
  if (ST.hasApertureRegs()) {
            ? AMDGPU::SRC_SHARED_BASE
            : AMDGPU::SRC_PRIVATE_BASE;
    Register Dst = MRI.createGenericVirtualRegister(S64);
    MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
    return B.buildUnmerge(S32, Dst).getReg(1);

    Register LoadAddr = MRI.createGenericVirtualRegister(
        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
    Register KernargPtrReg = MRI.createGenericVirtualRegister(
    B.buildPtrAdd(LoadAddr, KernargPtrReg,
    return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);

  Register QueuePtr = MRI.createGenericVirtualRegister(
  B.buildPtrAdd(LoadAddr, QueuePtr,
                B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
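// Without aperture registers, the shared/private aperture above is read from
// the implicit kernarg segment (or from the queue pointer at a fixed struct
// offset) with a 32-bit load.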
1957 switch (Def->getOpcode()) {
1958 case AMDGPU::G_FRAME_INDEX:
1959 case AMDGPU::G_GLOBAL_VALUE:
1960 case AMDGPU::G_BLOCK_ADDR:
1962 case AMDGPU::G_CONSTANT: {
1963 const ConstantInt *CI = Def->getOperand(1).getCImm();
1982 LLT DstTy =
MRI.getType(Dst);
1983 LLT SrcTy =
MRI.getType(Src);
1994 if (
TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1995 MI.setDesc(
B.getTII().get(TargetOpcode::G_BITCAST));
2004 B.buildExtract(Dst, Src, 0);
2005 MI.eraseFromParent();
2009 unsigned NullVal =
TM.getNullPointerValue(DestAS);
2011 auto SegmentNull =
B.buildConstant(DstTy, NullVal);
2012 auto FlatNull =
B.buildConstant(SrcTy, 0);
2015 auto PtrLo32 =
B.buildExtract(DstTy, Src, 0);
2019 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2021 MI.eraseFromParent();
2033 Register SrcAsInt =
B.buildPtrToInt(S32, Src).getReg(0);
2037 auto BuildPtr =
B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
2040 B.buildCopy(Dst, BuildPtr);
2041 MI.eraseFromParent();
2045 auto SegmentNull =
B.buildConstant(SrcTy,
TM.getNullPointerValue(SrcAS));
2046 auto FlatNull =
B.buildConstant(DstTy,
TM.getNullPointerValue(DestAS));
2049 SegmentNull.getReg(0));
2051 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2053 MI.eraseFromParent();
2060 B.buildExtract(Dst, Src, 0);
2061 MI.eraseFromParent();
2069 auto PtrLo =
B.buildPtrToInt(S32, Src);
2070 auto HighAddr =
B.buildConstant(S32, AddrHiVal);
2071 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2072 MI.eraseFromParent();
2077 MF.
getFunction(),
"invalid addrspacecast",
B.getDebugLoc());
2080 Ctx.
diagnose(InvalidAddrSpaceCast);
2082 MI.eraseFromParent();
  LLT Ty = MRI.getType(Src);

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);

  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();

  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
                     .addUse(Const0.getReg(0))
                     .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  const unsigned FractBits = 52;

  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
  const auto Zero32 = B.buildConstant(S32, 0);

  auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
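// The sequence above expands f64 G_INTRINSIC_TRUNC with integer bit math: the
// exponent is extracted, fraction bits below it are masked off, only the sign
// is kept when the exponent is negative, and the source passes through
// unchanged once the exponent exceeds the 51 usable fraction bits.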
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);

  if (MRI.getType(Dst) == S64) {
    auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                        : B.buildUITOFP(S64, Unmerge.getReg(1));

    auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
    auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
                     .addUse(CvtHi.getReg(0))
                     .addUse(ThirtyTwo.getReg(0));

    B.buildFAdd(Dst, LdExp, CvtLo);
    MI.eraseFromParent();

  auto One = B.buildConstant(S32, 1);

  auto ThirtyOne = B.buildConstant(S32, 31);
  auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
  auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
  auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
  auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
      .addUse(Unmerge.getReg(1));
  auto LS2 = B.buildSub(S32, LS, One);
  ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
  ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
      .addUse(FVal.getReg(0))
      .addUse(Scale.getReg(0));
  MI.eraseFromParent();
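// The 64-bit integer-to-FP path above splits the input into two 32-bit
// conversions: the high half is scaled by 2^32 via amdgcn_ldexp and added to
// the converted low half; the 32-bit result path normalizes the value first so
// a single conversion plus a final ldexp rescale suffices.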
  const LLT SrcLT = MRI.getType(Src);
  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);

  if (Signed && SrcLT == S32) {
    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);

    K0 = B.buildFConstant(
        S64, llvm::bit_cast<double>(UINT64_C(0x3df0000000000000)));
    K1 = B.buildFConstant(
        S64, llvm::bit_cast<double>(UINT64_C(0xc1f0000000000000)));

    K0 = B.buildFConstant(
        S32, llvm::bit_cast<float>(UINT32_C(0x2f800000)));
    K1 = B.buildFConstant(
        S32, llvm::bit_cast<float>(UINT32_C(0xcf800000)));

  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);

  auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
                                     : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

  if (Signed && SrcLT == S32) {
    Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
    B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),

    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
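// FP-to-64-bit-integer above: K0 = 2^-32 and K1 = -2^32 split the truncated
// value into 32-bit digits (Hi = floor(x * 2^-32), Lo = fma(Hi, -2^32, x)),
// which are merged into the S64 result; the signed f32 path applies the
// extracted sign afterwards.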
  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  std::optional<ValueAndVReg> MaybeIdxVal =
  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

  LLT VecTy = MRI.getType(Vec);

  auto Unmerge = B.buildUnmerge(EltTy, Vec);
  B.buildCopy(Dst, Unmerge.getReg(IdxVal));
  MI.eraseFromParent();

  std::optional<ValueAndVReg> MaybeIdxVal =
  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

  LLT VecTy = MRI.getType(Vec);

  if (IdxVal < NumElts) {
    for (unsigned i = 0; i < NumElts; ++i)
      SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
    B.buildUnmerge(SrcRegs, Vec);

    SrcRegs[IdxVal] = MI.getOperand(2).getReg();
    B.buildMergeLikeInstr(Dst, SrcRegs);
  MI.eraseFromParent();

  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
  TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
                .addUse(MulVal.getReg(0))
                .setMIFlags(Flags).getReg(0);
  TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
      Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  MI.eraseFromParent();
2480 unsigned GAFlags)
const {
2481 assert(isInt<32>(
Offset + 4) &&
"32-bit offset is expected!");
2518 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2529 if (!
B.getMRI()->getRegClassOrNull(PCReg))
2530 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2533 B.buildExtract(DstReg, PCReg, 0);
2541 LLT Ty =
MRI.getType(DstReg);
2553 Fn,
"local memory global used by non-kernel function",
MI.getDebugLoc(),
2563 B.buildUndef(DstReg);
2564 MI.eraseFromParent();
2584 if (
B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2589 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32},
false);
2590 B.buildIntToPtr(DstReg, Sz);
2591 MI.eraseFromParent();
2597 *cast<GlobalVariable>(GV)));
2598 MI.eraseFromParent();
2606 MI.eraseFromParent();
2612 MI.eraseFromParent();
2617 Register GOTAddr =
MRI.createGenericVirtualRegister(PtrTy);
2630 auto Load =
B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2631 B.buildExtract(DstReg, Load, 0);
2633 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2635 MI.eraseFromParent();
2653 LLT PtrTy =
MRI.getType(PtrReg);
2658 auto Cast =
B.buildAddrSpaceCast(ConstPtr, PtrReg);
2660 MI.getOperand(1).setReg(Cast.getReg(0));
2665 if (
MI.getOpcode() != AMDGPU::G_LOAD)
2669 LLT ValTy =
MRI.getType(ValReg);
2684 if (WideMemSize == ValSize) {
2690 MI.setMemRefs(MF, {WideMMO});
2696 if (ValSize > WideMemSize)
2703 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2704 B.buildTrunc(ValReg, WideLoad).getReg(0);
2711 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2712 B.buildExtract(ValReg, WideLoad, 0);
2716 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2717 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2721 MI.eraseFromParent();
2731 LLT Ty =
MRI.getType(
MI.getOperand(0).getReg());
2758 "this should not have been custom lowered");
2760 LLT ValTy =
MRI.getType(CmpVal);
2763 Register PackedVal =
B.buildBuildVector(VecTy, { NewVal, CmpVal }).
getReg(0);
2765 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2769 .setMemRefs(
MI.memoperands());
2771 MI.eraseFromParent();
2779 LLT Ty =
B.getMRI()->getType(Dst);
2780 unsigned Flags =
MI.getFlags();
2782 auto Log2Operand =
B.buildFLog2(Ty, Src,
Flags);
2783 auto Log2BaseInvertedOperand =
B.buildFConstant(Ty, Log2BaseInverted);
2785 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand,
Flags);
2786 MI.eraseFromParent();
2794 unsigned Flags =
MI.getFlags();
2795 LLT Ty =
B.getMRI()->getType(Dst);
2798 auto Mul =
B.buildFMul(Ty, Src, K,
Flags);
2800 MI.eraseFromParent();
2809 unsigned Flags =
MI.getFlags();
2810 LLT Ty =
B.getMRI()->getType(Dst);
2815 auto Log =
B.buildFLog2(S32, Src0,
Flags);
2816 auto Mul =
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32},
false)
2817 .addUse(Log.getReg(0))
2821 }
else if (Ty == S16) {
2823 auto Log =
B.buildFLog2(S16, Src0,
Flags);
2824 auto Ext0 =
B.buildFPExt(S32, Log,
Flags);
2825 auto Ext1 =
B.buildFPExt(S32, Src1,
Flags);
2826 auto Mul =
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32},
false)
2827 .addUse(Ext0.getReg(0))
2828 .addUse(Ext1.getReg(0))
2831 B.buildFExp2(Dst,
B.buildFPTrunc(S16,
Mul),
Flags);
2835 MI.eraseFromParent();
2843 ModSrc = SrcFNeg->getOperand(1).getReg();
2845 ModSrc = SrcFAbs->getOperand(1).getReg();
2847 ModSrc = SrcFAbs->getOperand(1).getReg();
2858 Register OrigSrc =
MI.getOperand(1).getReg();
2859 unsigned Flags =
MI.getFlags();
2861 "this should not have been custom lowered");
2871 auto Fract =
B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64},
false)
2883 B.buildFConstant(S64, llvm::bit_cast<double>(0x3fefffffffffffff));
2885 Register Min =
MRI.createGenericVirtualRegister(S64);
2891 B.buildFMinNumIEEE(Min, Fract, Const,
Flags);
2893 B.buildFMinNum(Min, Fract, Const,
Flags);
2898 CorrectedFract =
B.buildSelect(S64, IsNan, ModSrc, Min,
Flags).getReg(0);
2901 auto NegFract =
B.buildFNeg(S64, CorrectedFract,
Flags);
2902 B.buildFAdd(Dst, OrigSrc, NegFract,
Flags);
2904 MI.eraseFromParent();
2920 if (
MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
2922 Src0 =
B.buildTrunc(S16,
MI.getOperand(1).getReg()).getReg(0);
2923 Src1 =
B.buildTrunc(S16,
MI.getOperand(2).getReg()).getReg(0);
2926 auto Merge =
B.buildMergeLikeInstr(S32, {Src0, Src1});
2927 B.buildBitcast(Dst,
Merge);
2929 MI.eraseFromParent();
2946 bool UsePartialMad64_32,
2947 bool SeparateOddAlignedProducts)
const {
2962 auto getZero32 = [&]() ->
Register {
2964 Zero32 =
B.buildConstant(S32, 0).getReg(0);
2967 auto getZero64 = [&]() ->
Register {
2969 Zero64 =
B.buildConstant(S64, 0).getReg(0);
2974 for (
unsigned i = 0; i < Src0.
size(); ++i) {
2985 if (CarryIn.empty())
2988 bool HaveCarryOut =
true;
2990 if (CarryIn.size() == 1) {
2992 LocalAccum =
B.buildZExt(S32, CarryIn[0]).getReg(0);
2996 CarryAccum = getZero32();
2998 CarryAccum =
B.buildZExt(S32, CarryIn[0]).getReg(0);
2999 for (
unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3001 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
3006 LocalAccum = getZero32();
3007 HaveCarryOut =
false;
3012 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
3013 LocalAccum =
Add.getReg(0);
3027 auto buildMadChain =
3030 assert((DstIndex + 1 < Accum.
size() && LocalAccum.size() == 2) ||
3031 (DstIndex + 1 >= Accum.
size() && LocalAccum.size() == 1));
3038 if (LocalAccum.size() == 1 &&
3039 (!UsePartialMad64_32 || !CarryIn.empty())) {
3042 unsigned j1 = DstIndex - j0;
3043 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3047 auto Mul =
B.buildMul(S32, Src0[j0], Src1[j1]);
3049 LocalAccum[0] =
Mul.getReg(0);
3051 if (CarryIn.empty()) {
3052 LocalAccum[0] =
B.buildAdd(S32, LocalAccum[0],
Mul).getReg(0);
3055 B.buildUAdde(S32, S1, LocalAccum[0],
Mul, CarryIn.back())
3061 }
while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3065 if (j0 <= DstIndex) {
3066 bool HaveSmallAccum =
false;
3069 if (LocalAccum[0]) {
3070 if (LocalAccum.size() == 1) {
3071 Tmp =
B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
3072 HaveSmallAccum =
true;
3073 }
else if (LocalAccum[1]) {
3074 Tmp =
B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
3075 HaveSmallAccum =
false;
3077 Tmp =
B.buildZExt(S64, LocalAccum[0]).getReg(0);
3078 HaveSmallAccum =
true;
3081 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
3083 HaveSmallAccum =
true;
3087 unsigned j1 = DstIndex - j0;
3088 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3092 auto Mad =
B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
3093 {Src0[j0], Src1[j1], Tmp});
3094 Tmp = Mad.getReg(0);
3095 if (!HaveSmallAccum)
3096 CarryOut.push_back(Mad.getReg(1));
3097 HaveSmallAccum =
false;
3100 }
while (j0 <= DstIndex);
3102 auto Unmerge =
B.buildUnmerge(S32, Tmp);
3103 LocalAccum[0] = Unmerge.getReg(0);
3104 if (LocalAccum.size() > 1)
3105 LocalAccum[1] = Unmerge.getReg(1);
3132 for (
unsigned i = 0; i <= Accum.
size() / 2; ++i) {
3133 Carry OddCarryIn = std::move(OddCarry);
3134 Carry EvenCarryIn = std::move(EvenCarry);
3139 if (2 * i < Accum.
size()) {
3140 auto LocalAccum = Accum.
drop_front(2 * i).take_front(2);
3141 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
3146 if (!SeparateOddAlignedProducts) {
3147 auto LocalAccum = Accum.
drop_front(2 * i - 1).take_front(2);
3148 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3150 bool IsHighest = 2 * i >= Accum.
size();
3154 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3160 Lo =
B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
3162 Lo =
B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
3164 Lo =
B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
3167 Accum[2 * i - 1] =
Lo->getOperand(0).getReg();
3170 auto Hi =
B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
3171 Lo->getOperand(1).getReg());
3172 Accum[2 * i] =
Hi.getReg(0);
3173 SeparateOddCarry =
Hi.getReg(1);
3180 if (
Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
3181 EvenCarryIn.push_back(CarryOut);
3183 if (2 * i < Accum.
size()) {
3184 if (
Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
3185 OddCarry.push_back(CarryOut);
3198 assert(
MI.getOpcode() == TargetOpcode::G_MUL);
3207 LLT Ty =
MRI.getType(DstReg);
3211 unsigned NumParts =
Size / 32;
3227 for (
unsigned i = 0; i < NumParts; ++i) {
3228 Src0Parts.
push_back(
MRI.createGenericVirtualRegister(S32));
3229 Src1Parts.
push_back(
MRI.createGenericVirtualRegister(S32));
3231 B.buildUnmerge(Src0Parts, Src0);
3232 B.buildUnmerge(Src1Parts, Src1);
3235 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
3236 SeparateOddAlignedProducts);
3238 B.buildMergeLikeInstr(DstReg, AccumRegs);
3239 MI.eraseFromParent();
3251 LLT DstTy =
MRI.getType(Dst);
3252 LLT SrcTy =
MRI.getType(Src);
3254 unsigned NewOpc =
MI.getOpcode() == AMDGPU::G_CTLZ
3255 ? AMDGPU::G_AMDGPU_FFBH_U32
3256 : AMDGPU::G_AMDGPU_FFBL_B32;
3257 auto Tmp =
B.buildInstr(NewOpc, {DstTy}, {Src});
3260 MI.eraseFromParent();
3266 if (
MI.getOpcode() != TargetOpcode::G_XOR)
3269 return ConstVal && *ConstVal == -1;
3276 Register CondDef =
MI.getOperand(0).getReg();
3277 if (!
MRI.hasOneNonDBGUse(CondDef))
3285 if (!
MRI.hasOneNonDBGUse(NegatedCond))
3291 UseMI = &*
MRI.use_instr_nodbg_begin(NegatedCond);
3300 if (Next == Parent->
end()) {
3304 UncondBrTarget = &*NextMBB;
3306 if (Next->getOpcode() != AMDGPU::G_BR)
3324 *ArgRC,
B.getDebugLoc(), ArgTy);
3325 if (
Arg->isMasked()) {
3328 const unsigned Mask =
Arg->getMask();
3329 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
3336 auto ShiftAmt =
B.buildConstant(S32, Shift);
3337 AndMaskSrc =
B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
3340 B.buildAnd(DstReg, AndMaskSrc,
B.buildConstant(S32, Mask >> Shift));
3342 B.buildCopy(DstReg, LiveIn);
3361 B.buildConstant(DstReg, 0);
3367 B.buildUndef(DstReg);
3371 if (!
Arg->isRegister() || !
Arg->getRegister().isValid())
3382 MI.eraseFromParent();
3388 B.buildConstant(
MI.getOperand(0).getReg(),
C);
3389 MI.eraseFromParent();
3410 B.buildUndef(DstReg);
3411 MI.eraseFromParent();
3415 if (
Arg->isMasked()) {
3429 MI.eraseFromParent();
3436 Register KernArgReg =
B.getMRI()->createGenericVirtualRegister(PtrTy);
3446 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
3454 Align Alignment)
const {
3458 "unexpected kernarg parameter type");
3462 B.buildLoad(DstReg,
Ptr, PtrInfo,
Align(4),
3465 MI.eraseFromParent();
  LLT DstTy = MRI.getType(Dst);

  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  auto One = B.buildConstant(S32, 1);
  Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);

    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);

    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
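// The 32-bit unsigned div/rem expansion above uses the float reciprocal trick:
// Z ~ 2^32/Y from RCP_IFLAG (scaled by 0x4f7ffffe), one umulh-based correction
// of the estimate, then conditional +1 / -Y fix-ups of quotient and remainder.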
3547 auto Unmerge =
B.buildUnmerge(S32, Val);
3549 auto CvtLo =
B.buildUITOFP(S32, Unmerge.getReg(0));
3550 auto CvtHi =
B.buildUITOFP(S32, Unmerge.getReg(1));
3552 auto Mad =
B.buildFMAD(
3554 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
3556 auto Rcp =
B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3557 auto Mul1 =
B.buildFMul(
3558 S32, Rcp,
B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
3561 auto Mul2 =
B.buildFMul(
3562 S32, Mul1,
B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
3563 auto Trunc =
B.buildIntrinsicTrunc(S32, Mul2);
3566 auto Mad2 =
B.buildFMAD(
3567 S32, Trunc,
B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
3570 auto ResultLo =
B.buildFPTOUI(S32, Mad2);
3571 auto ResultHi =
B.buildFPTOUI(S32, Trunc);
3573 return {ResultLo.getReg(0), ResultHi.getReg(0)};
3588 auto Rcp =
B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
3590 auto Zero64 =
B.buildConstant(S64, 0);
3591 auto NegDenom =
B.buildSub(S64, Zero64, Denom);
3593 auto MulLo1 =
B.buildMul(S64, NegDenom, Rcp);
3594 auto MulHi1 =
B.buildUMulH(S64, Rcp, MulLo1);
3596 auto UnmergeMulHi1 =
B.buildUnmerge(S32, MulHi1);
3597 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3598 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3600 auto Add1_Lo =
B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3601 auto Add1_Hi =
B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3602 auto Add1 =
B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
3604 auto MulLo2 =
B.buildMul(S64, NegDenom, Add1);
3605 auto MulHi2 =
B.buildUMulH(S64, Add1, MulLo2);
3606 auto UnmergeMulHi2 =
B.buildUnmerge(S32, MulHi2);
3607 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3608 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3610 auto Zero32 =
B.buildConstant(S32, 0);
3611 auto Add2_Lo =
B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3612 auto Add2_Hi =
B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3613 auto Add2 =
B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
3615 auto UnmergeNumer =
B.buildUnmerge(S32, Numer);
3616 Register NumerLo = UnmergeNumer.getReg(0);
3617 Register NumerHi = UnmergeNumer.getReg(1);
3619 auto MulHi3 =
B.buildUMulH(S64, Numer, Add2);
3620 auto Mul3 =
B.buildMul(S64, Denom, MulHi3);
3621 auto UnmergeMul3 =
B.buildUnmerge(S32, Mul3);
3622 Register Mul3_Lo = UnmergeMul3.getReg(0);
3623 Register Mul3_Hi = UnmergeMul3.getReg(1);
3624 auto Sub1_Lo =
B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3625 auto Sub1_Hi =
B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3626 auto Sub1_Mi =
B.buildSub(S32, NumerHi, Mul3_Hi);
3627 auto Sub1 =
B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
3629 auto UnmergeDenom =
B.buildUnmerge(S32, Denom);
3630 Register DenomLo = UnmergeDenom.getReg(0);
3631 Register DenomHi = UnmergeDenom.getReg(1);
3634 auto C1 =
B.buildSExt(S32, CmpHi);
3637 auto C2 =
B.buildSExt(S32, CmpLo);
3640 auto C3 =
B.buildSelect(S32, CmpEq, C2, C1);
3647 auto Sub2_Lo =
B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3648 auto Sub2_Mi =
B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3649 auto Sub2_Hi =
B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3650 auto Sub2 =
B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
3652 auto One64 =
B.buildConstant(S64, 1);
3653 auto Add3 =
B.buildAdd(S64, MulHi3, One64);
3659 auto C6 =
B.buildSelect(
3663 auto Add4 =
B.buildAdd(S64, Add3, One64);
3664 auto Sub3_Lo =
B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3666 auto Sub3_Mi =
B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3667 auto Sub3_Hi =
B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3668 auto Sub3 =
B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
3674 auto Sel1 =
B.buildSelect(
3681 auto Sel2 =
B.buildSelect(
3692 switch (
MI.getOpcode()) {
3695 case AMDGPU::G_UDIV: {
3696 DstDivReg =
MI.getOperand(0).getReg();
3699 case AMDGPU::G_UREM: {
3700 DstRemReg =
MI.getOperand(0).getReg();
3703 case AMDGPU::G_UDIVREM: {
3704 DstDivReg =
MI.getOperand(0).getReg();
3705 DstRemReg =
MI.getOperand(1).getReg();
3712 const unsigned FirstSrcOpIdx =
MI.getNumExplicitDefs();
3713 Register Num =
MI.getOperand(FirstSrcOpIdx).getReg();
3714 Register Den =
MI.getOperand(FirstSrcOpIdx + 1).getReg();
3715 LLT Ty =
MRI.getType(
MI.getOperand(0).getReg());
3724 MI.eraseFromParent();
3734 LLT Ty =
MRI.getType(
MI.getOperand(0).getReg());
3735 if (Ty != S32 && Ty != S64)
3738 const unsigned FirstSrcOpIdx =
MI.getNumExplicitDefs();
3742 auto SignBitOffset =
B.buildConstant(S32, Ty.
getSizeInBits() - 1);
3743 auto LHSign =
B.buildAShr(Ty,
LHS, SignBitOffset);
3744 auto RHSign =
B.buildAShr(Ty,
RHS, SignBitOffset);
3746 LHS =
B.buildAdd(Ty,
LHS, LHSign).getReg(0);
3747 RHS =
B.buildAdd(Ty,
RHS, RHSign).getReg(0);
3749 LHS =
B.buildXor(Ty,
LHS, LHSign).getReg(0);
3750 RHS =
B.buildXor(Ty,
RHS, RHSign).getReg(0);
3752 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3753 switch (
MI.getOpcode()) {
3756 case AMDGPU::G_SDIV: {
3757 DstDivReg =
MI.getOperand(0).getReg();
3758 TmpDivReg =
MRI.createGenericVirtualRegister(Ty);
3761 case AMDGPU::G_SREM: {
3762 DstRemReg =
MI.getOperand(0).getReg();
3763 TmpRemReg =
MRI.createGenericVirtualRegister(Ty);
3766 case AMDGPU::G_SDIVREM: {
3767 DstDivReg =
MI.getOperand(0).getReg();
3768 DstRemReg =
MI.getOperand(1).getReg();
3769 TmpDivReg =
MRI.createGenericVirtualRegister(Ty);
3770 TmpRemReg =
MRI.createGenericVirtualRegister(Ty);
3781 auto Sign =
B.buildXor(Ty, LHSign, RHSign).getReg(0);
3782 auto SignXor =
B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3783 B.buildSub(DstDivReg, SignXor, Sign);
3787 auto Sign = LHSign.getReg(0);
3788 auto SignXor =
B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3789 B.buildSub(DstRemReg, SignXor, Sign);
3792 MI.eraseFromParent();
3803 LLT ResTy =
MRI.getType(Res);
3809 if (!AllowInaccurateRcp)
3814 if (CLHS->isExactlyValue(1.0)) {
3815 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res,
false)
3819 MI.eraseFromParent();
3824 if (CLHS->isExactlyValue(-1.0)) {
3825 auto FNeg =
B.buildFNeg(ResTy,
RHS,
Flags);
3826 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res,
false)
3827 .addUse(FNeg.getReg(0))
3830 MI.eraseFromParent();
3836 auto RCP =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy},
false)
3841 MI.eraseFromParent();
3852 LLT ResTy =
MRI.getType(Res);
3858 if (!AllowInaccurateRcp)
3861 auto NegY =
B.buildFNeg(ResTy,
Y);
3862 auto One =
B.buildFConstant(ResTy, 1.0);
3864 auto R =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy},
false)
3868 auto Tmp0 =
B.buildFMA(ResTy, NegY, R, One);
3869 R =
B.buildFMA(ResTy, Tmp0, R, R);
3871 auto Tmp1 =
B.buildFMA(ResTy, NegY, R, One);
3872 R =
B.buildFMA(ResTy, Tmp1, R, R);
3874 auto Ret =
B.buildFMul(ResTy,
X, R);
3875 auto Tmp2 =
B.buildFMA(ResTy, NegY, Ret,
X);
3877 B.buildFMA(Res, Tmp2, R, Ret);
3878 MI.eraseFromParent();
3897 auto LHSExt =
B.buildFPExt(S32,
LHS,
Flags);
3898 auto RHSExt =
B.buildFPExt(S32,
RHS,
Flags);
3900 auto RCP =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32},
false)
3901 .addUse(RHSExt.getReg(0))
3904 auto QUOT =
B.buildFMul(S32, LHSExt, RCP,
Flags);
3905 auto RDst =
B.buildFPTrunc(S16, QUOT,
Flags);
3907 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res,
false)
3908 .addUse(RDst.getReg(0))
3913 MI.eraseFromParent();
3923 unsigned SPDenormMode =
3926 if (ST.hasDenormModeInst()) {
3928 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3930 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3931 B.buildInstr(AMDGPU::S_DENORM_MODE)
3932 .addImm(NewDenormModeValue);
3940 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3941 .addImm(SPDenormMode)
3942 .addImm(SPDenormModeBitField);
3963 auto One =
B.buildFConstant(S32, 1.0f);
3965 auto DenominatorScaled =
3966 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1},
false)
3971 auto NumeratorScaled =
3972 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1},
false)
3978 auto ApproxRcp =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32},
false)
3979 .addUse(DenominatorScaled.getReg(0))
3981 auto NegDivScale0 =
B.buildFNeg(S32, DenominatorScaled,
Flags);
3985 if (!Mode.allFP32Denormals())
3988 auto Fma0 =
B.buildFMA(S32, NegDivScale0, ApproxRcp, One,
Flags);
3989 auto Fma1 =
B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp,
Flags);
3990 auto Mul =
B.buildFMul(S32, NumeratorScaled, Fma1,
Flags);
3991 auto Fma2 =
B.buildFMA(S32, NegDivScale0,
Mul, NumeratorScaled,
Flags);
3992 auto Fma3 =
B.buildFMA(S32, Fma2, Fma1,
Mul,
Flags);
3993 auto Fma4 =
B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled,
Flags);
3995 if (!Mode.allFP32Denormals())
3998 auto Fmas =
B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32},
false)
3999 .addUse(Fma4.getReg(0))
4000 .addUse(Fma1.getReg(0))
4001 .addUse(Fma3.getReg(0))
4002 .addUse(NumeratorScaled.getReg(1))
4005 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res,
false)
4006 .addUse(Fmas.getReg(0))
4011 MI.eraseFromParent();
4030 auto One =
B.buildFConstant(S64, 1.0);
4032 auto DivScale0 =
B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1},
false)
4038 auto NegDivScale0 =
B.buildFNeg(S64, DivScale0.getReg(0),
Flags);
4040 auto Rcp =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64},
false)
4041 .addUse(DivScale0.getReg(0))
4044 auto Fma0 =
B.buildFMA(S64, NegDivScale0, Rcp, One,
Flags);
4045 auto Fma1 =
B.buildFMA(S64, Rcp, Fma0, Rcp,
Flags);
4046 auto Fma2 =
B.buildFMA(S64, NegDivScale0, Fma1, One,
Flags);
4048 auto DivScale1 =
B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1},
false)
4054 auto Fma3 =
B.buildFMA(S64, Fma1, Fma2, Fma1,
Flags);
4055 auto Mul =
B.buildFMul(S64, DivScale1.getReg(0), Fma3,
Flags);
4056 auto Fma4 =
B.buildFMA(S64, NegDivScale0,
Mul, DivScale1.getReg(0),
Flags);
4065 auto NumUnmerge =
B.buildUnmerge(S32,
LHS);
4066 auto DenUnmerge =
B.buildUnmerge(S32,
RHS);
4067 auto Scale0Unmerge =
B.buildUnmerge(S32, DivScale0);
4068 auto Scale1Unmerge =
B.buildUnmerge(S32, DivScale1);
4071 Scale1Unmerge.getReg(1));
4073 Scale0Unmerge.getReg(1));
4074 Scale =
B.buildXor(S1, CmpNum, CmpDen).getReg(0);
4076 Scale = DivScale1.getReg(1);
4079 auto Fmas =
B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64},
false)
4080 .addUse(Fma4.getReg(0))
4081 .addUse(Fma3.getReg(0))
4082 .addUse(
Mul.getReg(0))
4086 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup,
ArrayRef(Res),
false)
4087 .addUse(Fmas.getReg(0))
4092 MI.eraseFromParent();
4107 auto Abs =
B.buildFAbs(S32,
RHS,
Flags);
4110 auto C0 =
B.buildConstant(S32, 0x6f800000);
4111 auto C1 =
B.buildConstant(S32, 0x2f800000);
4112 auto C2 =
B.buildConstant(S32, llvm::bit_cast<uint32_t>(1.0f));
4115 auto Sel =
B.buildSelect(S32, CmpRes, C1, C2,
Flags);
4117 auto Mul0 =
B.buildFMul(S32,
RHS, Sel,
Flags);
4119 auto RCP =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32},
false)
4120 .addUse(Mul0.getReg(0))
4123 auto Mul1 =
B.buildFMul(S32,
LHS, RCP,
Flags);
4125 B.buildFMul(Res, Sel, Mul1,
Flags);
4127 MI.eraseFromParent();
4147 LLT Ty =
MRI.getType(Dst);
4157 auto Rsq =
B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty},
false)
4167 auto ClampMax = UseIEEE ?
B.buildFMinNumIEEE(Ty, Rsq, MaxFlt,
Flags) :
4168 B.buildFMinNum(Ty, Rsq, MaxFlt,
Flags);
4173 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt,
Flags);
4175 B.buildFMaxNum(Dst, ClampMax, MinFlt,
Flags);
4176 MI.eraseFromParent();
4182 case Intrinsic::amdgcn_ds_fadd:
4183 return AMDGPU::G_ATOMICRMW_FADD;
4184 case Intrinsic::amdgcn_ds_fmin:
4185 return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
4186 case Intrinsic::amdgcn_ds_fmax:
4187 return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
4203 for (
int I = 6;
I > 3; --
I)
4204 MI.removeOperand(
I);
4206 MI.removeOperand(1);
4217 LLT DstTy =
MRI.getType(DstReg);
4220 Register KernargPtrReg =
MRI.createGenericVirtualRegister(DstTy);
4226 B.buildPtrAdd(DstReg, KernargPtrReg,
B.buildConstant(IdxTy,
Offset).getReg(0));
4243 MI.eraseFromParent();
4251 std::optional<uint32_t> KnownSize =
4253 if (KnownSize.has_value())
4254 B.buildConstant(DstReg, *KnownSize);
4272 MI.eraseFromParent();
4279 unsigned AddrSpace)
const {
4281 auto Unmerge =
B.buildUnmerge(
LLT::scalar(32),
MI.getOperand(2).getReg());
4285 MI.eraseFromParent();
std::pair<Register, unsigned>
  std::tie(BaseReg, ImmOffset) =

  if (MRI.getType(BaseReg).isPointer())
    BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);

  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;

  if (Overflow != 0) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);

    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::pair(BaseReg, ImmOffset);
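// splitBufferOffsets returns {base register, immediate offset}: the constant
// part that fits the buffer instruction's immediate field stays in ImmOffset,
// and any overflow is materialized back into (or added to) the base register.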
4344 unsigned ImmOffset,
Register VIndex,
4346 std::optional<ValueAndVReg> MaybeVOffsetVal =
4348 std::optional<ValueAndVReg> MaybeSOffsetVal =
4350 std::optional<ValueAndVReg> MaybeVIndexVal =
4355 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
4356 MaybeVIndexVal->Value == 0) {
4357 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
4358 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
4370 bool ImageStore)
const {
4373 LLT StoreVT =
MRI.getType(Reg);
4377 auto Unmerge =
B.buildUnmerge(S16, Reg);
4380 for (
int I = 0,
E = Unmerge->getNumOperands() - 1;
I !=
E; ++
I)
4381 WideRegs.
push_back(
B.buildAnyExt(S32, Unmerge.getReg(
I)).getReg(0));
4392 Reg =
B.buildBitcast(S32, Reg).getReg(0);
4394 PackedRegs.
resize(2,
B.buildUndef(S32).getReg(0));
4401 auto Unmerge =
B.buildUnmerge(S16, Reg);
4402 for (
int I = 0,
E = Unmerge->getNumOperands() - 1;
I !=
E; ++
I)
4404 PackedRegs.
resize(6,
B.buildUndef(S16).getReg(0));
4412 auto Unmerge =
B.buildUnmerge(S32, Reg);
4413 for (
int I = 0,
E = Unmerge->getNumOperands() - 1;
I !=
E; ++
I)
4415 PackedRegs.
resize(4,
B.buildUndef(S32).getReg(0));
4433 LLT Ty =
MRI->getType(VData);
4457 bool IsFormat)
const {
4459 LLT Ty =
MRI.getType(VData);
4461 const bool IsD16 = IsFormat && (EltTy.
getSizeInBits() == 16);
4468 const int MemSize = MMO->
getSize();
4473 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4476 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps;
4480 VIndex =
MI.getOperand(3).getReg();
4483 VIndex =
B.buildConstant(S32, 0).getReg(0);
4486 Register VOffset =
MI.getOperand(3 + OpOffset).getReg();
4487 Register SOffset =
MI.getOperand(4 + OpOffset).getReg();
4491 Format =
MI.getOperand(5 + OpOffset).getImm();
4495 unsigned AuxiliaryData =
MI.getOperand(5 + OpOffset).getImm();
4502 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
4503 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
4504 }
else if (IsFormat) {
4505 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
4506 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
4510 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
4513 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
4516 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
4521 auto MIB =
B.buildInstr(Opc)
4532 MIB.addImm(AuxiliaryData)
4533 .addImm(HasVIndex ? -1 : 0)
4534 .addMemOperand(MMO);
4536 MI.eraseFromParent();
4542 unsigned ImmOffset,
unsigned Format,
4545 auto MIB =
B.buildInstr(Opc)
4556 MIB.addImm(AuxiliaryData)
4557 .addImm(HasVIndex ? -1 : 0)
4558 .addMemOperand(MMO);
4565 bool IsTyped)
const {
4575 assert(
MI.getNumExplicitDefs() == 1 ||
MI.getNumExplicitDefs() == 2);
4576 bool IsTFE =
MI.getNumExplicitDefs() == 2;
4578 StatusDst =
MI.getOperand(1).getReg();
4582 Register RSrc =
MI.getOperand(2 + OpOffset).getReg();
4585 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4588 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps + OpOffset;
4591 VIndex =
MI.getOperand(3 + OpOffset).getReg();
4594 VIndex =
B.buildConstant(S32, 0).getReg(0);
4597 Register VOffset =
MI.getOperand(3 + OpOffset).getReg();
4598 Register SOffset =
MI.getOperand(4 + OpOffset).getReg();
4602 Format =
MI.getOperand(5 + OpOffset).getImm();
4606 unsigned AuxiliaryData =
MI.getOperand(5 + OpOffset).getImm();
4609 LLT Ty =
MRI.getType(Dst);
4611 const bool IsD16 = IsFormat && (EltTy.
getSizeInBits() == 16);
4623 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4624 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4625 }
else if (IsFormat) {
4629 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
4631 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
4632 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4639 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4642 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4645 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4652 unsigned NumLoadDWords = NumValueDWords + 1;
4654 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(LoadTy);
4655 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
4656 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
4657 if (NumValueDWords == 1) {
4658 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
4661 for (
unsigned I = 0;
I != NumValueDWords; ++
I)
4662 LoadElts.
push_back(
B.getMRI()->createGenericVirtualRegister(S32));
4664 B.buildUnmerge(LoadElts, LoadDstReg);
4666 B.buildMergeLikeInstr(Dst, LoadElts);
4670 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(S32);
4671 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
4672 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
4673 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
4674 B.buildTrunc(Dst, LoadDstReg);
4675 }
else if (Unpacked && IsD16 && Ty.
isVector()) {
4677 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4678 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
4679 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
4680 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
4682 auto Unmerge =
B.buildUnmerge(S32, LoadDstReg);
4684 for (
unsigned I = 0,
N = Unmerge->getNumOperands() - 1;
I !=
N; ++
I)
4685 Repack.
push_back(
B.buildTrunc(EltTy, Unmerge.getReg(
I)).getReg(0));
4686 B.buildMergeLikeInstr(Dst, Repack);
4689 AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
4692 MI.eraseFromParent();
4699 unsigned Opc = IsInc ? AMDGPU::G_ATOMICRMW_UINC_WRAP :
4700 AMDGPU::G_ATOMICRMW_UDEC_WRAP;
4702 .addDef(
MI.getOperand(0).getReg())
4703 .addUse(
MI.getOperand(2).getReg())
4704 .addUse(
MI.getOperand(3).getReg())
4706 MI.eraseFromParent();
4712 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4713 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4714 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4715 case Intrinsic::amdgcn_raw_buffer_atomic_add:
4716 case Intrinsic::amdgcn_struct_buffer_atomic_add:
4717 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4718 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4719 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4720 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4721 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4722 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4723 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4724 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4725 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4726 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4727 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4728 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4729 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4730 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4731 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4732 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4733 case Intrinsic::amdgcn_raw_buffer_atomic_and:
4734 case Intrinsic::amdgcn_struct_buffer_atomic_and:
4735 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4736 case Intrinsic::amdgcn_raw_buffer_atomic_or:
4737 case Intrinsic::amdgcn_struct_buffer_atomic_or:
4738 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4739 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4740 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4741 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4742 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4743 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4744 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4745 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4746 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4747 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4748 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4749 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4750 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4751 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4752 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4753 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4754 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4755 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4756 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4757 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4758 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4759 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4768 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4769 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4770 const bool HasReturn =
MI.getNumExplicitDefs() != 0;
4777 Dst =
MI.getOperand(0).getReg();
4782 Register VData =
MI.getOperand(2 + OpOffset).getReg();
4786 CmpVal =
MI.getOperand(3 + OpOffset).getReg();
4790 Register RSrc =
MI.getOperand(3 + OpOffset).getReg();
4791 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4794 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps;
4797 VIndex =
MI.getOperand(4 + OpOffset).getReg();
4803 Register VOffset =
MI.getOperand(4 + OpOffset).getReg();
4804 Register SOffset =
MI.getOperand(5 + OpOffset).getReg();
4805 unsigned AuxiliaryData =
MI.getOperand(6 + OpOffset).getImm();
4828 .addImm(AuxiliaryData)
4829 .addImm(HasVIndex ? -1 : 0)
4830 .addMemOperand(MMO);
4832 MI.eraseFromParent();
4842 bool IsA16,
bool IsG16) {
4845 auto EndIdx =
Intr->VAddrEnd;
4847 for (
unsigned I =
Intr->VAddrStart;
I < EndIdx;
I++) {
4854 if ((I < Intr->GradientStart) ||
4855 (
I >=
Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4856 (
I >=
Intr->CoordStart && !IsA16)) {
4857 if ((I < Intr->GradientStart) && IsA16 &&
4858 (
B.getMRI()->getType(AddrReg) == S16)) {
4859 assert(
I ==
Intr->BiasIndex &&
"Got unexpected 16-bit extra argument");
4863 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4867 "Bias needs to be converted to 16 bit in A16 mode");
4869 AddrReg =
B.buildBitcast(V2S16, AddrReg).getReg(0);
4875 if (((
I + 1) >= EndIdx) ||
4876 ((
Intr->NumGradients / 2) % 2 == 1 &&
4877 (
I ==
static_cast<unsigned>(
Intr->GradientStart +
4878 (
Intr->NumGradients / 2) - 1) ||
4879 I ==
static_cast<unsigned>(
Intr->GradientStart +
4880 Intr->NumGradients - 1))) ||
4882 !
MI.getOperand(ArgOffset +
I + 1).isReg()) {