28#include "llvm/IR/IntrinsicsAMDGPU.h"
29#include "llvm/IR/IntrinsicsR600.h"
31#define DEBUG_TYPE "amdgpu-legalinfo"
34using namespace LegalizeActions;
35using namespace LegalizeMutations;
36using namespace LegalityPredicates;
37using namespace MIPatternMatch;
41 "amdgpu-global-isel-new-legality",
42 cl::desc(
"Use GlobalISel desired legality, rather than try to use"
43 "rules compatible with selection patterns"),
68 const LLT Ty = Query.Types[TypeIdx];
75 EltSize > 1 && EltSize < 32 &&
82 const LLT Ty = Query.Types[TypeIdx];
89 const LLT Ty = Query.Types[TypeIdx];
97 const LLT Ty = Query.Types[TypeIdx];
99 return std::pair(TypeIdx,
106 const LLT Ty = Query.Types[TypeIdx];
109 unsigned Pieces = (
Size + 63) / 64;
120 const LLT Ty = Query.Types[TypeIdx];
125 const int NextMul32 = (
Size + 31) / 32;
129 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
148 const LLT Ty = Query.Types[TypeIdx];
155 const LLT Ty = Query.Types[TypeIdx];
165 const LLT QueryTy = Query.Types[TypeIdx];
172 const LLT QueryTy = Query.Types[TypeIdx];
179 const LLT QueryTy = Query.Types[TypeIdx];
190 return EltSize == 16 || EltSize % 32 == 0;
195 return EltSize == 32 || EltSize == 64 ||
197 EltSize == 128 || EltSize == 256;
220 const LLT QueryTy = Query.Types[TypeIdx];
232 const LLT Ty = Query.Types[TypeIdx];
234 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.
getSizeInBits();
242 bool IsLoad,
bool IsAtomic) {
246 return ST.enableFlatScratch() ? 128 : 32;
248 return ST.useDS128() ? 128 : 64;
258 return IsLoad ? 512 : 128;
263 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
272 const bool IsLoad = Query.
Opcode != AMDGPU::G_STORE;
277 unsigned AS = Query.
Types[1].getAddressSpace();
291 if (IsLoad && MemSize <
Size)
292 MemSize = std::max(MemSize,
Align);
301 AtomicOrdering::NotAtomic))
312 if (!ST.hasDwordx3LoadStores())
325 if (AlignBits < MemSize) {
328 Align(AlignBits / 8)))
354 return EltSize != 32 && EltSize != 64;
369 if (
Size != MemSizeInBits)
385 uint64_t AlignInBits,
unsigned AddrSpace,
395 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
406 if (AlignInBits < RoundedSize)
413 RoundedSize, AddrSpace,
Align(AlignInBits / 8),
420 if (Query.
MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
425 Query.
Types[1].getAddressSpace(), Opcode);
431 using namespace TargetOpcode;
433 auto GetAddrSpacePtr = [&
TM](
unsigned AS) {
477 std::initializer_list<LLT> AllS32Vectors =
478 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
479 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
480 std::initializer_list<LLT> AllS64Vectors =
481 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
491 const LLT CodePtr = FlatPtr;
493 const std::initializer_list<LLT> AddrSpaces64 = {
494 GlobalPtr, ConstantPtr, FlatPtr
497 const std::initializer_list<LLT> AddrSpaces32 = {
498 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
501 const std::initializer_list<LLT> FPTypesBase = {
505 const std::initializer_list<LLT> FPTypes16 = {
509 const std::initializer_list<LLT> FPTypesPK16 = {
521 .
legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
522 .legalFor(AllS32Vectors)
536 .legalFor({S32, S16, V2S16})
537 .clampMaxNumElementsStrict(0, S16, 2)
545 .clampMaxNumElementsStrict(0, S16, 2)
553 .legalFor({S32, S16, V2S16})
554 .minScalarOrElt(0, S16)
561 .legalFor({S32, S16})
571 .widenScalarToNextMultipleOf(0, 32)
578 .legalFor({S32, S16})
593 .widenScalarToNextMultipleOf(0, 32)
601 .widenScalarToNextMultipleOf(0, 32);
606 Mul.maxScalar(0, S32);
612 .minScalarOrElt(0, S32)
631 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
632 .customFor({S32, S64})
633 .clampScalar(0, S32, S64)
643 .clampMaxNumElements(0, S8, 2)
654 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
655 .clampScalar(0, S32, S64)
662 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
663 .legalFor({{S32, S1}, {S32, S32}})
664 .clampScalar(0, S32, S32)
674 .
legalFor({S1, S32, S64, S16, GlobalPtr,
675 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
682 .clampScalar(0, S16, S64);
707 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
708 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
709 .legalFor({S32, S64});
711 .customFor({S32, S64});
726 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
760 .legalFor(FPTypesPK16)
767 .legalFor({S32, S64, S16})
769 .clampScalar(0, S16, S64);
774 .clampScalar(0, S32, S64);
779 .legalFor({S32, S64})
781 .clampScalar(0, S32, S64);
786 .clampScalar(0, S32, S64);
797 .narrowScalarFor({{S64, S16}},
changeTo(0, S32))
806 .lowerFor({S64, V2S16});
812 .lowerFor({S64, S16, V2S16});
822 FMad.customFor({S32, S16});
824 FMad.customFor({S32});
826 FMad.customFor({S16});
832 FRem.customFor({S16, S32, S64});
834 FRem.minScalar(0, S32)
835 .customFor({S32, S64});
843 .clampMaxNumElements(0, S16, 2)
851 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
852 {S32, S1}, {S64, S1}, {S16, S1}})
854 .clampScalar(0, S32, S64)
855 .widenScalarToNextPow2(1, 32);
859 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
870 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
871 .customFor({{S64, S32}, {S64, S64}})
872 .narrowScalarFor({{S64, S16}},
changeTo(0, S32));
883 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
889 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
893 if (
ST.has16BitInsts()) {
894 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
895 .legalFor({S16, S32, S64})
896 .clampScalar(0, S16, S64)
899 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
900 .legalFor({S32, S64})
901 .clampScalar(0, S32, S64)
904 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
907 .clampScalar(0, S32, S64)
911 getActionDefinitionsBuilder(G_PTR_ADD)
914 .scalarSameSizeAs(1, 0);
916 getActionDefinitionsBuilder(G_PTRMASK)
918 .scalarSameSizeAs(1, 0)
922 getActionDefinitionsBuilder(G_ICMP)
933 .legalForCartesianProduct(
934 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
935 .legalForCartesianProduct(
936 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
937 if (
ST.has16BitInsts()) {
938 CmpBuilder.legalFor({{S1, S16}});
942 .widenScalarToNextPow2(1)
943 .clampScalar(1, S32, S64)
947 getActionDefinitionsBuilder(G_FCMP)
948 .legalForCartesianProduct({S1},
ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
949 .widenScalarToNextPow2(1)
950 .clampScalar(1, S32, S64)
954 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
955 if (
ST.has16BitInsts())
956 Exp2Ops.legalFor({S32, S16});
958 Exp2Ops.legalFor({S32});
959 Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
960 Exp2Ops.scalarize(0);
962 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
963 if (
ST.has16BitInsts())
964 ExpOps.customFor({{S32}, {S16}});
966 ExpOps.customFor({S32});
967 ExpOps.clampScalar(0, MinScalarFPTy, S32)
970 getActionDefinitionsBuilder(G_FPOWI)
971 .clampScalar(0, MinScalarFPTy, S32)
975 getActionDefinitionsBuilder(G_CTPOP)
976 .legalFor({{S32, S32}, {S32, S64}})
977 .clampScalar(0, S32, S32)
978 .widenScalarToNextPow2(1, 32)
979 .clampScalar(1, S32, S64)
981 .widenScalarToNextPow2(0, 32);
984 if (
ST.has16BitInsts())
985 getActionDefinitionsBuilder(G_IS_FPCLASS)
986 .legalForCartesianProduct({S1}, FPTypes16)
987 .widenScalarToNextPow2(1)
991 getActionDefinitionsBuilder(G_IS_FPCLASS)
992 .legalForCartesianProduct({S1}, FPTypesBase)
994 .widenScalarToNextPow2(1)
1001 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1003 .clampScalar(0, S32, S32)
1004 .clampScalar(1, S32, S64)
1005 .widenScalarToNextPow2(0, 32)
1006 .widenScalarToNextPow2(1, 32)
1010 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
1011 .legalFor({{S32, S32}, {S32, S64}})
1012 .clampScalar(0, S32, S32)
1013 .clampScalar(1, S32, S64)
1015 .widenScalarToNextPow2(0, 32)
1016 .widenScalarToNextPow2(1, 32);
1020 getActionDefinitionsBuilder(G_BITREVERSE)
1021 .legalFor({S32, S64})
1022 .clampScalar(0, S32, S64)
1024 .widenScalarToNextPow2(0);
1026 if (
ST.has16BitInsts()) {
1027 getActionDefinitionsBuilder(G_BSWAP)
1028 .legalFor({S16, S32, V2S16})
1029 .clampMaxNumElementsStrict(0, S16, 2)
1032 .widenScalarToNextPow2(0)
1033 .clampScalar(0, S16, S32)
1036 if (
ST.hasVOP3PInsts()) {
1037 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1038 .legalFor({S32, S16, V2S16})
1040 .clampMaxNumElements(0, S16, 2)
1042 .widenScalarToNextPow2(0)
1046 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1047 .legalFor({S32, S16})
1048 .widenScalarToNextPow2(0)
1055 getActionDefinitionsBuilder(G_BSWAP)
1060 .widenScalarToNextPow2(0)
1065 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1068 .widenScalarToNextPow2(0)
1073 getActionDefinitionsBuilder(G_INTTOPTR)
1075 .legalForCartesianProduct(AddrSpaces64, {S64})
1076 .legalForCartesianProduct(AddrSpaces32, {S32})
1089 getActionDefinitionsBuilder(G_PTRTOINT)
1091 .legalForCartesianProduct(AddrSpaces64, {S64})
1092 .legalForCartesianProduct(AddrSpaces32, {S32})
1105 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1109 const auto needToSplitMemOp = [=](
const LegalityQuery &Query,
1110 bool IsLoad) ->
bool {
1114 unsigned MemSize = Query.
MMODescrs[0].MemoryTy.getSizeInBits();
1128 unsigned NumRegs = (MemSize + 31) / 32;
1130 if (!
ST.hasDwordx3LoadStores())
1141 unsigned GlobalAlign32 =
ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1142 unsigned GlobalAlign16 =
ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1143 unsigned GlobalAlign8 =
ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1149 for (
unsigned Op : {G_LOAD, G_STORE}) {
1150 const bool IsStore =
Op == G_STORE;
1152 auto &Actions = getActionDefinitionsBuilder(Op);
1155 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1156 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1157 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1158 {S64, GlobalPtr, S64, GlobalAlign32},
1159 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1160 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1161 {S32, GlobalPtr, S8, GlobalAlign8},
1162 {S32, GlobalPtr, S16, GlobalAlign16},
1164 {S32, LocalPtr, S32, 32},
1165 {S64, LocalPtr, S64, 32},
1166 {V2S32, LocalPtr, V2S32, 32},
1167 {S32, LocalPtr, S8, 8},
1168 {S32, LocalPtr, S16, 16},
1169 {V2S16, LocalPtr, S32, 32},
1171 {S32, PrivatePtr, S32, 32},
1172 {S32, PrivatePtr, S8, 8},
1173 {S32, PrivatePtr, S16, 16},
1174 {V2S16, PrivatePtr, S32, 32},
1176 {S32, ConstantPtr, S32, GlobalAlign32},
1177 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1178 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1179 {S64, ConstantPtr, S64, GlobalAlign32},
1180 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1191 Actions.customIf(
typeIs(1, Constant32Ptr));
1217 return !Query.
Types[0].isVector() &&
1218 needToSplitMemOp(Query, Op == G_LOAD);
1220 [=](
const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1225 unsigned MemSize = Query.
MMODescrs[0].MemoryTy.getSizeInBits();
1228 if (DstSize > MemSize)
1234 if (MemSize > MaxSize)
1242 return Query.
Types[0].isVector() &&
1243 needToSplitMemOp(Query, Op == G_LOAD);
1245 [=](
const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1259 unsigned MemSize = Query.
MMODescrs[0].MemoryTy.getSizeInBits();
1260 if (MemSize > MaxSize) {
1264 if (MaxSize % EltSize == 0) {
1270 unsigned NumPieces = MemSize / MaxSize;
1274 if (NumPieces == 1 || NumPieces >= NumElts ||
1275 NumElts % NumPieces != 0)
1276 return std::pair(0, EltTy);
1284 return std::pair(0, EltTy);
1299 return std::pair(0, EltTy);
1303 .widenScalarToNextPow2(0)
1309 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1310 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1311 {S32, GlobalPtr, S16, 2 * 8},
1312 {S32, LocalPtr, S8, 8},
1313 {S32, LocalPtr, S16, 16},
1314 {S32, PrivatePtr, S8, 8},
1315 {S32, PrivatePtr, S16, 16},
1316 {S32, ConstantPtr, S8, 8},
1317 {S32, ConstantPtr, S16, 2 * 8}})
1323 if (
ST.hasFlatAddressSpace()) {
1324 ExtLoads.legalForTypesWithMemDesc(
1325 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1333 ExtLoads.customIf(
typeIs(1, Constant32Ptr));
1335 ExtLoads.clampScalar(0, S32, S32)
1336 .widenScalarToNextPow2(0)
1339 auto &Atomics = getActionDefinitionsBuilder(
1340 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1341 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1342 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1343 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1344 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1345 {S64, GlobalPtr}, {S64, LocalPtr},
1346 {S32, RegionPtr}, {S64, RegionPtr}});
1347 if (
ST.hasFlatAddressSpace()) {
1348 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1351 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1352 if (
ST.hasLDSFPAtomicAdd()) {
1353 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1354 if (
ST.hasGFX90AInsts())
1355 Atomic.legalFor({{S64, LocalPtr}});
1356 if (
ST.hasGFX940Insts())
1357 Atomic.legalFor({{V2S16, LocalPtr}});
1359 if (
ST.hasAtomicFaddInsts())
1360 Atomic.legalFor({{S32, GlobalPtr}});
1361 if (
ST.hasFlatAtomicFaddF32Inst())
1362 Atomic.legalFor({{S32, FlatPtr}});
1364 if (
ST.hasGFX90AInsts()) {
1377 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1378 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1379 {S32, FlatPtr}, {S64, FlatPtr}})
1380 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1381 {S32, RegionPtr}, {S64, RegionPtr}});
1385 getActionDefinitionsBuilder(G_SELECT)
1386 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1387 LocalPtr, FlatPtr, PrivatePtr,
1391 .clampScalar(0, S16, S64)
1395 .clampMaxNumElements(0, S32, 2)
1396 .clampMaxNumElements(0, LocalPtr, 2)
1397 .clampMaxNumElements(0, PrivatePtr, 2)
1399 .widenScalarToNextPow2(0)
1404 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1405 .legalFor({{S32, S32}, {S64, S32}});
1406 if (
ST.has16BitInsts()) {
1407 if (
ST.hasVOP3PInsts()) {
1408 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1409 .clampMaxNumElements(0, S16, 2);
1411 Shifts.legalFor({{S16, S16}});
1414 Shifts.widenScalarIf(
1419 const LLT AmountTy = Query.
Types[1];
1423 Shifts.maxScalarIf(
typeIs(0, S16), 1, S16);
1424 Shifts.clampScalar(1, S32, S32);
1425 Shifts.widenScalarToNextPow2(0, 16);
1426 Shifts.clampScalar(0, S16, S64);
1428 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1436 Shifts.clampScalar(1, S32, S32);
1437 Shifts.widenScalarToNextPow2(0, 32);
1438 Shifts.clampScalar(0, S32, S64);
1440 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1445 Shifts.scalarize(0);
1447 for (
unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1448 unsigned VecTypeIdx =
Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1449 unsigned EltTypeIdx =
Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1450 unsigned IdxTypeIdx = 2;
1452 getActionDefinitionsBuilder(Op)
1454 const LLT EltTy = Query.
Types[EltTypeIdx];
1455 const LLT VecTy = Query.
Types[VecTypeIdx];
1456 const LLT IdxTy = Query.
Types[IdxTypeIdx];
1458 return (EltSize == 32 || EltSize == 64) &&
1472 const LLT EltTy = Query.
Types[EltTypeIdx];
1473 const LLT VecTy = Query.
Types[VecTypeIdx];
1477 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1482 .clampScalar(EltTypeIdx, S32, S64)
1483 .clampScalar(VecTypeIdx, S32, S64)
1484 .clampScalar(IdxTypeIdx, S32, S32)
1485 .clampMaxNumElements(VecTypeIdx, S32, 32)
1492 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1494 const LLT &EltTy = Query.
Types[1].getElementType();
1495 return Query.
Types[0] != EltTy;
1498 for (
unsigned Op : {G_EXTRACT, G_INSERT}) {
1499 unsigned BigTyIdx =
Op == G_EXTRACT ? 1 : 0;
1500 unsigned LitTyIdx =
Op == G_EXTRACT ? 0 : 1;
1503 getActionDefinitionsBuilder(Op)
1509 const LLT BigTy = Query.
Types[BigTyIdx];
1514 const LLT BigTy = Query.
Types[BigTyIdx];
1515 const LLT LitTy = Query.
Types[LitTyIdx];
1521 const LLT BigTy = Query.
Types[BigTyIdx];
1527 const LLT LitTy = Query.
Types[LitTyIdx];
1532 .widenScalarToNextPow2(BigTyIdx, 32);
1536 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1537 .legalForCartesianProduct(AllS32Vectors, {S32})
1538 .legalForCartesianProduct(AllS64Vectors, {S64})
1539 .clampNumElements(0, V16S32, V32S32)
1540 .clampNumElements(0, V2S64, V16S64)
1543 if (
ST.hasScalarPackInsts()) {
1546 .minScalarOrElt(0, S16)
1549 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1550 .legalFor({V2S16, S32})
1553 BuildVector.customFor({V2S16, S16});
1554 BuildVector.minScalarOrElt(0, S32);
1556 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1557 .customFor({V2S16, S32})
1564 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1566 .clampMaxNumElements(0, S32, 32)
1567 .clampMaxNumElements(1, S16, 2)
1568 .clampMaxNumElements(0, S16, 64);
1570 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1573 for (
unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1574 unsigned BigTyIdx =
Op == G_MERGE_VALUES ? 0 : 1;
1575 unsigned LitTyIdx =
Op == G_MERGE_VALUES ? 1 : 0;
1577 auto notValidElt = [=](
const LegalityQuery &Query,
unsigned TypeIdx) {
1578 const LLT Ty = Query.
Types[TypeIdx];
1589 auto &
Builder = getActionDefinitionsBuilder(Op)
1591 .lowerFor({{S16, V2S16}})
1593 const LLT BigTy = Query.
Types[BigTyIdx];
1599 .widenScalarToNextPow2(LitTyIdx, 16)
1607 .clampScalar(LitTyIdx, S32, S512)
1608 .widenScalarToNextPow2(LitTyIdx, 32)
1611 [=](
const LegalityQuery &Query) {
return notValidElt(Query, LitTyIdx); },
1614 [=](
const LegalityQuery &Query) {
return notValidElt(Query, BigTyIdx); },
1616 .clampScalar(BigTyIdx, S32, MaxScalar);
1618 if (Op == G_MERGE_VALUES) {
1622 const LLT Ty = Query.
Types[LitTyIdx];
1630 const LLT Ty = Query.
Types[BigTyIdx];
1637 const LLT &Ty = Query.
Types[BigTyIdx];
1639 if (NewSizeInBits >= 256) {
1641 if (RoundedTo < NewSizeInBits)
1642 NewSizeInBits = RoundedTo;
1644 return std::pair(BigTyIdx,
LLT::scalar(NewSizeInBits));
1653 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1654 .legalFor({{S32}, {S64}});
1656 if (
ST.hasVOP3PInsts()) {
1657 SextInReg.lowerFor({{V2S16}})
1661 .clampMaxNumElementsStrict(0, S16, 2);
1662 }
else if (
ST.has16BitInsts()) {
1663 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1667 SextInReg.lowerFor({{S32}, {S64}});
1672 .clampScalar(0, S32, S64)
1675 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1680 getActionDefinitionsBuilder(G_FSHR)
1681 .legalFor({{S32, S32}})
1682 .lowerFor({{V2S16, V2S16}})
1683 .clampMaxNumElementsStrict(0, S16, 2)
1687 if (
ST.hasVOP3PInsts()) {
1688 getActionDefinitionsBuilder(G_FSHL)
1689 .lowerFor({{V2S16, V2S16}})
1690 .clampMaxNumElementsStrict(0, S16, 2)
1694 getActionDefinitionsBuilder(G_FSHL)
1699 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1702 getActionDefinitionsBuilder(G_FENCE)
1705 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1710 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1711 .legalFor({{S32, S32}, {S64, S32}})
1712 .clampScalar(1, S32, S32)
1713 .clampScalar(0, S32, S64)
1714 .widenScalarToNextPow2(0)
1717 getActionDefinitionsBuilder({
1721 G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1730 G_FMINIMUM, G_FMAXIMUM}).lower();
1732 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1735 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1736 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1737 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1740 getLegacyLegalizerInfo().computeTables();
1749 switch (
MI.getOpcode()) {
1750 case TargetOpcode::G_ADDRSPACE_CAST:
1752 case TargetOpcode::G_FRINT:
1754 case TargetOpcode::G_FCEIL:
1756 case TargetOpcode::G_FREM:
1758 case TargetOpcode::G_INTRINSIC_TRUNC:
1760 case TargetOpcode::G_SITOFP:
1762 case TargetOpcode::G_UITOFP:
1764 case TargetOpcode::G_FPTOSI:
1766 case TargetOpcode::G_FPTOUI:
1768 case TargetOpcode::G_FMINNUM:
1769 case TargetOpcode::G_FMAXNUM:
1770 case TargetOpcode::G_FMINNUM_IEEE:
1771 case TargetOpcode::G_FMAXNUM_IEEE:
1773 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1775 case TargetOpcode::G_INSERT_VECTOR_ELT:
1777 case TargetOpcode::G_FSIN:
1778 case TargetOpcode::G_FCOS:
1780 case TargetOpcode::G_GLOBAL_VALUE:
1782 case TargetOpcode::G_LOAD:
1783 case TargetOpcode::G_SEXTLOAD:
1784 case TargetOpcode::G_ZEXTLOAD:
1786 case TargetOpcode::G_FMAD:
1788 case TargetOpcode::G_FDIV:
1790 case TargetOpcode::G_UDIV:
1791 case TargetOpcode::G_UREM:
1792 case TargetOpcode::G_UDIVREM:
1794 case TargetOpcode::G_SDIV:
1795 case TargetOpcode::G_SREM:
1796 case TargetOpcode::G_SDIVREM:
1798 case TargetOpcode::G_ATOMIC_CMPXCHG:
1800 case TargetOpcode::G_FLOG:
1802 case TargetOpcode::G_FLOG10:
1804 case TargetOpcode::G_FEXP:
1806 case TargetOpcode::G_FPOW:
1808 case TargetOpcode::G_FFLOOR:
1810 case TargetOpcode::G_BUILD_VECTOR:
1811 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
1813 case TargetOpcode::G_MUL:
1815 case TargetOpcode::G_CTLZ:
1816 case TargetOpcode::G_CTTZ:
1818 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
1838 if (ST.hasApertureRegs()) {
1843 ? AMDGPU::SRC_SHARED_BASE
1844 : AMDGPU::SRC_PRIVATE_BASE;
1852 Register Dst =
MRI.createGenericVirtualRegister(S64);
1853 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
1854 B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {
Register(ApertureRegNo)});
1855 return B.buildUnmerge(S32, Dst).getReg(1);
1860 Register LoadAddr =
MRI.createGenericVirtualRegister(
1869 ST.getTargetLowering()->getImplicitParameterOffset(
B.getMF(), Param);
1871 Register KernargPtrReg =
MRI.createGenericVirtualRegister(
1885 B.buildPtrAdd(LoadAddr, KernargPtrReg,
1888 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1891 Register QueuePtr =
MRI.createGenericVirtualRegister(
1907 B.buildPtrAdd(LoadAddr, QueuePtr,
1908 B.buildConstant(
LLT::scalar(64), StructOffset).getReg(0));
1909 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1917 switch (Def->getOpcode()) {
1918 case AMDGPU::G_FRAME_INDEX:
1919 case AMDGPU::G_GLOBAL_VALUE:
1920 case AMDGPU::G_BLOCK_ADDR:
1922 case AMDGPU::G_CONSTANT: {
1923 const ConstantInt *CI = Def->getOperand(1).getCImm();
1942 LLT DstTy =
MRI.getType(Dst);
1943 LLT SrcTy =
MRI.getType(Src);
1954 if (
TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1955 MI.setDesc(
B.getTII().get(TargetOpcode::G_BITCAST));
1964 B.buildExtract(Dst, Src, 0);
1965 MI.eraseFromParent();
1969 unsigned NullVal =
TM.getNullPointerValue(DestAS);
1971 auto SegmentNull =
B.buildConstant(DstTy, NullVal);
1972 auto FlatNull =
B.buildConstant(SrcTy, 0);
1975 auto PtrLo32 =
B.buildExtract(DstTy, Src, 0);
1979 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1981 MI.eraseFromParent();
1993 Register SrcAsInt =
B.buildPtrToInt(S32, Src).getReg(0);
1997 auto BuildPtr =
B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
2000 B.buildCopy(Dst, BuildPtr);
2001 MI.eraseFromParent();
2005 auto SegmentNull =
B.buildConstant(SrcTy,
TM.getNullPointerValue(SrcAS));
2006 auto FlatNull =
B.buildConstant(DstTy,
TM.getNullPointerValue(DestAS));
2009 SegmentNull.getReg(0));
2011 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2013 MI.eraseFromParent();
2020 B.buildExtract(Dst, Src, 0);
2021 MI.eraseFromParent();
2029 auto PtrLo =
B.buildPtrToInt(S32, Src);
2030 auto HighAddr =
B.buildConstant(S32, AddrHiVal);
2031 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2032 MI.eraseFromParent();
2037 MF.
getFunction(),
"invalid addrspacecast",
B.getDebugLoc());
2040 Ctx.
diagnose(InvalidAddrSpaceCast);
2042 MI.eraseFromParent();
2050 LLT Ty =
MRI.getType(Src);
2056 auto C1 =
B.buildFConstant(Ty, C1Val);
2057 auto CopySign =
B.buildFCopysign(Ty, C1, Src);
2060 auto Tmp1 =
B.buildFAdd(Ty, Src, CopySign);
2061 auto Tmp2 =
B.buildFSub(Ty, Tmp1, CopySign);
2063 auto C2 =
B.buildFConstant(Ty, C2Val);
2064 auto Fabs =
B.buildFAbs(Ty, Src);
2067 B.buildSelect(
MI.getOperand(0).getReg(),
Cond, Src, Tmp2);
2068 MI.eraseFromParent();
2086 auto Trunc =
B.buildIntrinsicTrunc(S64, Src);
2088 const auto Zero =
B.buildFConstant(S64, 0.0);
2089 const auto One =
B.buildFConstant(S64, 1.0);
2092 auto And =
B.buildAnd(S1, Lt0, NeTrunc);
2093 auto Add =
B.buildSelect(S64,
And, One, Zero);
2096 B.buildFAdd(
MI.getOperand(0).getReg(), Trunc,
Add);
2097 MI.eraseFromParent();
2105 Register Src0Reg =
MI.getOperand(1).getReg();
2106 Register Src1Reg =
MI.getOperand(2).getReg();
2107 auto Flags =
MI.getFlags();
2108 LLT Ty =
MRI.getType(DstReg);
2110 auto Div =
B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2111 auto Trunc =
B.buildIntrinsicTrunc(Ty, Div, Flags);
2112 auto Neg =
B.buildFNeg(Ty, Trunc, Flags);
2113 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2114 MI.eraseFromParent();
2120 const unsigned FractBits = 52;
2121 const unsigned ExpBits = 11;
2124 auto Const0 =
B.buildConstant(S32, FractBits - 32);
2125 auto Const1 =
B.buildConstant(S32, ExpBits);
2127 auto ExpPart =
B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32},
false)
2129 .addUse(Const0.getReg(0))
2130 .addUse(Const1.getReg(0));
2132 return B.buildSub(S32, ExpPart,
B.buildConstant(S32, 1023));
2146 auto Unmerge =
B.buildUnmerge({S32, S32}, Src);
2153 const unsigned FractBits = 52;
2156 const auto SignBitMask =
B.buildConstant(S32, UINT32_C(1) << 31);
2157 auto SignBit =
B.buildAnd(S32,
Hi, SignBitMask);
2159 const auto FractMask =
B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2161 const auto Zero32 =
B.buildConstant(S32, 0);
2164 auto SignBit64 =
B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2166 auto Shr =
B.buildAShr(S64, FractMask, Exp);
2167 auto Not =
B.buildNot(S64, Shr);
2168 auto Tmp0 =
B.buildAnd(S64, Src, Not);
2169 auto FiftyOne =
B.buildConstant(S32, FractBits - 1);
2174 auto Tmp1 =
B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2175 B.buildSelect(
MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2176 MI.eraseFromParent();
2192 auto Unmerge =
B.buildUnmerge({S32, S32}, Src);
2193 auto ThirtyTwo =
B.buildConstant(S32, 32);
2195 if (
MRI.getType(Dst) == S64) {
2196 auto CvtHi =
Signed ?
B.buildSITOFP(S64, Unmerge.getReg(1))
2197 :
B.buildUITOFP(S64, Unmerge.getReg(1));
2199 auto CvtLo =
B.buildUITOFP(S64, Unmerge.getReg(0));
2200 auto LdExp =
B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64},
false)
2201 .addUse(CvtHi.getReg(0))
2202 .addUse(ThirtyTwo.getReg(0));
2205 B.buildFAdd(Dst, LdExp, CvtLo);
2206 MI.eraseFromParent();
2212 auto One =
B.buildConstant(S32, 1);
2216 auto ThirtyOne =
B.buildConstant(S32, 31);
2217 auto X =
B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2218 auto OppositeSign =
B.buildAShr(S32,
X, ThirtyOne);
2219 auto MaxShAmt =
B.buildAdd(S32, ThirtyTwo, OppositeSign);
2220 auto LS =
B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2222 .addUse(Unmerge.getReg(1));
2223 auto LS2 =
B.buildSub(S32, LS, One);
2224 ShAmt =
B.buildUMin(S32, LS2, MaxShAmt);
2226 ShAmt =
B.buildCTLZ(S32, Unmerge.getReg(1));
2227 auto Norm =
B.buildShl(S64, Src, ShAmt);
2228 auto Unmerge2 =
B.buildUnmerge({S32, S32}, Norm);
2229 auto Adjust =
B.buildUMin(S32, One, Unmerge2.getReg(0));
2230 auto Norm2 =
B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2231 auto FVal =
Signed ?
B.buildSITOFP(S32, Norm2) :
B.buildUITOFP(S32, Norm2);
2232 auto Scale =
B.buildSub(S32, ThirtyTwo, ShAmt);
2235 .addUse(FVal.getReg(0))
2236 .addUse(Scale.getReg(0));
2237 MI.eraseFromParent();
2254 const LLT SrcLT =
MRI.getType(Src);
2255 assert((SrcLT == S32 || SrcLT == S64) &&
MRI.getType(Dst) == S64);
2257 unsigned Flags =
MI.getFlags();
2268 auto Trunc =
B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2270 if (
Signed && SrcLT == S32) {
2276 Sign =
B.buildAShr(S32, Src,
B.buildConstant(S32, 31));
2277 Trunc =
B.buildFAbs(S32, Trunc, Flags);
2281 K0 =
B.buildFConstant(S64,
2283 K1 =
B.buildFConstant(S64,
2286 K0 =
B.buildFConstant(S32,
BitsToFloat(UINT32_C( 0x2f800000)));
2287 K1 =
B.buildFConstant(S32,
BitsToFloat(UINT32_C( 0xcf800000)));
2290 auto Mul =
B.buildFMul(SrcLT, Trunc, K0, Flags);
2291 auto FloorMul =
B.buildFFloor(SrcLT,
Mul, Flags);
2292 auto Fma =
B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2294 auto Hi = (
Signed && SrcLT == S64) ?
B.buildFPTOSI(S32, FloorMul)
2295 :
B.buildFPTOUI(S32, FloorMul);
2296 auto Lo =
B.buildFPTOUI(S32, Fma);
2298 if (
Signed && SrcLT == S32) {
2300 Sign =
B.buildMergeLikeInstr(S64, {Sign, Sign});
2302 B.buildSub(Dst,
B.buildXor(S64,
B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2305 B.buildMergeLikeInstr(Dst, {
Lo,
Hi});
2306 MI.eraseFromParent();
2316 const bool IsIEEEOp =
MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2317 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2340 std::optional<ValueAndVReg> MaybeIdxVal =
2344 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2349 LLT VecTy =
MRI.getType(Vec);
2354 auto Unmerge =
B.buildUnmerge(EltTy, Vec);
2355 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2360 MI.eraseFromParent();
2374 std::optional<ValueAndVReg> MaybeIdxVal =
2379 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2384 LLT VecTy =
MRI.getType(Vec);
2390 if (IdxVal < NumElts) {
2392 for (
unsigned i = 0; i < NumElts; ++i)
2393 SrcRegs.
push_back(
MRI.createGenericVirtualRegister(EltTy));
2394 B.buildUnmerge(SrcRegs, Vec);
2396 SrcRegs[IdxVal] =
MI.getOperand(2).getReg();
2397 B.buildMergeLikeInstr(Dst, SrcRegs);
2402 MI.eraseFromParent();
2412 LLT Ty =
MRI.getType(DstReg);
2413 unsigned Flags =
MI.getFlags();
2418 auto MulVal =
B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2419 TrigVal =
B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty},
false)
2420 .addUse(MulVal.getReg(0))
2421 .setMIFlags(Flags).getReg(0);
2423 TrigVal =
B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2426 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2430 MI.eraseFromParent();
2438 unsigned GAFlags)
const {
2439 assert(isInt<32>(
Offset + 4) &&
"32-bit offset is expected!");
2476 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2487 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2490 B.buildExtract(DstReg, PCReg, 0);
2498 LLT Ty =
MRI.getType(DstReg);
2510 Fn,
"local memory global used by non-kernel function",
MI.getDebugLoc(),
2520 B.buildUndef(DstReg);
2521 MI.eraseFromParent();
2541 if (
B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2546 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32},
false);
2547 B.buildIntToPtr(DstReg, Sz);
2548 MI.eraseFromParent();
2554 *cast<GlobalVariable>(GV)));
2555 MI.eraseFromParent();
2563 MI.eraseFromParent();
2569 MI.eraseFromParent();
2574 Register GOTAddr =
MRI.createGenericVirtualRegister(PtrTy);
2587 auto Load =
B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2588 B.buildExtract(DstReg, Load, 0);
2590 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2592 MI.eraseFromParent();
2610 LLT PtrTy =
MRI.getType(PtrReg);
2615 auto Cast =
B.buildAddrSpaceCast(ConstPtr, PtrReg);
2617 MI.getOperand(1).setReg(Cast.getReg(0));
2622 if (
MI.getOpcode() != AMDGPU::G_LOAD)
2626 LLT ValTy =
MRI.getType(ValReg);
2641 if (WideMemSize == ValSize) {
2647 MI.setMemRefs(MF, {WideMMO});
2653 if (ValSize > WideMemSize)
2660 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2661 B.buildTrunc(ValReg, WideLoad).getReg(0);
2668 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2669 B.buildExtract(ValReg, WideLoad, 0);
2673 WideLoad =
B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2674 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2678 MI.eraseFromParent();
2688 LLT Ty =
MRI.getType(
MI.getOperand(0).getReg());
2715 "this should not have been custom lowered");
2717 LLT ValTy =
MRI.getType(CmpVal);
2720 Register PackedVal =
B.buildBuildVector(VecTy, { NewVal, CmpVal }).
getReg(0);
2722 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2726 .setMemRefs(
MI.memoperands());
2728 MI.eraseFromParent();
2736 LLT Ty =
B.getMRI()->getType(Dst);
2737 unsigned Flags =
MI.getFlags();
2739 auto Log2Operand =
B.buildFLog2(Ty, Src, Flags);
2740 auto Log2BaseInvertedOperand =
B.buildFConstant(Ty, Log2BaseInverted);
2742 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2743 MI.eraseFromParent();
2751 unsigned Flags =
MI.getFlags();
2752 LLT Ty =
B.getMRI()->getType(Dst);
2755 auto Mul =
B.buildFMul(Ty, Src, K, Flags);
2756 B.buildFExp2(Dst,
Mul, Flags);
2757 MI.eraseFromParent();
2766 unsigned Flags =
MI.getFlags();
2767 LLT Ty =
B.getMRI()->getType(Dst);
2772 auto Log =
B.buildFLog2(S32, Src0, Flags);
2773 auto Mul =
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32},
false)
2774 .addUse(Log.getReg(0))
2777 B.buildFExp2(Dst,
Mul, Flags);
2778 }
else if (Ty == S16) {
2780 auto Log =
B.buildFLog2(S16, Src0, Flags);
2781 auto Ext0 =
B.buildFPExt(S32, Log, Flags);
2782 auto Ext1 =
B.buildFPExt(S32, Src1, Flags);
2783 auto Mul =
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32},
false)
2784 .addUse(Ext0.getReg(0))
2785 .addUse(Ext1.getReg(0))
2788 B.buildFExp2(Dst,
B.buildFPTrunc(S16,
Mul), Flags);
2792 MI.eraseFromParent();
2800 ModSrc = SrcFNeg->getOperand(1).getReg();
2802 ModSrc = SrcFAbs->getOperand(1).getReg();
2804 ModSrc = SrcFAbs->getOperand(1).getReg();
2815 Register OrigSrc =
MI.getOperand(1).getReg();
2816 unsigned Flags =
MI.getFlags();
2818 "this should not have been custom lowered");
2828 auto Fract =
B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64},
false)
2839 auto Const =
B.buildFConstant(S64,
BitsToDouble(0x3fefffffffffffff));
2841 Register Min =
MRI.createGenericVirtualRegister(S64);
2847 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2849 B.buildFMinNum(Min, Fract, Const, Flags);
2854 CorrectedFract =
B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2857 auto NegFract =
B.buildFNeg(S64, CorrectedFract, Flags);
2858 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2860 MI.eraseFromParent();
2876 if (
MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
2878 Src0 =
B.buildTrunc(S16,
MI.getOperand(1).getReg()).getReg(0);
2879 Src1 =
B.buildTrunc(S16,
MI.getOperand(2).getReg()).getReg(0);
2882 auto Merge =
B.buildMergeLikeInstr(S32, {Src0, Src1});
2883 B.buildBitcast(Dst,
Merge);
2885 MI.eraseFromParent();
2901 bool UsePartialMad64_32,
bool SeparateOddAlignedProducts)
const {
2915 auto getZero32 = [&]() ->
Register {
2917 Zero32 =
B.buildConstant(S32, 0).getReg(0);
2920 auto getZero64 = [&]() ->
Register {
2922 Zero64 =
B.buildConstant(S64, 0).getReg(0);
2932 if (CarryIn.empty())
2935 bool HaveCarryOut =
true;
2937 if (CarryIn.size() == 1) {
2939 LocalAccum =
B.buildZExt(S32, CarryIn[0]).getReg(0);
2943 CarryAccum = getZero32();
2945 CarryAccum =
B.buildZExt(S32, CarryIn[0]).getReg(0);
2946 for (
unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
2948 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
2953 LocalAccum = getZero32();
2954 HaveCarryOut =
false;
2959 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
2960 LocalAccum =
Add.getReg(0);
2974 auto buildMadChain =
2977 assert((DstIndex + 1 < Accum.
size() && LocalAccum.size() == 2) ||
2978 (DstIndex + 1 >= Accum.
size() && LocalAccum.size() == 1));
2985 if (LocalAccum.size() == 1 &&
2986 (!UsePartialMad64_32 || !CarryIn.empty())) {
2988 unsigned j1 = DstIndex - j0;
2989 auto Mul =
B.buildMul(S32, Src0[j0], Src1[j1]);
2990 if (!LocalAccum[0]) {
2991 LocalAccum[0] =
Mul.getReg(0);
2993 if (CarryIn.empty()) {
2994 LocalAccum[0] =
B.buildAdd(S32, LocalAccum[0],
Mul).getReg(0);
2997 B.buildUAdde(S32, S1, LocalAccum[0],
Mul, CarryIn.back())
3003 }
while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3007 if (j0 <= DstIndex) {
3008 bool HaveSmallAccum =
false;
3011 if (LocalAccum[0]) {
3012 if (LocalAccum.size() == 1) {
3013 Tmp =
B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
3014 HaveSmallAccum =
true;
3015 }
else if (LocalAccum[1]) {
3016 Tmp =
B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
3017 HaveSmallAccum =
false;
3019 Tmp =
B.buildZExt(S64, LocalAccum[0]).getReg(0);
3020 HaveSmallAccum =
true;
3023 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
3025 HaveSmallAccum =
true;
3029 unsigned j1 = DstIndex - j0;
3030 auto Mad =
B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
3031 {Src0[j0], Src1[j1], Tmp});
3032 Tmp = Mad.getReg(0);
3033 if (!HaveSmallAccum)
3034 CarryOut.push_back(Mad.getReg(1));
3035 HaveSmallAccum =
false;
3037 }
while (j0 <= DstIndex);
3039 auto Unmerge =
B.buildUnmerge(S32, Tmp);
3040 LocalAccum[0] = Unmerge.getReg(0);
3041 if (LocalAccum.size() > 1)
3042 LocalAccum[1] = Unmerge.getReg(1);
3069 for (
unsigned i = 0; i <= Accum.
size() / 2; ++i) {
3070 Carry OddCarryIn = std::move(OddCarry);
3071 Carry EvenCarryIn = std::move(EvenCarry);
3076 if (2 * i < Accum.
size()) {
3077 auto LocalAccum = Accum.
drop_front(2 * i).take_front(2);
3078 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
3083 if (!SeparateOddAlignedProducts) {
3084 auto LocalAccum = Accum.
drop_front(2 * i - 1).take_front(2);
3085 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3087 bool IsHighest = 2 * i >= Accum.
size();
3091 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3097 Lo =
B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
3099 Lo =
B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
3101 Lo =
B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
3104 Accum[2 * i - 1] =
Lo->getOperand(0).getReg();
3107 auto Hi =
B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
3108 Lo->getOperand(1).getReg());
3109 Accum[2 * i] =
Hi.getReg(0);
3110 SeparateOddCarry =
Hi.getReg(1);
3117 if (
Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
3118 EvenCarryIn.push_back(CarryOut);
3120 if (2 * i < Accum.
size()) {
3121 if (
Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
3122 OddCarry.push_back(CarryOut);
3135 assert(
MI.getOpcode() == TargetOpcode::G_MUL);
3144 LLT Ty =
MRI.getType(DstReg);
3148 unsigned NumParts =
Size / 32;
3164 for (
unsigned i = 0; i < NumParts; ++i) {
3165 Src0Parts.
push_back(
MRI.createGenericVirtualRegister(S32));
3166 Src1Parts.
push_back(
MRI.createGenericVirtualRegister(S32));
3168 B.buildUnmerge(Src0Parts, Src0);
3169 B.buildUnmerge(Src1Parts, Src1);
3172 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
3173 SeparateOddAlignedProducts);
3175 B.buildMergeLikeInstr(DstReg, AccumRegs);
3176 MI.eraseFromParent();
3189 LLT DstTy =
MRI.getType(Dst);
3190 LLT SrcTy =
MRI.getType(Src);
3192 unsigned NewOpc =
MI.getOpcode() == AMDGPU::G_CTLZ
3193 ? AMDGPU::G_AMDGPU_FFBH_U32
3194 : AMDGPU::G_AMDGPU_FFBL_B32;
3195 auto Tmp =
B.buildInstr(NewOpc, {DstTy}, {Src});
3198 MI.eraseFromParent();
3204 if (
MI.getOpcode() != TargetOpcode::G_XOR)
3207 return ConstVal && *ConstVal == -1;
3214 Register CondDef =
MI.getOperand(0).getReg();
3215 if (!
MRI.hasOneNonDBGUse(CondDef))
3223 if (!
MRI.hasOneNonDBGUse(NegatedCond))
3229 UseMI = &*
MRI.use_instr_nodbg_begin(NegatedCond);
3238 if (Next == Parent->
end()) {
3242 UncondBrTarget = &*NextMBB;
3244 if (Next->getOpcode() != AMDGPU::G_BR)
3262 *ArgRC,
B.getDebugLoc(), ArgTy);
3263 if (
Arg->isMasked()) {
3266 const unsigned Mask =
Arg->getMask();
3267 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
3274 auto ShiftAmt =
B.buildConstant(S32, Shift);
3275 AndMaskSrc =
B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
3278 B.buildAnd(DstReg, AndMaskSrc,
B.buildConstant(S32, Mask >> Shift));
3280 B.buildCopy(DstReg, LiveIn);
3299 B.buildConstant(DstReg, 0);
3305 B.buildUndef(DstReg);
3309 if (!
Arg->isRegister() || !
Arg->getRegister().isValid())
3320 MI.eraseFromParent();
3326 B.buildConstant(
MI.getOperand(0).getReg(),
C);
3327 MI.eraseFromParent();
3348 B.buildUndef(DstReg);
3349 MI.eraseFromParent();
3353 if (
Arg->isMasked()) {
3367 MI.eraseFromParent();
3374 Register KernArgReg =
B.getMRI()->createGenericVirtualRegister(PtrTy);
3384 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
3392 Align Alignment)
const {
3396 "unexpected kernarg parameter type");
3400 B.buildLoad(DstReg,
Ptr, PtrInfo,
Align(4),
3403 MI.eraseFromParent();
3411 LLT DstTy =
MRI.getType(Dst);
3438 auto FloatY =
B.buildUITOFP(S32,
Y);
3439 auto RcpIFlag =
B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
3440 auto Scale =
B.buildFConstant(S32,
BitsToFloat(0x4f7ffffe));
3441 auto ScaledY =
B.buildFMul(S32, RcpIFlag, Scale);
3442 auto Z =
B.buildFPTOUI(S32, ScaledY);
3445 auto NegY =
B.buildSub(S32,
B.buildConstant(S32, 0),
Y);
3446 auto NegYZ =
B.buildMul(S32, NegY, Z);
3447 Z =
B.buildAdd(S32, Z,
B.buildUMulH(S32, Z, NegYZ));
3450 auto Q =
B.buildUMulH(S32,
X, Z);
3451 auto R =
B.buildSub(S32,
X,
B.buildMul(S32, Q,
Y));
3454 auto One =
B.buildConstant(S32, 1);
3457 Q =
B.buildSelect(S32,
Cond,
B.buildAdd(S32, Q, One), Q);
3458 R =
B.buildSelect(S32,
Cond,
B.buildSub(S32, R,
Y), R);
3463 B.buildSelect(DstDivReg,
Cond,
B.buildAdd(S32, Q, One), Q);
3466 B.buildSelect(DstRemReg,
Cond,
B.buildSub(S32, R,
Y), R);
3485 auto Unmerge =
B.buildUnmerge(S32, Val);
3487 auto CvtLo =
B.buildUITOFP(S32, Unmerge.getReg(0));
3488 auto CvtHi =
B.buildUITOFP(S32, Unmerge.getReg(1));
3490 auto Mad =
B.buildFMAD(S32, CvtHi,
3491 B.buildFConstant(S32,
BitsToFloat(0x4f800000)), CvtLo);
3493 auto Rcp =
B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3495 B.buildFMul(S32, Rcp,
B.buildFConstant(S32,
BitsToFloat(0x5f7ffffc)));
3499 B.buildFMul(S32, Mul1,
B.buildFConstant(S32,
BitsToFloat(0x2f800000)));
3500 auto Trunc =
B.buildIntrinsicTrunc(S32, Mul2);
3503 auto Mad2 =
B.buildFMAD(S32, Trunc,
3506 auto ResultLo =
B.buildFPTOUI(S32, Mad2);
3507 auto ResultHi =
B.buildFPTOUI(S32, Trunc);
3509 return {ResultLo.getReg(0), ResultHi.getReg(0)};
3524 auto Rcp =
B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
3526 auto Zero64 =
B.buildConstant(S64, 0);
3527 auto NegDenom =
B.buildSub(S64, Zero64, Denom);
3529 auto MulLo1 =
B.buildMul(S64, NegDenom, Rcp);
3530 auto MulHi1 =
B.buildUMulH(S64, Rcp, MulLo1);
3532 auto UnmergeMulHi1 =
B.buildUnmerge(S32, MulHi1);
3533 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3534 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3536 auto Add1_Lo =
B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3537 auto Add1_Hi =
B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3538 auto Add1 =
B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
3540 auto MulLo2 =
B.buildMul(S64, NegDenom, Add1);
3541 auto MulHi2 =
B.buildUMulH(S64, Add1, MulLo2);
3542 auto UnmergeMulHi2 =
B.buildUnmerge(S32, MulHi2);
3543 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3544 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3546 auto Zero32 =
B.buildConstant(S32, 0);
3547 auto Add2_Lo =
B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3548 auto Add2_Hi =
B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3549 auto Add2 =
B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
3551 auto UnmergeNumer =
B.buildUnmerge(S32, Numer);
3552 Register NumerLo = UnmergeNumer.getReg(0);
3553 Register NumerHi = UnmergeNumer.getReg(1);
3555 auto MulHi3 =
B.buildUMulH(S64, Numer, Add2);
3556 auto Mul3 =
B.buildMul(S64, Denom, MulHi3);
3557 auto UnmergeMul3 =
B.buildUnmerge(S32, Mul3);
3558 Register Mul3_Lo = UnmergeMul3.getReg(0);
3559 Register Mul3_Hi = UnmergeMul3.getReg(1);
3560 auto Sub1_Lo =
B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3561 auto Sub1_Hi =
B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3562 auto Sub1_Mi =
B.buildSub(S32, NumerHi, Mul3_Hi);
3563 auto Sub1 =
B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
3565 auto UnmergeDenom =
B.buildUnmerge(S32, Denom);
3566 Register DenomLo = UnmergeDenom.getReg(0);
3567 Register DenomHi = UnmergeDenom.getReg(1);
3570 auto C1 =
B.buildSExt(S32, CmpHi);
3573 auto C2 =
B.buildSExt(S32, CmpLo);
3576 auto C3 =
B.buildSelect(S32, CmpEq, C2, C1);
3583 auto Sub2_Lo =
B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3584 auto Sub2_Mi =
B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3585 auto Sub2_Hi =
B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3586 auto Sub2 =
B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
3588 auto One64 =
B.buildConstant(S64, 1);
3589 auto Add3 =
B.buildAdd(S64, MulHi3, One64);
3595 auto C6 =
B.buildSelect(
3599 auto Add4 =
B.buildAdd(S64, Add3, One64);
3600 auto Sub3_Lo =
B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3602 auto Sub3_Mi =
B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3603 auto Sub3_Hi =
B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3604 auto Sub3 =
B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
3610 auto Sel1 =
B.buildSelect(
3617 auto Sel2 =
B.buildSelect(
3628 switch (
MI.getOpcode()) {
3631 case AMDGPU::G_UDIV: {
3632 DstDivReg =
MI.getOperand(0).getReg();
3635 case AMDGPU::G_UREM: {
3636 DstRemReg =
MI.getOperand(0).getReg();
3639 case AMDGPU::G_UDIVREM: {
3640 DstDivReg =
MI.getOperand(0).getReg();
3641 DstRemReg =
MI.getOperand(1).getReg();
3648 const unsigned FirstSrcOpIdx =
MI.getNumExplicitDefs();
3649 Register Num =
MI.getOperand(FirstSrcOpIdx).getReg();
3650 Register Den =
MI.getOperand(FirstSrcOpIdx + 1).getReg();
3651 LLT Ty =
MRI.getType(
MI.getOperand(0).getReg());
3660 MI.eraseFromParent();
3670 LLT Ty =
MRI.getType(
MI.getOperand(0).getReg());
3671 if (Ty != S32 && Ty != S64)
3674 const unsigned FirstSrcOpIdx =
MI.getNumExplicitDefs();
3678 auto SignBitOffset =
B.buildConstant(S32, Ty.
getSizeInBits() - 1);
3679 auto LHSign =
B.buildAShr(Ty,
LHS, SignBitOffset);
3680 auto RHSign =
B.buildAShr(Ty,
RHS, SignBitOffset);
3682 LHS =
B.buildAdd(Ty,
LHS, LHSign).getReg(0);
3683 RHS =
B.buildAdd(Ty,
RHS, RHSign).getReg(0);
3685 LHS =
B.buildXor(Ty,
LHS, LHSign).getReg(0);
3686 RHS =
B.buildXor(Ty,
RHS, RHSign).getReg(0);
3688 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3689 switch (
MI.getOpcode()) {
3692 case AMDGPU::G_SDIV: {
3693 DstDivReg =
MI.getOperand(0).getReg();
3694 TmpDivReg =
MRI.createGenericVirtualRegister(Ty);
3697 case AMDGPU::G_SREM: {
3698 DstRemReg =
MI.getOperand(0).getReg();
3699 TmpRemReg =
MRI.createGenericVirtualRegister(Ty);
3702 case AMDGPU::G_SDIVREM: {
3703 DstDivReg =
MI.getOperand(0).getReg();
3704 DstRemReg =
MI.getOperand(1).getReg();
3705 TmpDivReg =
MRI.createGenericVirtualRegister(Ty);
3706 TmpRemReg =
MRI.createGenericVirtualRegister(Ty);
3717 auto Sign =
B.buildXor(Ty, LHSign, RHSign).getReg(0);
3718 auto SignXor =
B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3719 B.buildSub(DstDivReg, SignXor, Sign);
3723 auto Sign = LHSign.getReg(0);
3724 auto SignXor =
B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3725 B.buildSub(DstRemReg, SignXor, Sign);
3728 MI.eraseFromParent();
3739 LLT ResTy =
MRI.getType(Res);
3745 if (!AllowInaccurateRcp)
3750 if (CLHS->isExactlyValue(1.0)) {
3751 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res,
false)
3755 MI.eraseFromParent();
3760 if (CLHS->isExactlyValue(-1.0)) {
3761 auto FNeg =
B.buildFNeg(ResTy,
RHS, Flags);
3762 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res,
false)
3763 .addUse(FNeg.getReg(0))
3766 MI.eraseFromParent();
3772 auto RCP =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy},
false)
3775 B.buildFMul(Res,
LHS, RCP, Flags);
3777 MI.eraseFromParent();
3788 LLT ResTy =
MRI.getType(Res);
3794 if (!AllowInaccurateRcp)
3797 auto NegY =
B.buildFNeg(ResTy,
Y);
3798 auto One =
B.buildFConstant(ResTy, 1.0);
3800 auto R =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy},
false)
3804 auto Tmp0 =
B.buildFMA(ResTy, NegY, R, One);
3805 R =
B.buildFMA(ResTy, Tmp0, R, R);
3807 auto Tmp1 =
B.buildFMA(ResTy, NegY, R, One);
3808 R =
B.buildFMA(ResTy, Tmp1, R, R);
3810 auto Ret =
B.buildFMul(ResTy,
X, R);
3811 auto Tmp2 =
B.buildFMA(ResTy, NegY, Ret,
X);
3813 B.buildFMA(Res, Tmp2, R, Ret);
3814 MI.eraseFromParent();
3833 auto LHSExt =
B.buildFPExt(S32,
LHS, Flags);
3834 auto RHSExt =
B.buildFPExt(S32,
RHS, Flags);
3836 auto RCP =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32},
false)
3837 .addUse(RHSExt.getReg(0))
3840 auto QUOT =
B.buildFMul(S32, LHSExt, RCP, Flags);
3841 auto RDst =
B.buildFPTrunc(S16, QUOT, Flags);
3843 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res,
false)
3844 .addUse(RDst.getReg(0))
3849 MI.eraseFromParent();
3860 unsigned SPDenormMode =
3863 if (ST.hasDenormModeInst()) {
3865 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3867 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3868 B.buildInstr(AMDGPU::S_DENORM_MODE)
3869 .addImm(NewDenormModeValue);
3877 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3878 .addImm(SPDenormMode)
3879 .addImm(SPDenormModeBitField);
3900 auto One =
B.buildFConstant(S32, 1.0f);
3902 auto DenominatorScaled =
3903 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1},
false)
3908 auto NumeratorScaled =
3909 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1},
false)
3915 auto ApproxRcp =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32},
false)
3916 .addUse(DenominatorScaled.getReg(0))
3918 auto NegDivScale0 =
B.buildFNeg(S32, DenominatorScaled, Flags);
3922 if (!Mode.allFP32Denormals())
3925 auto Fma0 =
B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3926 auto Fma1 =
B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3927 auto Mul =
B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3928 auto Fma2 =
B.buildFMA(S32, NegDivScale0,
Mul, NumeratorScaled, Flags);
3929 auto Fma3 =
B.buildFMA(S32, Fma2, Fma1,
Mul, Flags);
3930 auto Fma4 =
B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3932 if (!Mode.allFP32Denormals())
3935 auto Fmas =
B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32},
false)
3936 .addUse(Fma4.getReg(0))
3937 .addUse(Fma1.getReg(0))
3938 .addUse(Fma3.getReg(0))
3939 .addUse(NumeratorScaled.getReg(1))
3942 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res,
false)
3943 .addUse(Fmas.getReg(0))
3948 MI.eraseFromParent();
3967 auto One =
B.buildFConstant(S64, 1.0);
3969 auto DivScale0 =
B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1},
false)
3975 auto NegDivScale0 =
B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3977 auto Rcp =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64},
false)
3978 .addUse(DivScale0.getReg(0))
3981 auto Fma0 =
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3982 auto Fma1 =
B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3983 auto Fma2 =
B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3985 auto DivScale1 =
B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1},
false)
3991 auto Fma3 =
B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3992 auto Mul =
B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3993 auto Fma4 =
B.buildFMA(S64, NegDivScale0,
Mul, DivScale1.getReg(0), Flags);
4002 auto NumUnmerge =
B.buildUnmerge(S32,
LHS);
4003 auto DenUnmerge =
B.buildUnmerge(S32,
RHS);
4004 auto Scale0Unmerge =
B.buildUnmerge(S32, DivScale0);
4005 auto Scale1Unmerge =
B.buildUnmerge(S32, DivScale1);
4008 Scale1Unmerge.getReg(1));
4010 Scale0Unmerge.getReg(1));
4011 Scale =
B.buildXor(S1, CmpNum, CmpDen).getReg(0);
4013 Scale = DivScale1.getReg(1);
4016 auto Fmas =
B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64},
false)
4017 .addUse(Fma4.getReg(0))
4018 .addUse(Fma3.getReg(0))
4019 .addUse(
Mul.getReg(0))
4023 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup,
ArrayRef(Res),
false)
4024 .addUse(Fmas.getReg(0))
4029 MI.eraseFromParent();
4044 auto Abs =
B.buildFAbs(S32,
RHS, Flags);
4047 auto C0 =
B.buildConstant(S32, 0x6f800000);
4048 auto C1 =
B.buildConstant(S32, 0x2f800000);
4052 auto Sel =
B.buildSelect(S32, CmpRes, C1, C2, Flags);
4054 auto Mul0 =
B.buildFMul(S32,
RHS, Sel, Flags);
4056 auto RCP =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32},
false)
4057 .addUse(Mul0.getReg(0))
4060 auto Mul1 =
B.buildFMul(S32,
LHS, RCP, Flags);
4062 B.buildFMul(Res, Sel, Mul1, Flags);
4064 MI.eraseFromParent();
4082 auto Flags =
MI.getFlags();
4084 LLT Ty =
MRI.getType(Dst);
4094 auto Rsq =
B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty},
false)
4104 auto ClampMax = UseIEEE ?
B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
4105 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
4110 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
4112 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
4113 MI.eraseFromParent();
4119 case Intrinsic::amdgcn_ds_fadd:
4120 return AMDGPU::G_ATOMICRMW_FADD;
4121 case Intrinsic::amdgcn_ds_fmin:
4122 return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
4123 case Intrinsic::amdgcn_ds_fmax:
4124 return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
4140 for (
int I = 6;
I > 3; --
I)
4141 MI.removeOperand(
I);
4143 MI.removeOperand(1);
4154 LLT DstTy =
MRI.getType(DstReg);
4157 Register KernargPtrReg =
MRI.createGenericVirtualRegister(DstTy);
4163 B.buildPtrAdd(DstReg, KernargPtrReg,
B.buildConstant(IdxTy,
Offset).getReg(0));
4180 MI.eraseFromParent();
4188 std::optional<uint32_t> KnownSize =
4190 if (KnownSize.has_value())
4191 B.buildConstant(DstReg, *KnownSize);
4209 MI.eraseFromParent();
4216 unsigned AddrSpace)
const {
4218 auto Unmerge =
B.buildUnmerge(
LLT::scalar(32),
MI.getOperand(2).getReg());
4222 MI.eraseFromParent();
4232std::pair<Register, unsigned>
4235 const unsigned MaxImm = 4095;
4241 std::tie(BaseReg, ImmOffset) =
4245 if (
MRI.getType(BaseReg).isPointer())
4246 BaseReg =
B.buildPtrToInt(
MRI.getType(OrigOffset), BaseReg).getReg(0);
4255 unsigned Overflow = ImmOffset & ~MaxImm;
4256 ImmOffset -= Overflow;
4257 if ((int32_t)Overflow < 0) {
4258 Overflow += ImmOffset;
4262 if (Overflow != 0) {
4264 BaseReg =
B.buildConstant(S32, Overflow).getReg(0);
4266 auto OverflowVal =
B.buildConstant(S32, Overflow);
4267 BaseReg =
B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
4272 BaseReg =
B.buildConstant(S32, 0).getReg(0);
4274 return std::pair(BaseReg, ImmOffset);
4280 unsigned ImmOffset,
Register VIndex,
4282 std::optional<ValueAndVReg> MaybeVOffsetVal =
4284 std::optional<ValueAndVReg> MaybeSOffsetVal =
4286 std::optional<ValueAndVReg> MaybeVIndexVal =
4291 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
4292 MaybeVIndexVal->Value == 0) {
4293 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
4294 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
4306 bool ImageStore)
const {
4309 LLT StoreVT =
MRI.getType(Reg);
4313 auto Unmerge =
B.buildUnmerge(S16, Reg);
4316 for (
int I = 0,
E = Unmerge->getNumOperands() - 1;
I !=
E; ++
I)
4317 WideRegs.
push_back(
B.buildAnyExt(S32, Unmerge.getReg(
I)).getReg(0));
4328 Reg =
B.buildBitcast(S32, Reg).getReg(0);
4330 PackedRegs.
resize(2,
B.buildUndef(S32).getReg(0));
4337 auto Unmerge =
B.buildUnmerge(S16, Reg);
4338 for (
int I = 0,
E = Unmerge->getNumOperands() - 1;
I !=
E; ++
I)
4340 PackedRegs.
resize(6,
B.buildUndef(S16).getReg(0));
4348 auto Unmerge =
B.buildUnmerge(S32, Reg);
4349 for (
int I = 0,
E = Unmerge->getNumOperands() - 1;
I !=
E; ++
I)
4351 PackedRegs.
resize(4,
B.buildUndef(S32).getReg(0));
4369 LLT Ty =
MRI->getType(VData);
4393 bool IsFormat)
const {
4395 LLT Ty =
MRI.getType(VData);
4397 const bool IsD16 = IsFormat && (EltTy.
getSizeInBits() == 16);
4404 const int MemSize = MMO->
getSize();
4409 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4412 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps;
4416 VIndex =
MI.getOperand(3).getReg();
4419 VIndex =
B.buildConstant(S32, 0).getReg(0);
4422 Register VOffset =
MI.getOperand(3 + OpOffset).getReg();
4423 Register SOffset =
MI.getOperand(4 + OpOffset).getReg();
4427 Format =
MI.getOperand(5 + OpOffset).getImm();
4431 unsigned AuxiliaryData =
MI.getOperand(5 + OpOffset).getImm();
4438 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
4439 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
4440 }
else if (IsFormat) {
4441 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
4442 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
4446 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
4449 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
4452 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
4457 auto MIB =
B.buildInstr(Opc)
4468 MIB.addImm(AuxiliaryData)
4469 .addImm(HasVIndex ? -1 : 0)
4470 .addMemOperand(MMO);
4472 MI.eraseFromParent();
4478 unsigned ImmOffset,
unsigned Format,
4481 auto MIB =
B.buildInstr(Opc)
4492 MIB.addImm(AuxiliaryData)
4493 .addImm(HasVIndex ? -1 : 0)
4494 .addMemOperand(MMO);
                                             bool IsTyped) const {
  // ...
  assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
  bool IsTFE = MI.getNumExplicitDefs() == 2;
  // ...
    StatusDst = MI.getOperand(1).getReg();
  // ...
  Register RSrc = MI.getOperand(2 + OpOffset).getReg();
  // ...
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;
  // ...
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
  // ...
    VIndex = MI.getOperand(3 + OpOffset).getReg();
  // ...
    VIndex = B.buildConstant(S32, 0).getReg(0);
  // ...
  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();
  // ...
    Format = MI.getOperand(5 + OpOffset).getImm();
  // ...
  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  // ...
  LLT Ty = MRI.getType(Dst);
  // ...
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  // ...
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    // ...
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
    // ...
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  // ...
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
    // ...
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
    // ...
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
  // ...
    unsigned NumLoadDWords = NumValueDWords + 1;
    // ...
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    if (NumValueDWords == 1) {
      B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
    } else {
      // ...
      for (unsigned I = 0; I != NumValueDWords; ++I)
        LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
      // ...
      B.buildUnmerge(LoadElts, LoadDstReg);
      // ...
      B.buildMergeLikeInstr(Dst, LoadElts);
    }
  // ...
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(Dst, LoadDstReg);
  } else if (Unpacked && IsD16 && Ty.isVector()) {
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    // ...
    auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
    // ...
    for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
      Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
    B.buildMergeLikeInstr(Dst, Repack);
  } else {
    // ...
                    AuxiliaryData, MMO, IsTyped, HasVIndex, B);
  }

  MI.eraseFromParent();
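
// The amdgcn atomic inc/dec intrinsics are rewritten to the generic
// G_ATOMICRMW_UINC_WRAP / G_ATOMICRMW_UDEC_WRAP operations: operand 0 is the
// result, operand 2 the pointer, and operand 3 the value (operand 1 holds the
// intrinsic ID).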
  unsigned Opc = IsInc ? AMDGPU::G_ATOMICRMW_UINC_WRAP :
                         AMDGPU::G_ATOMICRMW_UDEC_WRAP;
  B.buildInstr(Opc)
      .addDef(MI.getOperand(0).getReg())
      .addUse(MI.getOperand(2).getReg())
      .addUse(MI.getOperand(3).getReg())
      .cloneMemRefs(MI);
  MI.eraseFromParent();
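
// Mapping from the raw/struct buffer atomic intrinsics to the corresponding
// target pseudo opcodes. Both addressing forms of each operation share one
// pseudo; whether a vindex is present is encoded in the instruction's operand
// list (see the idxen handling below) rather than in the opcode.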
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
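
// Common legalization for the buffer atomics listed above: an optional result
// def, the value operand (plus a compare value for cmpswap), then rsrc,
// optional vindex, voffset, soffset and the cache-policy immediate, finished
// with the same idxen flag and memory-operand tail as the loads and stores.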
  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
                         IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
  const bool HasReturn = MI.getNumExplicitDefs() != 0;
  // ...
    Dst = MI.getOperand(0).getReg();
  // ...
  Register VData = MI.getOperand(2 + OpOffset).getReg();
  // ...
    CmpVal = MI.getOperand(3 + OpOffset).getReg();
  // ...
  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
  // ...
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  // ...
    VIndex = MI.getOperand(4 + OpOffset).getReg();
  // ...
  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
  // ...
      .addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);

  MI.eraseFromParent();
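
// 16-bit image address packing: gradient operands (when G16) and coordinate
// operands (when A16) are packed pairwise into <2 x s16> registers. Operands
// without a partner -- the last coordinate, certain gradient components when
// the per-direction gradient count is odd, or the A16 bias argument -- are
// completed with an undef s16 lane.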
                                     bool IsA16, bool IsG16) {
  // ...
  auto EndIdx = Intr->VAddrEnd;

  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
    // ...
    if ((I < Intr->GradientStart) ||
        (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
        (I >= Intr->CoordStart && !IsA16)) {
      if ((I < Intr->GradientStart) && IsA16 &&
          (B.getMRI()->getType(AddrReg) == S16)) {
        assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
        // ...
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
        // ...
               "Bias needs to be converted to 16 bit in A16 mode");
      // ...
      AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
      // ...
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // ...
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        // ...
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
        // ...
                V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
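
// The helper below (convertImageAddrToPacked in upstream LLVM, by the look of
// it) gathers the remaining 32-bit address registers into a single
// build_vector, stores that vector in the first address operand, and clears
// the now-unused trailing address operands to AMDGPU::NoRegister.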
                                     int DimIdx, int NumVAddrs) {
  // ...
  for (int I = 0; I != NumVAddrs; ++I) {
    // ...
    if (SrcOp.isReg()) {
      // ...
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // ...
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    // ...
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
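
// Image intrinsic legalization (fragment): the number of explicit defs
// distinguishes TFE forms, the gradient/coordinate register types decide the
// G16 and A16 handling, and a non-atomic load whose dmask is 0 is deleted
// outright after replacing its result with an implicit_def. Stores select
// between the D16 and full-size image-store pseudos.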
  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;
  // ...
  Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
  LLT Ty = MRI->getType(VData);
  // ...
  const LLT GradTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
  const LLT AddrTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
  const bool IsG16 = GradTy == S16;
  const bool IsA16 = AddrTy == S16;
  // ...
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    // ...
    } else if (DMask != 0) {
      // ...
    } else if (!IsTFE && !BaseOpcode->Store) {
      // A dmask of 0 makes the load a no-op: replace the result with an
      // implicit_def and drop the instruction.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      // ...
    }
  }
  // ...
  const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;