#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "si-lower"

    cl::desc("Do not align and prefetch loops"),

    "amdgpu-use-divergent-register-indexing", cl::Hidden,
    cl::desc("Use indirect register addressing for divergent indexes"),
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
      return AMDGPU::SGPR0 + Reg;
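// The two fragments above are from the helper that scans for the first SGPR
// not yet claimed by the calling convention. A minimal sketch of that
// round-robin scan, assuming a CCState-style isAllocated() query (the body
// below is an illustration of the idea, not the exact in-tree code):
static unsigned findFirstFreeSGPRSketch(CCState &CCInfo) {
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg)
    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg))
      return AMDGPU::SGPR0 + Reg;
  llvm_unreachable("Cannot allocate sgpr");
}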
99 TRI->getDefaultVectorSuperClassForBitWidth(32);
105 TRI->getDefaultVectorSuperClassForBitWidth(64);
143 TRI->getDefaultVectorSuperClassForBitWidth(320));
147 TRI->getDefaultVectorSuperClassForBitWidth(352));
151 TRI->getDefaultVectorSuperClassForBitWidth(384));
155 TRI->getDefaultVectorSuperClassForBitWidth(512));
162 TRI->getDefaultVectorSuperClassForBitWidth(1024));
164 if (Subtarget->has16BitInsts()) {
165 if (Subtarget->useRealTrue16Insts()) {
195 TRI->getDefaultVectorSuperClassForBitWidth(1024));
211 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
212 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
213 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
214 MVT::i1, MVT::v32i32},
218 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
219 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
220 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
221 MVT::i1, MVT::v32i32},
                     {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
296 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
297 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
298 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
301 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
302 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
303 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
307 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
308 MVT::v3i16, MVT::v4i16, MVT::Other},
                     {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
329 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
330 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
331 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
332 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
333 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
334 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
335 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
336 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
439 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
440 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
443 if (Subtarget->hasPkMovB32()) {
464 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
465 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
474 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
475 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
476 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
477 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
501 if (Subtarget->hasSMemRealTime() ||
506 if (Subtarget->has16BitInsts()) {
516 if (Subtarget->hasMadMacF32Insts())
534 if (Subtarget->hasIntClamp())
537 if (Subtarget->hasAddNoCarryInsts())
                     {MVT::f32, MVT::f64}, Custom);
                     {MVT::f32, MVT::f64}, Legal);
551 if (Subtarget->haveRoundOpsF64())
581 if (Subtarget->has16BitInsts()) {
634 if (Subtarget->hasBF16TransInsts())
657 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
658 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
659 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
794 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
795 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
796 MVT::v32f16, MVT::v32bf16},
806 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
810 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
814 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
815 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
823 if (Subtarget->hasVOP3PInsts()) {
                       {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
837 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
838 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
839 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
                       {MVT::v2f16, MVT::v4f16}, Custom);
  if (Subtarget->hasBF16PackedInsts()) {
    for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
870 if (Subtarget->hasPackedFP32Ops()) {
874 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
881 if (Subtarget->has16BitInsts()) {
894 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
895 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
896 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
897 MVT::v32f16, MVT::v32bf16},
902 if (Subtarget->hasVectorMulU64())
904 else if (Subtarget->hasScalarSMulU64())
907 if (Subtarget->hasMad64_32())
910 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
913 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
                       {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
918 if (Subtarget->hasMinimum3Maximum3F32())
921 if (Subtarget->hasMinimum3Maximum3PKF16()) {
925 if (!Subtarget->hasMinimum3Maximum3F16())
930 if (Subtarget->hasVOP3PInsts()) {
933 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
937 if (Subtarget->hasIntMinMax64())
942 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
943 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
948 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
949 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
950 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
951 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
955 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
956 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
957 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
958 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
973 if (Subtarget->hasBF16ConversionInsts()) {
978 if (Subtarget->hasBF16PackedInsts()) {
984 if (Subtarget->hasBF16TransInsts()) {
988 if (Subtarget->hasCvtPkF16F32Inst()) {
990 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
1041 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1082 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
                                                  EVT DestVT, EVT SrcVT) const {
          ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
             (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
            (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
                                                  LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
         SrcTy.getScalarSizeInBits() == 16 &&
  return Subtarget->has16BitInsts()
  return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
  if (!Subtarget->has16BitInsts() && VT.getSizeInBits() == 16)
    return (NumElts + 1) / 2;
    return NumElts * ((Size + 31) / 32);
    unsigned &NumIntermediates, MVT &RegisterVT) const {
1188 MVT SimpleIntermediateVT =
1190 IntermediateVT = SimpleIntermediateVT;
1191 RegisterVT = Subtarget->has16BitInsts() ? SimpleIntermediateVT : MVT::i32;
1192 NumIntermediates = (NumElts + 1) / 2;
1193 return (NumElts + 1) / 2;
1198 IntermediateVT = RegisterVT;
1199 NumIntermediates = NumElts;
1200 return NumIntermediates;
1205 RegisterVT = MVT::i16;
1206 IntermediateVT = ScalarVT;
1207 NumIntermediates = NumElts;
1208 return NumIntermediates;
1212 RegisterVT = MVT::i32;
1213 IntermediateVT = ScalarVT;
1214 NumIntermediates = NumElts;
1215 return NumIntermediates;
1219 RegisterVT = MVT::i32;
1220 IntermediateVT = RegisterVT;
    NumIntermediates = NumElts * ((Size + 31) / 32);
1222 return NumIntermediates;
1227 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
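// Worked example of the register-count arithmetic above (illustrative
// numbers only): with 16-bit instructions available, a v5f16 argument has
// NumElts = 5 and is broken into (NumElts + 1) / 2 = 3 v2f16 intermediates,
// one per 32-bit register. For wide scalar elements the (Size + 31) / 32
// form rounds the element's bit width up to whole 32-bit registers, e.g. a
// 96-bit element occupies (96 + 31) / 32 = 3 registers.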
1232 unsigned MaxNumLanes) {
1233 assert(MaxNumLanes != 0);
1237 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1248 unsigned MaxNumLanes) {
1254 assert(ST->getNumContainedTypes() == 2 &&
1255 ST->getContainedType(1)->isIntegerTy(32));
1269 return MVT::amdgpuBufferFatPointer;
1271 DL.getPointerSizeInBits(AS) == 192)
1272 return MVT::amdgpuBufferStridedPointer;
1281 DL.getPointerSizeInBits(AS) == 160) ||
1283 DL.getPointerSizeInBits(AS) == 192))
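// Reading aid for the pointer-width checks above (a note, not new code): a
// buffer fat pointer is a 128-bit buffer resource plus a 32-bit offset,
// which is where the 160-bit width comes from, and a buffer strided pointer
// carries an additional 32-bit index, giving 192 bits.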
1290 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1291 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1292 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1294 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1295 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1296 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1297 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1298 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1299 case Intrinsic::amdgcn_flat_load_monitor_b32:
1300 case Intrinsic::amdgcn_global_load_monitor_b32:
1302 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1303 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1304 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1305 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1306 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1307 case Intrinsic::amdgcn_flat_load_monitor_b64:
1308 case Intrinsic::amdgcn_global_load_monitor_b64:
1310 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1311 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1312 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1313 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1314 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1315 case Intrinsic::amdgcn_flat_load_monitor_b128:
1316 case Intrinsic::amdgcn_global_load_monitor_b128:
                                          unsigned IntrID) const {
  if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1368 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1381 if (RsrcIntr->IsImage) {
1396 Info.ptrVal = RsrcArg;
1400 if (RsrcIntr->IsImage) {
1401 unsigned MaxNumLanes = 4;
1416 std::numeric_limits<unsigned>::max());
1426 if (RsrcIntr->IsImage) {
    if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1448 Info.memVT = MVT::i32;
1455 case Intrinsic::amdgcn_raw_buffer_load_lds:
1456 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
1457 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1458 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
1459 case Intrinsic::amdgcn_struct_buffer_load_lds:
1460 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
1461 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
1462 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
        CI.getContext(), Width * 8 * Subtarget->getWavefrontSize());
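// Worked example for the memVT width above (illustrative numbers): a
// buffer-to-LDS load that moves Width = 4 bytes per lane on a wave64
// subtarget is modelled as touching 4 * 8 * 64 = 2048 bits, i.e. the memory
// footprint of the whole wave's transfer rather than a single lane's.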
1485 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1486 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1487 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1488 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1491 std::numeric_limits<unsigned>::max());
1504 case Intrinsic::amdgcn_ds_ordered_add:
1505 case Intrinsic::amdgcn_ds_ordered_swap: {
1519 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1520 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1523 Info.ptrVal =
nullptr;
1529 case Intrinsic::amdgcn_ds_append:
1530 case Intrinsic::amdgcn_ds_consume: {
1544 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1545 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1546 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1551 Info.memVT = MVT::i64;
1558 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1559 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1560 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1563 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1566 ->getElementType(0));
1575 case Intrinsic::amdgcn_global_atomic_fmin_num:
1576 case Intrinsic::amdgcn_global_atomic_fmax_num:
1577 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1578 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1579 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
1590 case Intrinsic::amdgcn_cluster_load_b32:
1591 case Intrinsic::amdgcn_cluster_load_b64:
1592 case Intrinsic::amdgcn_cluster_load_b128:
1593 case Intrinsic::amdgcn_ds_load_tr6_b96:
1594 case Intrinsic::amdgcn_ds_load_tr4_b64:
1595 case Intrinsic::amdgcn_ds_load_tr8_b64:
1596 case Intrinsic::amdgcn_ds_load_tr16_b128:
1597 case Intrinsic::amdgcn_global_load_tr6_b96:
1598 case Intrinsic::amdgcn_global_load_tr4_b64:
1599 case Intrinsic::amdgcn_global_load_tr_b64:
1600 case Intrinsic::amdgcn_global_load_tr_b128:
1601 case Intrinsic::amdgcn_ds_read_tr4_b64:
1602 case Intrinsic::amdgcn_ds_read_tr6_b96:
1603 case Intrinsic::amdgcn_ds_read_tr8_b64:
1604 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1613 case Intrinsic::amdgcn_flat_load_monitor_b32:
1614 case Intrinsic::amdgcn_flat_load_monitor_b64:
1615 case Intrinsic::amdgcn_flat_load_monitor_b128:
1616 case Intrinsic::amdgcn_global_load_monitor_b32:
1617 case Intrinsic::amdgcn_global_load_monitor_b64:
1618 case Intrinsic::amdgcn_global_load_monitor_b128: {
1629 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1630 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1631 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1642 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1643 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1644 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1655 case Intrinsic::amdgcn_ds_gws_init:
1656 case Intrinsic::amdgcn_ds_gws_barrier:
1657 case Intrinsic::amdgcn_ds_gws_sema_v:
1658 case Intrinsic::amdgcn_ds_gws_sema_br:
1659 case Intrinsic::amdgcn_ds_gws_sema_p:
1660 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1670 Info.memVT = MVT::i32;
1672 Info.align =
Align(4);
1674 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1681 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1682 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1683 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1684 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1685 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1686 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1687 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1688 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1703 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1704 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1705 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1706 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1721 case Intrinsic::amdgcn_load_to_lds:
1722 case Intrinsic::amdgcn_load_async_to_lds:
1723 case Intrinsic::amdgcn_global_load_lds:
1724 case Intrinsic::amdgcn_global_load_async_lds: {
1743 Width * 8 * Subtarget->getWavefrontSize());
1749 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1750 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1751 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1752 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1762 Info.memVT = MVT::i32;
1764 Info.align =
Align(4);
1770 case Intrinsic::amdgcn_s_prefetch_data:
1771 case Intrinsic::amdgcn_flat_prefetch:
1772 case Intrinsic::amdgcn_global_prefetch: {
1788 case Intrinsic::amdgcn_addrspacecast_nonnull: {
    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();
                                       Type *&AccessTy) const {
  Value *Ptr = nullptr;
  switch (II->getIntrinsicID()) {
1807 case Intrinsic::amdgcn_cluster_load_b128:
1808 case Intrinsic::amdgcn_cluster_load_b64:
1809 case Intrinsic::amdgcn_cluster_load_b32:
1810 case Intrinsic::amdgcn_ds_append:
1811 case Intrinsic::amdgcn_ds_consume:
1812 case Intrinsic::amdgcn_ds_load_tr8_b64:
1813 case Intrinsic::amdgcn_ds_load_tr16_b128:
1814 case Intrinsic::amdgcn_ds_load_tr4_b64:
1815 case Intrinsic::amdgcn_ds_load_tr6_b96:
1816 case Intrinsic::amdgcn_ds_read_tr4_b64:
1817 case Intrinsic::amdgcn_ds_read_tr6_b96:
1818 case Intrinsic::amdgcn_ds_read_tr8_b64:
1819 case Intrinsic::amdgcn_ds_read_tr16_b64:
1820 case Intrinsic::amdgcn_ds_ordered_add:
1821 case Intrinsic::amdgcn_ds_ordered_swap:
1822 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1823 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1824 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1825 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1826 case Intrinsic::amdgcn_global_atomic_fmax_num:
1827 case Intrinsic::amdgcn_global_atomic_fmin_num:
1828 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1829 case Intrinsic::amdgcn_global_load_tr_b64:
1830 case Intrinsic::amdgcn_global_load_tr_b128:
1831 case Intrinsic::amdgcn_global_load_tr4_b64:
1832 case Intrinsic::amdgcn_global_load_tr6_b96:
1833 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1834 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1835 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1836 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1837 Ptr =
II->getArgOperand(0);
1839 case Intrinsic::amdgcn_load_to_lds:
1840 case Intrinsic::amdgcn_load_async_to_lds:
1841 case Intrinsic::amdgcn_global_load_lds:
1842 case Intrinsic::amdgcn_global_load_async_lds:
1843 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1844 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1845 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1846 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1847 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1848 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1849 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1850 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
    Ptr = II->getArgOperand(1);
  AccessTy = II->getType();
                                                   unsigned AddrSpace) const {
  if (!Subtarget->hasFlatInstOffsets()) {
  return AM.Scale == 0 &&
         (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
                                  AM.BaseOffs, AddrSpace, FlatVariant));
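// A minimal sketch of the rule encoded by the return above, restated over a
// plain struct so the shape of the check stands out. The type and helper are
// illustrative stand-ins; the real code uses TargetLoweringBase::AddrMode and
// SIInstrInfo::isLegalFLATOffset.
struct FlatAddrModeSketch {
  long long BaseOffs; // immediate offset added to the base register
  long long Scale;    // scaled-index factor; FLAT has no scaled addressing
};
// FLAT accepts only base register + immediate offset: any scaled index is
// rejected, and a nonzero offset must fit the subtarget's FLAT offset range
// (modelled here by the OffsetFits flag).
inline bool isLegalFlatAMSketch(const FlatAddrModeSketch &AM, bool OffsetFits) {
  return AM.Scale == 0 && (AM.BaseOffs == 0 || OffsetFits);
}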
1880 if (Subtarget->hasFlatGlobalInsts())
1883 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1896 return isLegalMUBUFAddressingMode(AM);
bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
  if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1922 if (AM.HasBaseReg) {
1954 return isLegalMUBUFAddressingMode(AM);
    if (!Subtarget->hasScalarSubwordLoads()) {
      if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
    return Subtarget->hasFlatScratchEnabled()
               : isLegalMUBUFAddressingMode(AM);
    unsigned Size, unsigned AddrSpace, Align Alignment,
  if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
2070 Align RequiredAlignment(
    if (Subtarget->hasLDSMisalignedBugInWGPMode() && Size > 32 &&
        Alignment < RequiredAlignment)
    if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
      RequiredAlignment = Align(4);
    if (Subtarget->hasUnalignedDSAccessEnabled()) {
        *IsFast = (Alignment >= RequiredAlignment) ? 64
                  : (Alignment < Align(4))         ? 32
    if (!Subtarget->hasDS96AndDS128())
    if (Subtarget->hasUnalignedDSAccessEnabled()) {
        *IsFast = (Alignment >= RequiredAlignment) ? 96
                  : (Alignment < Align(4))         ? 32
    if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
      RequiredAlignment = Align(8);
    if (Subtarget->hasUnalignedDSAccessEnabled()) {
        *IsFast = (Alignment >= RequiredAlignment) ? 128
                  : (Alignment < Align(4))         ? 32
      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
    return Alignment >= RequiredAlignment ||
           Subtarget->hasUnalignedDSAccessEnabled();
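// Reading aid for the *IsFast values above (an interpretation of the
// fragments, hedged accordingly): when the out-pointer is non-null, the code
// appears to record the widest DS access, in bits, that is expected to be
// fast at the given alignment, i.e. 64/96/128 when the required alignment is
// met, falling back to 32 when the access is aligned below 4 bytes and would
// be split into smaller operations.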
    bool AlignedBy4 = Alignment >= Align(4);
    if (Subtarget->hasUnalignedScratchAccessEnabled()) {
        *IsFast = AlignedBy4 ? Size : 1;
      *IsFast = AlignedBy4;
    return Alignment >= Align(4) ||
           Subtarget->hasUnalignedBufferAccessEnabled();
  if (!Subtarget->hasRelaxedBufferOOBMode() &&
  return Size >= 32 && Alignment >= Align(4);
                                                 unsigned *IsFast) const {
                                              Alignment, Flags, IsFast);
    const AttributeList &FuncAttributes) const {
  if (Op.size() >= 16 &&
  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
                                                  unsigned DestAS) const {
      Subtarget->hasGloballyAddressableScratch()) {
                                                   unsigned Index) const {
2321 unsigned MinAlign = Subtarget->useRealTrue16Insts() ? 16 : 32;
2326 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
  auto [InputPtrReg, RC, ArgTy] =
                                                 const SDLoc &SL) const {
                                                 const SDLoc &SL) const {
2387 std::optional<uint32_t> KnownSize =
2389 if (KnownSize.has_value())
2416 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2431SDValue SITargetLowering::lowerKernargMemParameter(
2436 MachinePointerInfo PtrInfo =
2445 int64_t OffsetDiff =
Offset - AlignDownOffset;
2451 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
                                           const SDLoc &SL) const {
      ExtType, SL, VA.getLocVT(), Chain, FIN,
2553 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2554 if (ConvertedVal == ArgValue)
2555 return ConvertedVal;
2560SDValue SITargetLowering::lowerWorkGroupId(
2565 if (!Subtarget->hasClusters())
2566 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2574 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2575 SDLoc SL(ClusterIdXYZ);
2576 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2579 SDValue ClusterWorkGroupIdXYZ =
2580 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2590 return ClusterIdXYZ;
2592 using namespace AMDGPU::Hwreg;
2596 DAG.
getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2607SDValue SITargetLowering::getPreloadedValue(
  const ArgDescriptor *Reg = nullptr;
2611 const TargetRegisterClass *RC;
2615 const ArgDescriptor WorkGroupIDX =
2623 const ArgDescriptor WorkGroupIDZ =
2625 const ArgDescriptor ClusterWorkGroupIDX =
2627 const ArgDescriptor ClusterWorkGroupIDY =
2629 const ArgDescriptor ClusterWorkGroupIDZ =
2631 const ArgDescriptor ClusterWorkGroupMaxIDX =
2633 const ArgDescriptor ClusterWorkGroupMaxIDY =
2635 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2637 const ArgDescriptor ClusterWorkGroupMaxFlatID =
  auto LoadConstant = [&](unsigned N) {
2644 if (Subtarget->hasArchitectedSGPRs() &&
2651 Reg = &WorkGroupIDX;
2652 RC = &AMDGPU::SReg_32RegClass;
2656 Reg = &WorkGroupIDY;
2657 RC = &AMDGPU::SReg_32RegClass;
2661 Reg = &WorkGroupIDZ;
2662 RC = &AMDGPU::SReg_32RegClass;
    if (HasFixedDims && ClusterDims.getDims()[0] == 1)
      return LoadConstant(0);
2668 Reg = &ClusterWorkGroupIDX;
2669 RC = &AMDGPU::SReg_32RegClass;
2673 if (HasFixedDims && ClusterDims.
getDims()[1] == 1)
2674 return LoadConstant(0);
2675 Reg = &ClusterWorkGroupIDY;
2676 RC = &AMDGPU::SReg_32RegClass;
2680 if (HasFixedDims && ClusterDims.
getDims()[2] == 1)
2681 return LoadConstant(0);
2682 Reg = &ClusterWorkGroupIDZ;
2683 RC = &AMDGPU::SReg_32RegClass;
2688 return LoadConstant(ClusterDims.
getDims()[0] - 1);
2689 Reg = &ClusterWorkGroupMaxIDX;
2690 RC = &AMDGPU::SReg_32RegClass;
2695 return LoadConstant(ClusterDims.
getDims()[1] - 1);
2696 Reg = &ClusterWorkGroupMaxIDY;
2697 RC = &AMDGPU::SReg_32RegClass;
2702 return LoadConstant(ClusterDims.
getDims()[2] - 1);
2703 Reg = &ClusterWorkGroupMaxIDZ;
2704 RC = &AMDGPU::SReg_32RegClass;
2708 Reg = &ClusterWorkGroupMaxFlatID;
2709 RC = &AMDGPU::SReg_32RegClass;
  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
           "vector type argument should have been split");
    bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
             "unexpected vector split in ps argument type");
      Info->markPSInputAllocated(PSInputNum);
        Info->markPSInputEnabled(PSInputNum);
2789 if (Info.hasWorkItemIDX()) {
2795 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2799 if (Info.hasWorkItemIDY()) {
2800 assert(Info.hasWorkItemIDX());
2801 if (Subtarget->hasPackedTID()) {
2802 Info.setWorkItemIDY(
2805 unsigned Reg = AMDGPU::VGPR1;
2813 if (Info.hasWorkItemIDZ()) {
2814 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2815 if (Subtarget->hasPackedTID()) {
2816 Info.setWorkItemIDZ(
2819 unsigned Reg = AMDGPU::VGPR2;
2839 if (RegIdx == ArgVGPRs.
size()) {
2846 unsigned Reg = ArgVGPRs[RegIdx];
2858 unsigned NumArgRegs) {
2861 if (RegIdx == ArgSGPRs.
size())
2864 unsigned Reg = ArgSGPRs[RegIdx];
2906 const unsigned Mask = 0x3ff;
2909 if (Info.hasWorkItemIDX()) {
2911 Info.setWorkItemIDX(Arg);
2914 if (Info.hasWorkItemIDY()) {
2916 Info.setWorkItemIDY(Arg);
2919 if (Info.hasWorkItemIDZ())
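// The 0x3ff masks above match the packed workitem-ID layout used when the
// subtarget carries all three IDs in a single VGPR: 10 bits per dimension.
// A standalone sketch of the unpacking, assuming that layout (the helper is
// illustrative and not part of this file):
static unsigned unpackPackedWorkitemIDSketch(unsigned PackedTID, unsigned Dim) {
  // Dim 0 = X (bits 0..9), 1 = Y (bits 10..19), 2 = Z (bits 20..29).
  return (PackedTID >> (10 * Dim)) & 0x3ff;
}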
2931 const unsigned Mask = 0x3ff;
2940 auto &
ArgInfo = Info.getArgInfo();
2952 if (Info.hasImplicitArgPtr())
2960 if (Info.hasWorkGroupIDX())
2963 if (Info.hasWorkGroupIDY())
2966 if (Info.hasWorkGroupIDZ())
2969 if (Info.hasLDSKernelId())
2980 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(
TRI);
2981 MF.
addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2987 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(
TRI);
2988 MF.
addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2993 Register DispatchPtrReg = Info.addDispatchPtr(
TRI);
2994 MF.
addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
3000 MF.
addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
3006 Register InputPtrReg = Info.addKernargSegmentPtr(
TRI);
3015 MF.
addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
3020 Register FlatScratchInitReg = Info.addFlatScratchInit(
TRI);
3021 MF.
addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
3026 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(
TRI);
3027 MF.
addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
3042 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
  bool InPreloadSequence = true;
  bool AlignedForImplictArgs = false;
  unsigned ImplicitArgOffset = 0;
  for (auto &Arg : F.args()) {
3049 if (!InPreloadSequence || !Arg.hasInRegAttr())
3052 unsigned ArgIdx = Arg.getArgNo();
    if (InIdx < Ins.size() &&
        (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           Ins[InIdx].getOrigArgIndex() == ArgIdx;
3062 assert(ArgLocs[ArgIdx].isMemLoc());
3063 auto &ArgLoc = ArgLocs[InIdx];
3065 unsigned ArgOffset = ArgLoc.getLocMemOffset();
3067 unsigned NumAllocSGPRs =
3068 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
      if (Arg.hasAttribute("amdgpu-hidden-argument")) {
        if (!AlignedForImplictArgs) {
              alignTo(LastExplicitArgOffset,
                      Subtarget->getAlignmentForImplicitArgPtr()) -
              LastExplicitArgOffset;
          AlignedForImplictArgs = true;
        ArgOffset += ImplicitArgOffset;
3083 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
3084 assert(InIdx >= 1 &&
"No previous SGPR");
3085 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
3086 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
3090 unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
        InPreloadSequence = false;
3100 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {
3112 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
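// Worked example of the preload padding arithmetic above (illustrative
// numbers): if the previous explicit argument ended at byte offset 40 and
// the next in-reg argument starts at offset 52, then Padding = 52 - 40 = 12
// and PaddingSGPRs = alignTo(12, 4) / 4 = 3, so three user SGPRs are skipped
// before the argument's own NumAllocSGPRs registers are assigned.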
3121 if (Info.hasLDSKernelId()) {
3122 Register Reg = Info.addLDSKernelId();
3123 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3132 bool IsShader)
const {
3133 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3134 if (Subtarget->hasUserSGPRInit16BugInWave32() && !IsShader) {
3140 assert(!HasArchitectedSGPRs &&
"Unhandled feature for the subtarget");
3142 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3146 unsigned NumRequiredSystemSGPRs =
3147 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3148 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
  for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
    Register Reg = Info.addReservedUserSGPR();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {
      Register Reg = Info.addWorkGroupIDX();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
    if (Info.hasWorkGroupIDY()) {
      Register Reg = Info.addWorkGroupIDY();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
    if (Info.hasWorkGroupIDZ()) {
      Register Reg = Info.addWorkGroupIDZ();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
  if (Info.hasWorkGroupInfo()) {
    Register Reg = Info.addWorkGroupInfo();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
  if (Info.hasPrivateSegmentWaveByteOffset()) {
    unsigned PrivateSegmentWaveByteOffsetReg;
      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
3203 assert(!Subtarget->hasUserSGPRInit16BugInWave32() || IsShader ||
3204 Info.getNumPreloadedSGPRs() >= 16);
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);
    HasStackObjects = true;
  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
  if (!ST.hasFlatScratchEnabled()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
      Info.setScratchRSrcReg(ReservedBufferReg);
  if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
    Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
    for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
        Info.setStackPtrOffsetReg(Reg);
3285 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3292 if (ST.getFrameLowering()->hasFP(MF)) {
3293 Info.setFrameOffsetReg(AMDGPU::SGPR33);
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;
    Entry->addLiveIn(*I);
  for (auto *Exit : Exits)
            TII->get(TargetOpcode::COPY), *I)
  bool IsError = false;
        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3371 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3372 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3374 if (!Subtarget->hasFlatScratchEnabled())
3379 !Subtarget->hasArchitectedSGPRs())
3380 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3381 !Info->hasWorkGroupIDZ());
3384 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3402 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3403 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3406 Info->markPSInputAllocated(0);
3407 Info->markPSInputEnabled(0);
3409 if (Subtarget->isAmdPalOS()) {
3418 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3419 if ((PsInputBits & 0x7F) == 0 ||
3420 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3423 }
else if (IsKernel) {
3424 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3436 if (IsKernel && Subtarget->hasKernargPreload())
3440 }
else if (!IsGraphics) {
3445 if (!Subtarget->hasFlatScratchEnabled())
3457 Info->setNumWaveDispatchSGPRs(
3459 Info->setNumWaveDispatchVGPRs(
3461 }
else if (Info->getNumKernargPreloadedSGPRs()) {
3462 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3467 if (IsWholeWaveFunc) {
3469 {MVT::i1, MVT::Other}, Chain);
3481 for (
unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.
size(), ArgIdx = 0; i != e;
3492 if (IsEntryFunc && VA.
isMemLoc()) {
3515 if (Arg.
isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3519 int64_t OffsetDiff =
Offset - AlignDownOffset;
3526 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3529 Register VReg = MRI.getLiveInVirtReg(Reg);
3537 NewArg = convertArgType(DAG, VT, MemVT,
DL, ArgVal,
3538 Ins[i].Flags.isSExt(), &Ins[i]);
3546 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3549 if (PreloadRegs.
size() == 1) {
3550 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3555 TRI->getRegSizeInBits(*RC)));
3563 for (
auto Reg : PreloadRegs) {
3564 Register VReg = MRI.getLiveInVirtReg(Reg);
3570 PreloadRegs.size()),
3587 NewArg = convertArgType(DAG, VT, MemVT,
DL, NewArg,
3588 Ins[i].Flags.isSExt(), &Ins[i]);
3600 "hidden argument in kernel signature was not preloaded",
3606 lowerKernargMemParameter(DAG, VT, MemVT,
DL, Chain,
Offset,
3607 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3627 if (!IsEntryFunc && VA.
isMemLoc()) {
3628 SDValue Val = lowerStackParameter(DAG, VA,
DL, Chain, Arg);
      if (AMDGPU::VGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::VGPR_32RegClass;
      else if (AMDGPU::SGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::SGPR_32RegClass;
3648 if (Arg.
Flags.
isInReg() && RC == &AMDGPU::VGPR_32RegClass) {
3654 ReadFirstLane, Val);
3670 Val = convertABITypeToValueType(DAG, Val, VA,
DL);
3679 Info->setBytesInStackArgArea(StackArgSize);
3681 return Chains.
empty() ? Chain
3690 const Type *RetTy)
const {
3698 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3703 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3704 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3705 for (
unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3706 if (CCInfo.
isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3729 Info->setIfReturnsVoid(Outs.
empty());
3730 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3749 for (
unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.
size();
I != E;
3750 ++
I, ++RealRVLocIdx) {
3754 SDValue Arg = OutVals[RealRVLocIdx];
3777 ReadFirstLane, Arg);
3784 if (!Info->isEntryFunction()) {
3790 if (AMDGPU::SReg_64RegClass.
contains(*
I))
3792 else if (AMDGPU::SReg_32RegClass.
contains(*
I))
3805 unsigned Opc = AMDGPUISD::ENDPGM;
3807 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3808 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3809 : AMDGPUISD::RET_GLUE;
  const auto [OutgoingArg, ArgRC, ArgTy] =
  const auto [IncomingArg, IncomingArgRC, Ty] =
  assert(IncomingArgRC == ArgRC);
  EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
    InputReg = getImplicitArgPtr(DAG, DL);
3934 std::optional<uint32_t> Id =
3936 if (Id.has_value()) {
  if (OutgoingArg->isRegister()) {
    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
    if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
    unsigned SpecialArgOffset =
3963 auto [OutgoingArg, ArgRC, Ty] =
3966 std::tie(OutgoingArg, ArgRC, Ty) =
3969 std::tie(OutgoingArg, ArgRC, Ty) =
  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
  if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
      NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
      NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
4018 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
4019 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
4030 : IncomingArgY ? *IncomingArgY
4037 if (OutgoingArg->isRegister()) {
4039 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
4065 if (Callee->isDivergent())
4072 const uint32_t *CallerPreserved =
TRI->getCallPreservedMask(MF, CallerCC);
4076 if (!CallerPreserved)
4079 bool CCMatch = CallerCC == CalleeCC;
4092 if (Arg.hasByValAttr())
4106 const uint32_t *CalleePreserved =
TRI->getCallPreservedMask(MF, CalleeCC);
4107 if (!
TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4116 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
  for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
    if (!CCVA.isRegLoc())
    if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
      dbgs() << "Cannot tail call due to divergent outgoing argument in "
4162enum ChainCallArgIdx {
4184 bool UsesDynamicVGPRs =
false;
4185 if (IsChainCallConv) {
4190 auto RequestedExecIt =
4192 return Arg.OrigArgIndex == 2;
4194 assert(RequestedExecIt != CLI.
Outs.end() &&
"No node for EXEC");
4196 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.
Outs.begin();
4199 CLI.
Outs.erase(RequestedExecIt, CLI.
Outs.end());
4202 "Haven't popped all the special args");
4205 CLI.
Args[ChainCallArgIdx::Exec];
4206 if (!RequestedExecArg.
Ty->
isIntegerTy(Subtarget->getWavefrontSize()))
4214 ArgNode->getAPIntValue(),
DL, ArgNode->getValueType(0)));
4216 ChainCallSpecialArgs.
push_back(Arg.Node);
4219 PushNodeOrTargetConstant(RequestedExecArg);
    if (FlagsValue.isZero()) {
      if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
                           "no additional args allowed if flags == 0");
      if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
      if (!Subtarget->isWave32()) {
            CLI, InVals, "dynamic VGPR mode is only supported for wave32");
      UsesDynamicVGPRs = true;
      std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
                    CLI.Args.end(), PushNodeOrTargetConstant);
4250 bool IsSibCall =
false;
4264 "unsupported call to variadic function ");
4272 "unsupported required tail call to function ");
4277 Outs, OutVals, Ins, DAG);
4281 "site marked musttail or on llvm.amdgcn.cs.chain");
4288 if (!TailCallOpt && IsTailCall)
4312 if (!Subtarget->hasFlatScratchEnabled())
4333 auto *
TRI = Subtarget->getRegisterInfo();
4340 if (!IsSibCall || IsChainCallConv) {
4341 if (!Subtarget->hasFlatScratchEnabled()) {
4347 RegsToPass.emplace_back(IsChainCallConv
4348 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4349 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4356 const unsigned NumSpecialInputs = RegsToPass.size();
4358 MVT PtrVT = MVT::i32;
4361 for (
unsigned i = 0, e = ArgLocs.
size(); i != e; ++i) {
4389 RegsToPass.push_back(std::pair(VA.
getLocReg(), Arg));
4397 int32_t
Offset = LocMemOffset;
4404 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4410 ? Flags.getNonZeroByValAlign()
4437 if (Outs[i].Flags.isByVal()) {
4439 DAG.
getConstant(Outs[i].Flags.getByValSize(),
DL, MVT::i32);
4442 Outs[i].Flags.getNonZeroByValAlign(),
4444 nullptr, std::nullopt, DstInfo,
4450 DAG.
getStore(Chain,
DL, Arg, DstAddr, DstInfo, Alignment);
4456 if (!MemOpChains.
empty())
4472 unsigned ArgIdx = 0;
  for (auto [Reg, Val] : RegsToPass) {
    if (ArgIdx++ >= NumSpecialInputs &&
        (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4501 if (IsTailCall && !IsSibCall) {
4506 std::vector<SDValue>
Ops({Chain});
4512 Ops.push_back(Callee);
4529 Ops.push_back(Callee);
4540 if (IsChainCallConv)
4545 for (
auto &[Reg, Val] : RegsToPass)
4549 const uint32_t *Mask =
TRI->getCallPreservedMask(MF, CallConv);
4550 assert(Mask &&
"Missing call preserved mask for calling convention");
4560 MVT::Glue, GlueOps),
4565 Ops.push_back(InGlue);
4571 unsigned OPC = AMDGPUISD::TC_RETURN;
4574 OPC = AMDGPUISD::TC_RETURN_GFX;
4578 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4579 : AMDGPUISD::TC_RETURN_CHAIN;
4585 if (Info->isWholeWaveFunction())
4586 OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;
4593 Chain =
Call.getValue(0);
4594 InGlue =
Call.getValue(1);
4596 uint64_t CalleePopBytes = NumBytes;
4617 EVT VT =
Op.getValueType();
4631 "Stack grows upwards for AMDGPU");
4633 Chain = BaseAddr.getValue(1);
4635 if (Alignment > StackAlign) {
4637 << Subtarget->getWavefrontSizeLog2();
4638 uint64_t StackAlignMask = ScaledAlignment - 1;
  assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
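// The getWavefrontSizeLog2() shifts above convert between per-lane and
// per-wave scratch quantities, since scratch is allocated wave-wide with the
// lanes interleaved. As a worked example (illustrative numbers): on a wave64
// subtarget, growing the stack by 16 bytes per lane moves the wave-level
// stack pointer by 16 << 6 = 1024 bytes, and a requested alignment is scaled
// by the same factor before the AND with the alignment mask.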
4678 if (
Op.getValueType() != MVT::i32)
4697 assert(
Op.getValueType() == MVT::i32);
4706 Op.getOperand(0), IntrinID, GetRoundBothImm);
4740 SDValue RoundModeTimesNumBits =
4760 TableEntry, EnumOffset);
4776 static_cast<uint32_t>(ConstMode->getZExtValue()),
4788 if (UseReducedTable) {
4794 SDValue RoundModeTimesNumBits =
4814 SDValue RoundModeTimesNumBits =
4823 NewMode = TruncTable;
4832 ReadFirstLaneID, NewMode);
4845 IntrinID, RoundBothImm, NewMode);
  if (Op->isDivergent() &&
      (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4862 if (Subtarget->hasSafeSmemPrefetch())
4870 if (!Subtarget->hasSafeSmemPrefetch() && !
Op.getConstantOperandVal(4))
4879 SDValue Src =
Op.getOperand(IsStrict ? 1 : 0);
4880 EVT SrcVT = Src.getValueType();
4889 EVT DstVT =
Op.getValueType();
4898 if (
Op.getValueType() != MVT::i64)
4912 Op.getOperand(0), IntrinID, ModeHwRegImm);
4914 Op.getOperand(0), IntrinID, TrapHwRegImm);
4928 if (
Op.getOperand(1).getValueType() != MVT::i64)
4940 ReadFirstLaneID, NewModeReg);
4942 ReadFirstLaneID, NewTrapReg);
4944 unsigned ModeHwReg =
4947 unsigned TrapHwReg =
4955 IntrinID, ModeHwRegImm, NewModeReg);
4958 IntrinID, TrapHwRegImm, NewTrapReg);
          .Case("m0", AMDGPU::M0)
          .Case("exec", AMDGPU::EXEC)
          .Case("exec_lo", AMDGPU::EXEC_LO)
          .Case("exec_hi", AMDGPU::EXEC_HI)
          .Case("flat_scratch", AMDGPU::FLAT_SCR)
          .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
          .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
    if (!Subtarget->hasFlatScrRegister() &&
        Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
                               "\" for subtarget."));
4986 case AMDGPU::EXEC_LO:
4987 case AMDGPU::EXEC_HI:
4988 case AMDGPU::FLAT_SCR_LO:
4989 case AMDGPU::FLAT_SCR_HI:
4994 case AMDGPU::FLAT_SCR:
5013 MI.setDesc(
TII->getKillTerminatorFromPseudo(
MI.getOpcode()));
5022static std::pair<MachineBasicBlock *, MachineBasicBlock *>
5044 auto Next = std::next(
I);
5055 MBB.addSuccessor(LoopBB);
5057 return std::pair(LoopBB, RemainderBB);
5064 auto I =
MI.getIterator();
5065 auto E = std::next(
I);
5087 Src->setIsKill(
false);
5097 BuildMI(*LoopBB, LoopBB->begin(),
DL,
TII->get(AMDGPU::S_SETREG_IMM32_B32))
5106 BuildMI(*LoopBB,
I,
DL,
TII->get(AMDGPU::S_GETREG_B32), Reg)
                             unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
                             unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
  if (UseGPRIdxMode) {
      SGPRIdxReg = CurrentIdxReg;
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5220 unsigned InitResultReg,
unsigned PhiReg,
int Offset,
5221 bool UseGPRIdxMode,
Register &SGPRIdxReg) {
5229 const auto *BoolXExecRC =
TRI->getWaveMaskRegClass();
5248 InitResultReg, DstReg, PhiReg, TmpExec,
5249 Offset, UseGPRIdxMode, SGPRIdxReg);
5255 LoopBB->removeSuccessor(RemainderBB);
5257 LoopBB->addSuccessor(LandingPad);
static std::pair<unsigned, int>
  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
    return std::pair(AMDGPU::sub0, Offset);
  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  std::tie(SubReg, Offset) =
  const bool UseGPRIdxMode = ST.useVGPRIndexMode();
  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
          .addReg(SrcReg, {}, SubReg)
    MI.eraseFromParent();
                              UseGPRIdxMode, SGPRIdxReg);
  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
        .addReg(SrcReg, {}, SubReg)
  MI.eraseFromParent();
5422 int Offset =
TII->getNamedOperand(
MI, AMDGPU::OpName::offset)->getImm();
5430 std::tie(SubReg,
Offset) =
5432 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5434 if (Idx->
getReg() == AMDGPU::NoRegister) {
5445 MI.eraseFromParent();
5450 if (
TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5454 if (UseGPRIdxMode) {
5458 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
false);
5467 const MCInstrDesc &MovRelDesc =
TII->getIndirectRegWriteMovRelPseudo(
5468 TRI.getRegSizeInBits(*VecRC), 32,
false);
5474 MI.eraseFromParent();
5488 UseGPRIdxMode, SGPRIdxReg);
5491 if (UseGPRIdxMode) {
5493 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
false);
5495 BuildMI(*LoopBB, InsPt,
DL, GPRIDXDesc, Dst)
5501 const MCInstrDesc &MovRelDesc =
TII->getIndirectRegWriteMovRelPseudo(
5502 TRI.getRegSizeInBits(*VecRC), 32,
false);
5503 BuildMI(*LoopBB, InsPt,
DL, MovRelDesc, Dst)
5509 MI.eraseFromParent();
5525 bool IsAdd = (
MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5526 if (ST.hasScalarAddSub64()) {
5527 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5537 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5538 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5541 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5543 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5546 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5548 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5550 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5551 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5560 MI.eraseFromParent();
5574 Register SrcCond =
MI.getOperand(3).getReg();
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
      TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), Src0Idx));
      TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), Src1Idx));
      TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
      TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
      MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
      MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
      MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
      MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
  MI.eraseFromParent();
5630 case AMDGPU::S_MIN_U32:
5631 return std::numeric_limits<uint32_t>::max();
5632 case AMDGPU::S_MIN_I32:
5633 return std::numeric_limits<int32_t>::max();
5634 case AMDGPU::S_MAX_U32:
5635 return std::numeric_limits<uint32_t>::min();
5636 case AMDGPU::S_MAX_I32:
5637 return std::numeric_limits<int32_t>::min();
5638 case AMDGPU::V_ADD_F32_e64:
5640 case AMDGPU::V_SUB_F32_e64:
5642 case AMDGPU::S_ADD_I32:
5643 case AMDGPU::S_SUB_I32:
5644 case AMDGPU::S_OR_B32:
5645 case AMDGPU::S_XOR_B32:
5646 return std::numeric_limits<uint32_t>::min();
5647 case AMDGPU::S_AND_B32:
5648 return std::numeric_limits<uint32_t>::max();
5649 case AMDGPU::V_MIN_F32_e64:
5650 case AMDGPU::V_MAX_F32_e64:
5652 case AMDGPU::V_CMP_LT_U64_e64:
5653 return std::numeric_limits<uint64_t>::max();
5654 case AMDGPU::V_CMP_LT_I64_e64:
5655 return std::numeric_limits<int64_t>::max();
5656 case AMDGPU::V_CMP_GT_U64_e64:
5657 return std::numeric_limits<uint64_t>::min();
5658 case AMDGPU::V_CMP_GT_I64_e64:
5659 return std::numeric_limits<int64_t>::min();
5660 case AMDGPU::V_MIN_F64_e64:
5661 case AMDGPU::V_MAX_F64_e64:
5662 case AMDGPU::V_MIN_NUM_F64_e64:
5663 case AMDGPU::V_MAX_NUM_F64_e64:
5664 return 0x7FF8000000000000;
5665 case AMDGPU::S_ADD_U64_PSEUDO:
5666 case AMDGPU::S_SUB_U64_PSEUDO:
5667 case AMDGPU::S_OR_B64:
5668 case AMDGPU::S_XOR_B64:
5669 return std::numeric_limits<uint64_t>::min();
5670 case AMDGPU::S_AND_B64:
5671 return std::numeric_limits<uint64_t>::max();
5672 case AMDGPU::V_ADD_F64_e64:
5673 case AMDGPU::V_ADD_F64_pseudo_e64:
5674 return 0x8000000000000000;
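// The constants above are the identity (neutral) elements seeded into the
// wave-reduction accumulator, spelled out for the less obvious entries:
// unsigned min starts from UINT_MAX and signed max from INT_MIN, while the
// f64 cases use raw bit patterns: 0x7FF8000000000000 is a quiet NaN (neutral
// for IEEE minnum/maxnum, which return the other operand when one input is
// NaN) and 0x8000000000000000 is -0.0, the additive identity that preserves
// the sign of zero for fadd.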
  return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
         Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
         Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
         Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
         Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
         Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
         Opc == AMDGPU::V_SUB_F32_e64;
  return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
         Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64 ||
         Opc == AMDGPU::V_MIN_F64_e64 || Opc == AMDGPU::V_MAX_F64_e64 ||
         Opc == AMDGPU::V_MIN_NUM_F64_e64 || Opc == AMDGPU::V_MAX_NUM_F64_e64 ||
         Opc == AMDGPU::V_ADD_F64_e64 || Opc == AMDGPU::V_ADD_F64_pseudo_e64;
static std::tuple<unsigned, unsigned>
5702 case AMDGPU::S_MIN_U32:
5703 DPPOpc = AMDGPU::V_MIN_U32_dpp;
5705 case AMDGPU::S_MIN_I32:
5706 DPPOpc = AMDGPU::V_MIN_I32_dpp;
5708 case AMDGPU::S_MAX_U32:
5709 DPPOpc = AMDGPU::V_MAX_U32_dpp;
5711 case AMDGPU::S_MAX_I32:
5712 DPPOpc = AMDGPU::V_MAX_I32_dpp;
5714 case AMDGPU::S_ADD_I32:
5715 case AMDGPU::S_SUB_I32:
5716 DPPOpc = ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_dpp
5717 : AMDGPU::V_ADD_CO_U32_dpp;
5719 case AMDGPU::S_AND_B32:
5720 DPPOpc = AMDGPU::V_AND_B32_dpp;
5722 case AMDGPU::S_OR_B32:
5723 DPPOpc = AMDGPU::V_OR_B32_dpp;
5725 case AMDGPU::S_XOR_B32:
5726 DPPOpc = AMDGPU::V_XOR_B32_dpp;
5728 case AMDGPU::V_ADD_F32_e64:
5729 case AMDGPU::V_SUB_F32_e64:
5730 DPPOpc = AMDGPU::V_ADD_F32_dpp;
5732 case AMDGPU::V_MIN_F32_e64:
5733 DPPOpc = AMDGPU::V_MIN_F32_dpp;
5735 case AMDGPU::V_MAX_F32_e64:
5736 DPPOpc = AMDGPU::V_MAX_F32_dpp;
5738 case AMDGPU::V_CMP_LT_U64_e64:
5739 case AMDGPU::V_CMP_LT_I64_e64:
5740 case AMDGPU::V_CMP_GT_U64_e64:
5741 case AMDGPU::V_CMP_GT_I64_e64:
5742 case AMDGPU::S_ADD_U64_PSEUDO:
5743 case AMDGPU::S_SUB_U64_PSEUDO:
5744 case AMDGPU::S_AND_B64:
5745 case AMDGPU::S_OR_B64:
5746 case AMDGPU::S_XOR_B64:
5747 case AMDGPU::V_MIN_NUM_F64_e64:
5748 case AMDGPU::V_MIN_F64_e64:
5749 case AMDGPU::V_MAX_NUM_F64_e64:
5750 case AMDGPU::V_MAX_F64_e64:
5751 case AMDGPU::V_ADD_F64_pseudo_e64:
5752 case AMDGPU::V_ADD_F64_e64:
5753 DPPOpc = AMDGPU::V_MOV_B64_DPP_PSEUDO;
  unsigned ClampOpc = Opc;
  if (!ST.getInstrInfo()->isVALU(Opc)) {
    if (Opc == AMDGPU::S_SUB_I32)
      ClampOpc = AMDGPU::S_ADD_I32;
    if (Opc == AMDGPU::S_ADD_U64_PSEUDO || Opc == AMDGPU::S_SUB_U64_PSEUDO)
      ClampOpc = AMDGPU::V_ADD_CO_U32_e64;
    else if (Opc == AMDGPU::S_AND_B64)
      ClampOpc = AMDGPU::V_AND_B32_e64;
    else if (Opc == AMDGPU::S_OR_B64)
      ClampOpc = AMDGPU::V_OR_B32_e64;
    else if (Opc == AMDGPU::S_XOR_B64)
      ClampOpc = AMDGPU::V_XOR_B32_e64;
      ClampOpc = ST.getInstrInfo()->getVALUOp(ClampOpc);
  return {DPPOpc, ClampOpc};
static std::pair<Register, Register>
      TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
      TII->buildExtractSubReg(MI, MRI, Op, SrcRC, AMDGPU::sub0, SrcSubRC);
      TII->buildExtractSubReg(MI, MRI, Op, SrcRC, AMDGPU::sub1, SrcSubRC);
  return {Op1L, Op1H};
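// ExtractSubRegs-style helpers like the one ending above peel a 64-bit
// operand into its 32-bit halves so the reduction can work on sub0/sub1
// separately. The same split on a plain integer, as a self-contained sketch
// (illustrative helper, not part of the lowering):
static std::pair<unsigned, unsigned> splitU64Sketch(unsigned long long V) {
  unsigned Lo = static_cast<unsigned>(V & 0xFFFFFFFFULL); // sub0
  unsigned Hi = static_cast<unsigned>(V >> 32);           // sub1
  return {Lo, Hi};
}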
  unsigned Stratergy = static_cast<unsigned>(MI.getOperand(2).getImm());
  enum WAVE_REDUCE_STRATEGY : unsigned { DEFAULT = 0, ITERATIVE = 1, DPP = 2 };
  unsigned MIOpc = MI.getOpcode();
5821 case AMDGPU::S_MIN_U32:
5822 case AMDGPU::S_MIN_I32:
5823 case AMDGPU::V_MIN_F32_e64:
5824 case AMDGPU::S_MAX_U32:
5825 case AMDGPU::S_MAX_I32:
5826 case AMDGPU::V_MAX_F32_e64:
5827 case AMDGPU::S_AND_B32:
5828 case AMDGPU::S_OR_B32: {
5834 case AMDGPU::V_CMP_LT_U64_e64:
5835 case AMDGPU::V_CMP_LT_I64_e64:
5836 case AMDGPU::V_CMP_GT_U64_e64:
5837 case AMDGPU::V_CMP_GT_I64_e64:
5838 case AMDGPU::V_MIN_F64_e64:
5839 case AMDGPU::V_MIN_NUM_F64_e64:
5840 case AMDGPU::V_MAX_F64_e64:
5841 case AMDGPU::V_MAX_NUM_F64_e64:
5842 case AMDGPU::S_AND_B64:
5843 case AMDGPU::S_OR_B64: {
5849 case AMDGPU::S_XOR_B32:
5850 case AMDGPU::S_XOR_B64:
5851 case AMDGPU::S_ADD_I32:
5852 case AMDGPU::S_ADD_U64_PSEUDO:
5853 case AMDGPU::V_ADD_F32_e64:
5854 case AMDGPU::V_ADD_F64_e64:
5855 case AMDGPU::V_ADD_F64_pseudo_e64:
5856 case AMDGPU::S_SUB_I32:
5857 case AMDGPU::S_SUB_U64_PSEUDO:
5858 case AMDGPU::V_SUB_F32_e64: {
  bool IsWave32 = ST.isWave32();
  unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned BitCountOpc =
      IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
  auto NewAccumulator =
    case AMDGPU::S_XOR_B32:
    case AMDGPU::S_XOR_B64: {
          .addReg(NewAccumulator->getOperand(0).getReg())
      if (Opc == AMDGPU::S_XOR_B32) {
        BuildRegSequence(BB, MI, DstReg, DestSub0, DestSub1);
    case AMDGPU::S_SUB_I32: {
          .addReg(NewAccumulator->getOperand(0).getReg());
    case AMDGPU::S_ADD_I32: {
          .addReg(NewAccumulator->getOperand(0).getReg());
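// The S_BCNT1 path above is the shortcut for reductions whose per-lane input
// is uniform: the loop can be skipped because the combined result depends
// only on how many lanes are active. A standalone sketch of that identity
// for an integer add, assuming a 64-bit EXEC mask (illustrative only):
static long long waveUniformAddSketch(long long LaneValue,
                                      unsigned long long ExecMask) {
  // Sum of a uniform value over the active lanes == value * popcount(exec).
  return LaneValue * __builtin_popcountll(ExecMask);
}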
    case AMDGPU::S_ADD_U64_PSEUDO:
    case AMDGPU::S_SUB_U64_PSEUDO: {
      if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
            .addReg(NewAccumulator->getOperand(0).getReg())
      Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
                               : NewAccumulator->getOperand(0).getReg();
      Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
      if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
        BuildRegSequence(BB, MI, DstReg, DestSub0, DestSub1);
    case AMDGPU::V_ADD_F32_e64:
    case AMDGPU::V_ADD_F64_e64:
    case AMDGPU::V_ADD_F64_pseudo_e64:
    case AMDGPU::V_SUB_F32_e64: {
              TII->get(is32BitOpc ? AMDGPU::V_CVT_F32_I32_e64
                                  : AMDGPU::V_CVT_F64_I32_e64),
          .addReg(NewAccumulator->getOperand(0).getReg())
      unsigned srcMod = (MIOpc == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32 ||
                         MIOpc == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64)
      unsigned MulOpc = is32BitOpc ? AMDGPU::V_MUL_F32_e64
                            ? AMDGPU::V_MUL_F64_pseudo_e64
                            : AMDGPU::V_MUL_F64_e64;
        BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
        BuildRegSequence(BB, MI, DstReg, LaneValueLoReg, LaneValueHiReg);
6051 bool NeedsMovDPP = !is32BitOpc;
6056 bool IsWave32 = ST.isWave32();
6057 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
6058 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6059 if (Stratergy == WAVE_REDUCE_STRATEGY::ITERATIVE ||
6085 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
6089 TII->get(is32BitOpc ? AMDGPU::S_MOV_B32
6090 : AMDGPU::S_MOV_B64_IMM_PSEUDO),
  I = ComputeLoop->begin();
  BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
  BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
  I = ComputeLoop->end();
      IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
  BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
  BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32),
          TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
    auto LaneValue = BuildRegSequence(*ComputeLoop, I, LaneValReg,
                                      LaneValueLoReg, LaneValueHiReg);
6166 case AMDGPU::S_OR_B64:
6167 case AMDGPU::S_AND_B64:
6168 case AMDGPU::S_XOR_B64: {
6171 .
addReg(LaneValue->getOperand(0).getReg())
6175 case AMDGPU::V_CMP_GT_I64_e64:
6176 case AMDGPU::V_CMP_GT_U64_e64:
6177 case AMDGPU::V_CMP_LT_I64_e64:
6178 case AMDGPU::V_CMP_LT_U64_e64: {
6183 AMDGPU::getNamedOperandIdx(
MI.getOpcode(), AMDGPU::OpName::src);
6185 TRI->getAllocatableClass(
TII->getRegClass(
MI.getDesc(), SrcIdx));
6189 BuildRegSequence(*ComputeLoop,
I, AccumulatorVReg, SrcReg0Sub0,
6192 .
addReg(LaneValue->getOperand(0).getReg())
6193 .
addReg(AccumulatorVReg);
6195 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6196 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AndOpc), ComparisonResultReg)
6200 NewAccumulator =
BuildMI(*ComputeLoop,
I,
DL,
6201 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
6202 .
addReg(LaneValue->getOperand(0).getReg())
      case AMDGPU::V_MIN_F64_e64:
      case AMDGPU::V_MIN_NUM_F64_e64:
      case AMDGPU::V_MAX_F64_e64:
      case AMDGPU::V_MAX_NUM_F64_e64:
      case AMDGPU::V_ADD_F64_e64:
      case AMDGPU::V_ADD_F64_pseudo_e64: {
            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
            TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::COPY), AccumulatorVReg)
            MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
                .addReg(LaneValue->getOperand(0).getReg())
                    TII->get(AMDGPU::V_READFIRSTLANE_B32), LaneValLo);
                    TII->get(AMDGPU::V_READFIRSTLANE_B32), LaneValHi);
        auto [Op1L, Op1H] =
            ExtractSubRegs(*Iters, DstVregInst->getOperand(0),
        ReadLaneLo.addReg(Op1L);
        ReadLaneHi.addReg(Op1H);
        BuildRegSequence(*ComputeLoop, I, DstReg, LaneValLo, LaneValHi);
      case AMDGPU::S_ADD_U64_PSEUDO:
      case AMDGPU::S_SUB_U64_PSEUDO: {
            .addReg(LaneValue->getOperand(0).getReg());
    unsigned BITSETOpc =
        IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
    BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
    ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
    unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
        .addReg(NewActiveBitsReg)
    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
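    // DPP strategy: starting from the source with inactive lanes set to the
    // operation's identity, the value is combined with row_shr:1/2/4/8 shifted
    // copies of itself, then across rows via row_bcast:15/31 (or a
    // permute-based broadcast on targets without DPP row broadcasts), and the
    // reduced value is read back into an SGPR with v_readlane/readfirstlane.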
    assert(ST.hasDPP() && "Sub Target does not support DPP Operations");
    BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), UndefExec);
            TII->get(is32BitOpc ? AMDGPU::S_MOV_B32
                                : AMDGPU::S_MOV_B64_IMM_PSEUDO),
    auto IdentityCopyInstr =
    unsigned DPPOpc = std::get<0>(DPPClampOpcPair);
    unsigned ClampOpc = std::get<1>(DPPClampOpcPair);
6328 if (isFPOp && !NeedsMovDPP)
6331 if (isFPOp && !NeedsMovDPP)
6335 if (AMDGPU::getNamedOperandIdx(DPPOpc, AMDGPU::OpName::clamp) >= 0)
                                 bool isAddSub = false,
                                 bool needsCarryIn = false,
      unsigned InstrOpc = ClampOpc;
        InstrOpc = AMDGPU::V_ADDC_U32_e64;
      auto ClampInstr = BuildMI(*CurrBB, MI, DL, TII->get(InstrOpc), Dst);
        ClampInstr.addReg(CarryOutReg,
      ClampInstr.addReg(Src0);
      ClampInstr.addReg(Src1);
      if (AMDGPU::getNamedOperandIdx(InstrOpc, AMDGPU::OpName::clamp) >= 0)
        ClampInstr.addImm(0);
      ClampInstr.addImm(0);
      LastBcastInstr = ClampInstr;
                        Opc == AMDGPU::S_ADD_U64_PSEUDO ||
                        Opc == AMDGPU::S_SUB_U64_PSEUDO;
    bool isBitWiseOpc = Opc == AMDGPU::S_AND_B64 || Opc == AMDGPU::S_OR_B64 ||
                        Opc == AMDGPU::S_XOR_B64;
    if (isAddSubOpc || isBitWiseOpc) {
      auto [Src0Lo, Src0Hi] =
      auto [Src1Lo, Src1Hi] =
      Register CarryReg =
          BuildClampInstr(ResLo, Src0Lo, Src1Lo, isAddSubOpc, false);
      BuildClampInstr(ResHi, Src0Hi, Src1Hi, isAddSubOpc, isAddSubOpc,
                      CarryReg);
      BuildRegSequence(*CurrBB, MI, ReturnReg, ResLo, ResHi);
6425 SrcWithIdentityInstr =
6426 BuildSetInactiveInstr(SrcWithIdentity, SrcReg, IdentityVGPR);
6433 MI, IdentityCopyInstr->getOperand(0), SrcRegClass, ST, MRI);
6434 auto [SrcReg0Sub0, SrcReg0Sub1] =
6437 BuildSetInactiveInstr(SrcWithIdentitylo, SrcReg0Sub0, Reg0Sub0);
6439 BuildSetInactiveInstr(SrcWithIdentityhi, SrcReg0Sub1, Reg0Sub1);
6440 SrcWithIdentityInstr =
6441 BuildRegSequence(*CurrBB,
MI, SrcWithIdentity,
6448 BuildDPPMachineInstr(DPPRowShr1, SrcWithIdentityReg,
6451 DPPRowShr1 = BuildPostDPPInstr(SrcWithIdentityReg, DPPRowShr1);
6453 BuildDPPMachineInstr(DPPRowShr2, DPPRowShr1,
6456 DPPRowShr2 = BuildPostDPPInstr(DPPRowShr1, DPPRowShr2);
6458 BuildDPPMachineInstr(DPPRowShr4, DPPRowShr2,
6461 DPPRowShr4 = BuildPostDPPInstr(DPPRowShr2, DPPRowShr4);
6463 BuildDPPMachineInstr(DPPRowShr8, DPPRowShr4,
6466 DPPRowShr8 = BuildPostDPPInstr(DPPRowShr4, DPPRowShr8);
6468 if (ST.hasDPPBroadcasts()) {
6471 RowBcast15 = BuildPostDPPInstr(DPPRowShr8, RowBcast15);
6486 BuildClampInstr(RowBcast15, DPPRowShr8, SwizzledValue);
6507 BuildRegSequence(*CurrBB,
MI, SwizzledValue64, SwizzledValuelo,
6510 RowBcast15 = BuildPostDPPInstr(DPPRowShr8, SwizzledValue64);
6512 BuildClampInstr(RowBcast15, DPPRowShr8, SwizzledValue64);
6515 FinalDPPResult = RowBcast15;
6517 if (ST.hasDPPBroadcasts()) {
6520 RowBcast31 = BuildPostDPPInstr(RowBcast15, RowBcast31);
      BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
      BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
      BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), Lane32Offset)
      BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), WordSizeConst)
          .addReg(ShiftedThreadID);
          .addReg(PermuteByteOffset)
      auto [RowBcast15Lo, RowBcast15Hi] =
          .addReg(PermuteByteOffset)
          .addReg(PermuteByteOffset)
      BuildRegSequence(*CurrBB, MI, PermutedValue, PermutedValuelo,
      RowBcast31 = BuildPostDPPInstr(RowBcast15, PermutedValue);
      BuildClampInstr(RowBcast31, RowBcast15, PermutedValue);
      FinalDPPResult = RowBcast31;
    if (MIOpc == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32 ||
        MIOpc == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64) {
          .addReg(IsWave32 ? RowBcast15 : RowBcast31)
      FinalDPPResult = NegatedValVGPR;
        .addImm(ST.getWavefrontSize() - 1);
        .addImm(ST.getWavefrontSize() - 1);
        .addImm(ST.getWavefrontSize() - 1);
    BuildRegSequence(*CurrBB, MI, ReducedValSGPR, LaneValueLoReg,
    if (Opc == AMDGPU::S_SUB_I32) {
      BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedReducedVal)
    } else if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
      auto NegatedValInstr =
          .addReg(Opc == AMDGPU::S_SUB_I32 || Opc == AMDGPU::S_SUB_U64_PSEUDO
    MI.eraseFromParent();
6674 switch (
MI.getOpcode()) {
6675 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
6677 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
6679 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
6681 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
6683 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
6685 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F64:
6688 ? AMDGPU::V_MIN_NUM_F64_e64
6689 : AMDGPU::V_MIN_F64_e64);
6690 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
6692 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
6694 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
6696 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
6698 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
6700 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F64:
6703 ? AMDGPU::V_MAX_NUM_F64_e64
6704 : AMDGPU::V_MAX_F64_e64);
6705 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
6707 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
6709 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
6711 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F64:
6714 ? AMDGPU::V_ADD_F64_pseudo_e64
6715 : AMDGPU::V_ADD_F64_e64);
6716 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
6718 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
6720 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
6722 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64:
6727 ? AMDGPU::V_ADD_F64_pseudo_e64
6728 : AMDGPU::V_ADD_F64_e64);
6729 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
6731 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
6733 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
6735 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
6737 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
6739 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
  case AMDGPU::S_UADDO_PSEUDO:
  case AMDGPU::S_USUBO_PSEUDO: {
    unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
                       ? AMDGPU::S_ADD_U32
                       : AMDGPU::S_SUB_U32;
        Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
    MI.eraseFromParent();
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO: {
  case AMDGPU::V_ADD_U64_PSEUDO:
  case AMDGPU::V_SUB_U64_PSEUDO: {
    bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
    if (ST.hasAddSubU64Insts()) {
                  TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
                                 : AMDGPU::V_SUB_U64_e64),
      TII->legalizeOperands(*I);
      MI.eraseFromParent();
    if (IsAdd && ST.hasLshlAddU64Inst()) {
      TII->legalizeOperands(*Add);
      MI.eraseFromParent();
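    // Fallback when no native 64-bit VALU add/sub exists: split the operands
    // into sub0/sub1 halves, add the low halves with V_ADD_CO_U32 /
    // V_SUB_CO_U32 (producing a carry), consume the carry in the high halves
    // with V_ADDC_U32 / V_SUBB_U32, and recombine the result with a
    // REG_SEQUENCE.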
6800 const auto *CarryRC =
TRI->getWaveMaskRegClass();
6810 : &AMDGPU::VReg_64RegClass;
6813 : &AMDGPU::VReg_64RegClass;
6816 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6818 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6821 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6823 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6826 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6828 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6831 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6838 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6852 TII->legalizeOperands(*LoHalf);
6853 TII->legalizeOperands(*HiHalf);
6854 MI.eraseFromParent();
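    // S_ADD_CO_PSEUDO / S_SUB_CO_PSEUDO: operands held in vector registers are
    // first made uniform with V_READFIRSTLANE_B32, the incoming carry is
    // materialized into SCC (OR-ing the two halves on wave64 when no 64-bit
    // scalar compare is available), and the result comes from S_ADDC_U32 /
    // S_SUBB_U32 with the carry-out recovered via S_CSELECT.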
  case AMDGPU::S_ADD_CO_PSEUDO:
  case AMDGPU::S_SUB_CO_PSEUDO: {
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
    if (TRI->isVectorRegister(MRI, Src2.getReg())) {
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
    if (ST.isWave64()) {
      if (ST.hasScalarCompareEq64()) {
            TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
            MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
            MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
    unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
                       ? AMDGPU::S_ADDC_U32
                       : AMDGPU::S_SUBB_U32;
        ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
    MI.eraseFromParent();
6932 case AMDGPU::SI_INIT_M0: {
6935 TII->get(M0Init.
isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6938 MI.eraseFromParent();
6941 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6944 TII->get(AMDGPU::S_CMP_EQ_U32))
6949 case AMDGPU::GET_GROUPSTATICSIZE: {
6953 .
add(
MI.getOperand(0))
6955 MI.eraseFromParent();
6958 case AMDGPU::GET_SHADERCYCLESHILO: {
6973 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6976 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6979 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6988 .
add(
MI.getOperand(0))
6993 MI.eraseFromParent();
6996 case AMDGPU::SI_INDIRECT_SRC_V1:
6997 case AMDGPU::SI_INDIRECT_SRC_V2:
6998 case AMDGPU::SI_INDIRECT_SRC_V3:
6999 case AMDGPU::SI_INDIRECT_SRC_V4:
7000 case AMDGPU::SI_INDIRECT_SRC_V5:
7001 case AMDGPU::SI_INDIRECT_SRC_V6:
7002 case AMDGPU::SI_INDIRECT_SRC_V7:
7003 case AMDGPU::SI_INDIRECT_SRC_V8:
7004 case AMDGPU::SI_INDIRECT_SRC_V9:
7005 case AMDGPU::SI_INDIRECT_SRC_V10:
7006 case AMDGPU::SI_INDIRECT_SRC_V11:
7007 case AMDGPU::SI_INDIRECT_SRC_V12:
7008 case AMDGPU::SI_INDIRECT_SRC_V16:
7009 case AMDGPU::SI_INDIRECT_SRC_V32:
7011 case AMDGPU::SI_INDIRECT_DST_V1:
7012 case AMDGPU::SI_INDIRECT_DST_V2:
7013 case AMDGPU::SI_INDIRECT_DST_V3:
7014 case AMDGPU::SI_INDIRECT_DST_V4:
7015 case AMDGPU::SI_INDIRECT_DST_V5:
7016 case AMDGPU::SI_INDIRECT_DST_V6:
7017 case AMDGPU::SI_INDIRECT_DST_V7:
7018 case AMDGPU::SI_INDIRECT_DST_V8:
7019 case AMDGPU::SI_INDIRECT_DST_V9:
7020 case AMDGPU::SI_INDIRECT_DST_V10:
7021 case AMDGPU::SI_INDIRECT_DST_V11:
7022 case AMDGPU::SI_INDIRECT_DST_V12:
7023 case AMDGPU::SI_INDIRECT_DST_V16:
7024 case AMDGPU::SI_INDIRECT_DST_V32:
7026 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
7027 case AMDGPU::SI_KILL_I1_PSEUDO:
7029 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
7033 case AMDGPU::SI_BR_UNDEF: {
7035 .
add(
MI.getOperand(0));
7037 MI.eraseFromParent();
7040 case AMDGPU::ADJCALLSTACKUP:
7041 case AMDGPU::ADJCALLSTACKDOWN: {
  case AMDGPU::SI_CALL_ISEL: {
    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
    MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
    MI.eraseFromParent();
  case AMDGPU::V_ADD_CO_U32_e32:
  case AMDGPU::V_SUB_CO_U32_e32:
  case AMDGPU::V_SUBREV_CO_U32_e32: {
    unsigned Opc = MI.getOpcode();
    bool NeedClampOperand = false;
    if (TII->pseudoToMCOpcode(Opc) == -1) {
      NeedClampOperand = true;
    if (TII->isVOP3(*I)) {
      I.add(MI.getOperand(1)).add(MI.getOperand(2));
      if (NeedClampOperand)
    TII->legalizeOperands(*I);
    MI.eraseFromParent();
7086 case AMDGPU::V_ADDC_U32_e32:
7087 case AMDGPU::V_SUBB_U32_e32:
7088 case AMDGPU::V_SUBBREV_U32_e32:
7091 TII->legalizeOperands(
MI);
7093 case AMDGPU::DS_GWS_INIT:
7094 case AMDGPU::DS_GWS_SEMA_BR:
7095 case AMDGPU::DS_GWS_BARRIER:
7096 case AMDGPU::DS_GWS_SEMA_V:
7097 case AMDGPU::DS_GWS_SEMA_P:
7098 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
7106 case AMDGPU::S_SETREG_B32: {
7122 const unsigned SetMask = WidthMask <<
Offset;
7125 unsigned SetDenormOp = 0;
7126 unsigned SetRoundOp = 0;
7134 SetRoundOp = AMDGPU::S_ROUND_MODE;
7135 SetDenormOp = AMDGPU::S_DENORM_MODE;
7137 SetRoundOp = AMDGPU::S_ROUND_MODE;
7139 SetDenormOp = AMDGPU::S_DENORM_MODE;
7142 if (SetRoundOp || SetDenormOp) {
7144 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
7145 unsigned ImmVal = Def->getOperand(1).getImm();
7159 MI.eraseFromParent();
7168 MI.setDesc(
TII->get(AMDGPU::S_SETREG_B32_mode));
7172 case AMDGPU::S_INVERSE_BALLOT_U32:
7173 case AMDGPU::S_INVERSE_BALLOT_U64:
7176 MI.setDesc(
TII->get(AMDGPU::COPY));
7178 case AMDGPU::ENDPGM_TRAP: {
7180 MI.setDesc(
TII->get(AMDGPU::S_ENDPGM));
7200 MI.eraseFromParent();
7203 case AMDGPU::SIMULATED_TRAP: {
7204 assert(Subtarget->hasPrivEnabledTrap2NopBug());
7206 TII->insertSimulatedTrap(MRI, *BB,
MI,
MI.getDebugLoc());
7207 MI.eraseFromParent();
7210 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
7211 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
7217 assert(Setup &&
"Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
7218 Register OriginalExec = Setup->getOperand(0).getReg();
7220 MI.getOperand(0).setReg(OriginalExec);
7257 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
7261 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
7288 if (!Subtarget->hasMadMacF32Insts())
7289 return Subtarget->hasFastFMAF32();
7295 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
7298 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
7314 switch (Ty.getScalarSizeInBits()) {
7332 if (Ty.getScalarSizeInBits() == 16)
7334 if (Ty.getScalarSizeInBits() == 32)
7335 return Subtarget->hasMadMacF32Insts() &&
7345 EVT VT =
N->getValueType(0);
7347 return Subtarget->hasMadMacF32Insts() &&
7349 if (VT == MVT::f16) {
7350 return Subtarget->hasMadF16() &&
7365 unsigned Opc =
Op.getOpcode();
7366 EVT VT =
Op.getValueType();
7367 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
7368 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
7369 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
7370 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
7371 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
7372 VT == MVT::v32bf16);
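  // The split*VectorOp helpers lower wide vector operations by breaking the
  // operands into low/high halves, emitting the same opcode on each half, and
  // concatenating the two partial results back into the original vector type.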
7388 [[maybe_unused]]
EVT VT =
Op.getValueType();
7390 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
7391 VT == MVT::v16i32) &&
7392 "Unexpected ValueType.");
7401 unsigned Opc =
Op.getOpcode();
7402 EVT VT =
Op.getValueType();
7403 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
7404 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
7405 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
7406 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
7407 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
7408 VT == MVT::v32bf16);
7416 DAG.
getNode(
Opc, SL, Lo0.getValueType(), Lo0, Lo1,
Op->getFlags());
7418 DAG.
getNode(
Opc, SL, Hi0.getValueType(), Hi0, Hi1,
Op->getFlags());
7425 unsigned Opc =
Op.getOpcode();
7426 EVT VT =
Op.getValueType();
7427 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
7428 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
7429 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
7430 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
7431 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
7432 VT == MVT::v32bf16);
7437 : std::pair(Op0, Op0);
7446 DAG.
getNode(
Opc, SL, ResVT.first, Lo0, Lo1, Lo2,
Op->getFlags());
7448 DAG.
getNode(
Opc, SL, ResVT.second, Hi0, Hi1, Hi2,
Op->getFlags());
  switch (Op.getOpcode()) {
    return LowerBRCOND(Op, DAG);
    return LowerRETURNADDR(Op, DAG);
    return LowerSPONENTRY(Op, DAG);
    assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    EVT VT = Op.getValueType();
      return lowerFSQRTF32(Op, DAG);
      return lowerFSQRTF64(Op, DAG);
    return LowerTrig(Op, DAG);
    return LowerSELECT(Op, DAG);
    return LowerFDIV(Op, DAG);
    return LowerFFREXP(Op, DAG);
    return LowerATOMIC_CMP_SWAP(Op, DAG);
    return LowerSTORE(Op, DAG);
    return LowerGlobalAddress(MFI, Op, DAG);
    return LowerExternalSymbol(Op, DAG);
    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
    return LowerINTRINSIC_W_CHAIN(Op, DAG);
    return LowerINTRINSIC_VOID(Op, DAG);
    return lowerADDRSPACECAST(Op, DAG);
    return lowerINSERT_SUBVECTOR(Op, DAG);
    return lowerINSERT_VECTOR_ELT(Op, DAG);
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
    return lowerVECTOR_SHUFFLE(Op, DAG);
    return lowerSCALAR_TO_VECTOR(Op, DAG);
    return lowerBUILD_VECTOR(Op, DAG);
    return lowerFP_ROUND(Op, DAG);
    return lowerTRAP(Op, DAG);
    return lowerDEBUGTRAP(Op, DAG);
    return lowerFMINNUM_FMAXNUM(Op, DAG);
    return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
    return lowerFMINIMUM_FMAXIMUM(Op, DAG);
    return lowerFLDEXP(Op, DAG);
        Op.getValueType() == MVT::i16 &&
        Op.getOperand(0).getValueType() == MVT::f32) {
    return lowerFCOPYSIGN(Op, DAG);
    return lowerMUL(Op, DAG);
    return lowerXMULO(Op, DAG);
    return lowerXMUL_LOHI(Op, DAG);
7615 EVT FittingLoadVT = LoadVT;
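// adjustLoadValueType rewrites a D16 / small-element memory intrinsic so that
// it loads an equivalent legal type (widened when the subtarget uses unpacked
// D16 memory instructions), then converts the loaded value back to the type
// the caller requested.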
7647SDValue SITargetLowering::adjustLoadValueType(
unsigned Opcode,
MemSDNode *M,
7650 bool IsIntrinsic)
const {
7653 bool Unpacked = Subtarget->hasUnpackedD16VMem();
7654 EVT LoadVT =
M->getValueType(0);
7656 EVT EquivLoadVT = LoadVT;
7670 SDVTList VTList = DAG.
getVTList(EquivLoadVT, MVT::Other);
7674 M->getMemoryVT(),
M->getMemOperand());
7685 EVT LoadVT =
M->getValueType(0);
7691 assert(
M->getNumValues() == 2 ||
M->getNumValues() == 3);
7692 bool IsTFE =
M->getNumValues() == 3;
7694 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
7695 : AMDGPUISD::BUFFER_LOAD_FORMAT)
7696 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
7697 : AMDGPUISD::BUFFER_LOAD;
7700 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG,
Ops);
7705 return handleByteShortBufferLoads(DAG, LoadVT,
DL,
Ops,
M->getMemOperand(),
7709 return getMemIntrinsicNode(
Opc,
DL,
M->getVTList(),
Ops, IntVT,
7710 M->getMemOperand(), DAG);
7714 SDVTList VTList = DAG.
getVTList(CastVT, MVT::Other);
7716 M->getMemOperand(), DAG);
7724 EVT VT =
N->getValueType(0);
7725 unsigned CondCode =
N->getConstantOperandVal(3);
7736 EVT CmpVT =
LHS.getValueType();
7737 if (CmpVT == MVT::i16 && !TLI.
isTypeLegal(MVT::i16)) {
7738 unsigned PromoteOp =
7758 EVT VT =
N->getValueType(0);
7760 unsigned CondCode =
N->getConstantOperandVal(3);
7769 if (CmpVT == MVT::f16 && !TLI.
isTypeLegal(CmpVT)) {
7778 SDValue SetCC = DAG.
getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7787 EVT VT =
N->getValueType(0);
7811 Exec = AMDGPU::EXEC_LO;
7813 Exec = AMDGPU::EXEC;
7830 EVT VT =
N->getValueType(0);
7832 unsigned IID =
N->getConstantOperandVal(0);
7833 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7834 IID == Intrinsic::amdgcn_permlanex16;
7835 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7836 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7840 unsigned SplitSize = 32;
7841 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7842 ST->hasDPALU_DPP() &&
7850 case Intrinsic::amdgcn_permlane16:
7851 case Intrinsic::amdgcn_permlanex16:
7852 case Intrinsic::amdgcn_update_dpp:
7857 case Intrinsic::amdgcn_writelane:
7860 case Intrinsic::amdgcn_readlane:
7861 case Intrinsic::amdgcn_set_inactive:
7862 case Intrinsic::amdgcn_set_inactive_chain_arg:
7863 case Intrinsic::amdgcn_mov_dpp8:
7866 case Intrinsic::amdgcn_readfirstlane:
7867 case Intrinsic::amdgcn_permlane64:
    std::reverse(Operands.begin(), Operands.end());
  if (SDNode *GL = N->getGluedNode()) {
    GL = GL->getOperand(0).getNode();
  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IID == Intrinsic::amdgcn_mov_dpp8 ||
      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    Src1 = N->getOperand(2);
    if (IID == Intrinsic::amdgcn_writelane ||
        IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
      Src2 = N->getOperand(3);
  if (ValSize == SplitSize) {
    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    if (IID == Intrinsic::amdgcn_writelane) {
    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
    return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
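  // Values wider than the lane-op split size (32 bits, or 64 bits when
  // update_dpp can use the 64-bit DP ALU) are handled piecewise: the sources
  // are bitcast to vectors of split-size elements, the lane operation is
  // applied to each piece, and the pieces are reassembled into the original
  // type.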
7923 if (ValSize % SplitSize != 0)
7927 EVT VT =
N->getValueType(0);
7931 unsigned NumOperands =
N->getNumOperands();
7933 SDNode *GL =
N->getGluedNode();
7938 for (
unsigned i = 0; i != NE; ++i) {
7939 for (
unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7941 SDValue Operand =
N->getOperand(j);
7950 Operands[j] = Operand;
7955 Operands[NumOperands - 1] =
7971 if (SplitSize == 32) {
7973 return unrollLaneOp(LaneOp.
getNode());
7979 unsigned SubVecNumElt =
7983 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7984 for (
unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7988 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7993 if (IID == Intrinsic::amdgcn_writelane)
7998 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7999 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
8000 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
8001 EltIdx += SubVecNumElt;
8015 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
8018 if (IID == Intrinsic::amdgcn_writelane)
8021 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
8028 EVT VT =
N->getValueType(0);
8046 auto MakeIntrinsic = [&DAG, &SL](
unsigned IID,
MVT RetVT,
8050 Operands.
append(IntrinArgs);
8056 SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
8057 {ShiftedIndex, ValueI32});
8067 SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
8068 {ValueI32, PoisonVal});
8069 SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
8070 {ShiftedIndex, PoisonVal});
8073 MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});
8076 SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
8077 {WWMIndex, WWMValue});
8078 SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
8079 MVT::i32, {WWMIndex, Swapped});
8081 MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});
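  // ds_bpermute only moves data within a 32-lane half of a wave64, so the
  // shuffle is assembled from two candidates: a bpermute of the value within
  // the same half, and a bpermute of the permlane64-swapped value for the
  // other half. set_inactive/WWM keep these helper values live in inactive
  // lanes, and the final result selects between the two based on which half
  // the source lane index falls into.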
8089 MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
8097 DAG.
getSetCC(SL, MVT::i1, SameOrOtherHalf,
8107 switch (
N->getOpcode()) {
8119 unsigned IID =
N->getConstantOperandVal(0);
8121 case Intrinsic::amdgcn_make_buffer_rsrc:
8122 Results.push_back(lowerPointerAsRsrcIntrin(
N, DAG));
8124 case Intrinsic::amdgcn_cvt_pkrtz: {
8129 DAG.
getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
8133 case Intrinsic::amdgcn_cvt_pknorm_i16:
8134 case Intrinsic::amdgcn_cvt_pknorm_u16:
8135 case Intrinsic::amdgcn_cvt_pk_i16:
8136 case Intrinsic::amdgcn_cvt_pk_u16: {
8142 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
8143 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
8144 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
8145 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
8146 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
8147 Opcode = AMDGPUISD::CVT_PK_I16_I32;
8149 Opcode = AMDGPUISD::CVT_PK_U16_U32;
8151 EVT VT =
N->getValueType(0);
8160 case Intrinsic::amdgcn_s_buffer_load: {
8166 if (!Subtarget->hasScalarSubwordLoads())
8172 EVT VT =
Op.getValueType();
8173 assert(VT == MVT::i8 &&
"Expected 8-bit s_buffer_load intrinsics.\n");
8185 if (!
Offset->isDivergent()) {
8204 LoadVal = handleByteShortBufferLoads(DAG, VT,
DL,
Ops, MMO);
8209 case Intrinsic::amdgcn_dead: {
8210 for (
unsigned I = 0, E =
N->getNumValues();
I < E; ++
I)
8221 for (
unsigned I = 0;
I < Res.getNumOperands();
I++) {
8222 Results.push_back(Res.getOperand(
I));
8226 Results.push_back(Res.getValue(1));
8235 EVT VT =
N->getValueType(0);
8240 EVT SelectVT = NewVT;
8241 if (NewVT.
bitsLT(MVT::i32)) {
8244 SelectVT = MVT::i32;
8250 if (NewVT != SelectVT)
8256 if (
N->getValueType(0) != MVT::v2f16)
8268 if (
N->getValueType(0) != MVT::v2f16)
8280 if (
N->getValueType(0) != MVT::f16)
8295 if (U.get() !=
Value)
8298 if (U.getUser()->getOpcode() == Opcode)
8304unsigned SITargetLowering::isCFIntrinsic(
const SDNode *Intr)
const {
8307 case Intrinsic::amdgcn_if:
8308 return AMDGPUISD::IF;
8309 case Intrinsic::amdgcn_else:
8310 return AMDGPUISD::ELSE;
8311 case Intrinsic::amdgcn_loop:
8312 return AMDGPUISD::LOOP;
8313 case Intrinsic::amdgcn_end_cf:
8333 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
8360 SDNode *Intr = BRCOND.getOperand(1).getNode();
8377 Intr =
LHS.getNode();
8385 assert(BR &&
"brcond missing unconditional branch user");
8390 unsigned CFNode = isCFIntrinsic(Intr);
8410 Ops.push_back(Target);
8433 for (
unsigned i = 1, e = Intr->
getNumValues() - 1; i != e; ++i) {
8452 MVT VT =
Op.getSimpleValueType();
8455 if (
Op.getConstantOperandVal(0) != 0)
8459 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8461 if (
Info->isEntryFunction())
8478 SIMachineFunctionInfo *MFI = MF.
getInfo<SIMachineFunctionInfo>();
8492 return Op.getValueType().bitsLE(VT)
8500 EVT DstVT =
Op.getValueType();
8507 unsigned Opc =
Op.getOpcode();
8519 EVT SrcVT = Src.getValueType();
8520 EVT DstVT =
Op.getValueType();
8523 assert(Subtarget->hasCvtPkF16F32Inst() &&
"support v_cvt_pk_f16_f32");
8526 return SrcVT == MVT::v2f32 ?
Op : splitFP_ROUNDVectorOp(
Op, DAG);
8533 if (DstVT == MVT::f16) {
8538 if (!Subtarget->has16BitInsts()) {
8543 if (
Op->getFlags().hasApproximateFuncs()) {
8554 "custom lower FP_ROUND for f16 or bf16");
8555 assert(Subtarget->hasBF16ConversionInsts() &&
"f32 -> bf16 is legal");
8567 EVT VT =
Op.getValueType();
8569 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8570 bool IsIEEEMode =
Info->getMode().IEEE;
8579 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8586SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(
SDValue Op,
8588 EVT VT =
Op.getValueType();
8590 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8591 bool IsIEEEMode =
Info->getMode().IEEE;
8596 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8604 EVT VT =
Op.getValueType();
8608 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
8609 !Subtarget->hasMinimum3Maximum3F16() &&
8610 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
8611 "should not need to widen f16 minimum/maximum to v2f16");
8625 DAG.
getNode(
Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
8633 EVT VT =
Op.getValueType();
8637 EVT ExpVT =
Exp.getValueType();
8638 if (ExpVT == MVT::i16)
8659 {
Op.getOperand(0),
Op.getOperand(1), TruncExp});
8666 switch (
Op->getOpcode()) {
8696 DAGCombinerInfo &DCI)
const {
8697 const unsigned Opc =
Op.getOpcode();
8705 :
Op->getOperand(0).getValueType();
8706 auto &DAG = DCI.DAG;
8709 if (DCI.isBeforeLegalizeOps() ||
8717 LHS =
Op->getOperand(1);
8718 RHS =
Op->getOperand(2);
8720 LHS =
Op->getOperand(0);
8721 RHS =
Op->getOperand(1);
8760 if (MagVT == SignVT)
8777 EVT VT =
Op.getValueType();
8783 assert(VT == MVT::i64 &&
"The following code is a special for s_mul_u64");
8810 if (
Op->isDivergent())
8823 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
8825 DAG.
getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
8828 if (Op0SignBits >= 33 && Op1SignBits >= 33)
8830 DAG.
getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
8836 EVT VT =
Op.getValueType();
8843 const APInt &
C = RHSC->getAPIntValue();
8845 if (
C.isPowerOf2()) {
8847 bool UseArithShift = isSigned && !
C.isMinSignedValue();
8874 if (
Op->isDivergent()) {
8878 if (Subtarget->hasSMulHi()) {
8889 if (!Subtarget->hasTrapHandler() ||
8891 return lowerTrapEndpgm(
Op, DAG);
8893 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(
Op, DAG)
8894 : lowerTrapHsaQueuePtr(
Op, DAG);
8900 return DAG.
getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8904SITargetLowering::loadImplicitKernelArgument(
SelectionDAG &DAG,
MVT VT,
8906 ImplicitParameter Param)
const {
8910 MachinePointerInfo PtrInfo =
8927 loadImplicitKernelArgument(DAG, MVT::i64, SL,
Align(8),
QUEUE_PTR);
8930 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8933 if (UserSGPR == AMDGPU::NoRegister) {
8950 return DAG.
getNode(AMDGPUISD::TRAP, SL, MVT::Other,
Ops);
8959 if (Subtarget->hasPrivEnabledTrap2NopBug())
8960 return DAG.
getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
8964 return DAG.
getNode(AMDGPUISD::TRAP, SL, MVT::Other,
Ops);
8972 if (!Subtarget->hasTrapHandler() ||
8976 "debugtrap handler not supported",
8984 return DAG.
getNode(AMDGPUISD::TRAP, SL, MVT::Other,
Ops);
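// getSegmentAperture produces the high 32 bits of the LDS or private (scratch)
// aperture: read directly from src_shared_base / src_private_base when the
// subtarget has aperture registers, otherwise loaded from the implicit kernel
// argument or from the queue pointer via the user SGPR.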
8987SDValue SITargetLowering::getSegmentAperture(
unsigned AS,
const SDLoc &
DL,
8989 if (Subtarget->hasApertureRegs()) {
8991 ? AMDGPU::SRC_SHARED_BASE
8992 : AMDGPU::SRC_PRIVATE_BASE;
8993 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8994 !Subtarget->hasGloballyAddressableScratch()) &&
8995 "Cannot use src_private_base with globally addressable scratch!");
9016 return loadImplicitKernelArgument(DAG, MVT::i32,
DL,
Align(4), Param);
9020 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
9022 if (UserSGPR == AMDGPU::NoRegister) {
9067 const AMDGPUTargetMachine &TM =
9070 unsigned DestAS, SrcAS;
9072 bool IsNonNull =
false;
9074 SrcAS = ASC->getSrcAddressSpace();
9075 Src = ASC->getOperand(0);
9076 DestAS = ASC->getDestAddressSpace();
9079 Op.getConstantOperandVal(0) ==
9080 Intrinsic::amdgcn_addrspacecast_nonnull);
9081 Src =
Op->getOperand(1);
9082 SrcAS =
Op->getConstantOperandVal(2);
9083 DestAS =
Op->getConstantOperandVal(3);
9096 Subtarget->hasGloballyAddressableScratch()) {
9101 AMDGPU::S_MOV_B32, SL, MVT::i32,
9102 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
9125 Subtarget->hasGloballyAddressableScratch()) {
9134 if (Subtarget->isWave64())
9140 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
9148 AMDGPU::S_MOV_B64, SL, MVT::i64,
9149 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
9151 CvtPtr = DAG.
getNode(
ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
9153 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
9173 Op.getValueType() == MVT::i64) {
9174 const SIMachineFunctionInfo *
Info =
9176 if (
Info->get32BitAddressHighBits() == 0)
9185 Src.getValueType() == MVT::i64)
9213 assert(InsNumElts % 2 == 0 &&
"expect legal vector types");
9218 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
9220 MVT::i32, InsNumElts / 2);
9225 for (
unsigned I = 0;
I != InsNumElts / 2; ++
I) {
9227 if (InsNumElts == 2) {
9240 for (
unsigned I = 0;
I != InsNumElts; ++
I) {
9263 if (NumElts == 4 && EltSize == 16 && KIdx) {
9274 unsigned Idx = KIdx->getZExtValue();
9275 bool InsertLo = Idx < 2;
9279 DAG.
getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
9285 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
9298 assert(VecSize <= 64 &&
"Expected target vector size to be <= 64 bits");
9333 EVT ResultVT =
Op.getValueType();
9346 if (
SDValue Combined = performExtractVectorEltCombine(
Op.getNode(), DCI))
9349 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
9353 if (VecSize == 128) {
9361 }
else if (VecSize == 256) {
9364 for (
unsigned P = 0;
P < 4; ++
P) {
9370 Parts[0], Parts[1]));
9372 Parts[2], Parts[3]));
9378 for (
unsigned P = 0;
P < 8; ++
P) {
9385 Parts[0], Parts[1], Parts[2], Parts[3]));
9388 Parts[4], Parts[5], Parts[6], Parts[7]));
9408 Src = DAG.
getBitcast(Src.getValueType().changeTypeToInteger(), Src);
9423 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
9433 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
9438 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
9439 !(Mask[Elt + 1] & 1);
9445 EVT ResultVT =
Op.getValueType();
9448 const int NewSrcNumElts = 2;
9450 int SrcNumElts =
Op.getOperand(0).getValueType().getVectorNumElements();
9466 const bool ShouldUseConsecutiveExtract = EltVT.
getSizeInBits() == 16;
9488 if (ShouldUseConsecutiveExtract &&
9491 int VecIdx = Idx < SrcNumElts ? 0 : 1;
9492 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
9504 if (Idx0 >= SrcNumElts) {
9509 if (Idx1 >= SrcNumElts) {
9514 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
9515 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
9523 int NewMaskIdx0 = Idx0 - AlignedIdx0;
9524 int NewMaskIdx1 = Idx1 - AlignedIdx1;
9529 if (SubVec0 != SubVec1) {
9530 NewMaskIdx1 += NewSrcNumElts;
9537 {NewMaskIdx0, NewMaskIdx1});
9542 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
9543 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
9544 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
9545 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
9564 EVT ResultVT =
Op.getValueType();
9580 EVT VT =
Op.getValueType();
9582 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
9583 assert(!Subtarget->hasVOP3PInsts() &&
"this should be legal");
9617 for (
unsigned P = 0;
P < NumParts; ++
P) {
9619 PartVT, SL, {
Op.getOperand(
P * 2),
Op.getOperand(
P * 2 + 1)});
9638 if (!Subtarget->isAmdHsaOS())
9681 return DAG.
getNode(AMDGPUISD::PC_ADD_REL_OFFSET64,
DL, PtrVT, Ptr);
9690 return DAG.
getNode(AMDGPUISD::PC_ADD_REL_OFFSET,
DL, PtrVT, PtrLo, PtrHi);
9698 EVT PtrVT =
Op.getValueType();
9700 const GlobalValue *GV = GSD->
getGlobal();
9714 assert(PtrVT == MVT::i32 &&
"32-bit pointer is expected.");
9729 return DAG.
getNode(AMDGPUISD::LDS,
DL, MVT::i32, GA);
9732 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
9733 if (Subtarget->has64BitLiterals()) {
9764 MachinePointerInfo PtrInfo =
9777 Fn,
"unsupported external symbol",
Op.getDebugLoc()));
9801 SDValue Param = lowerKernargMemParameter(
9812 "non-hsa intrinsic with hsa target",
DL.getDebugLoc()));
9820 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
9828 unsigned NumElts = Elts.
size();
9830 if (NumElts <= 12) {
9839 for (
unsigned i = 0; i < Elts.
size(); ++i) {
9845 for (
unsigned i = Elts.
size(); i < NumElts; ++i)
9855 EVT SrcVT = Src.getValueType();
9876 bool Unpacked,
bool IsD16,
int DMaskPop,
9877 int NumVDataDwords,
bool IsAtomicPacked16Bit,
9881 EVT ReqRetVT = ResultTypes[0];
9883 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9884 ? (ReqRetNumElts + 1) / 2
9887 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9898 if (DMaskPop > 0 &&
Data.getValueType() != MaskPopVT) {
9909 if (DataDwordVT.
isVector() && !IsAtomicPacked16Bit)
9911 NumDataDwords - MaskPopDwords);
9916 EVT LegalReqRetVT = ReqRetVT;
9918 if (!
Data.getValueType().isInteger())
9920 Data.getValueType().changeTypeToInteger(),
Data);
9941 if (Result->getNumValues() == 1)
9948 SDValue *LWE,
bool &IsTexFail) {
9968 unsigned DimIdx,
unsigned EndIdx,
9969 unsigned NumGradients) {
9971 for (
unsigned I = DimIdx;
I < EndIdx;
I++) {
9979 if (((
I + 1) >= EndIdx) ||
9980 ((NumGradients / 2) % 2 == 1 && (
I == DimIdx + (NumGradients / 2) - 1 ||
9981 I == DimIdx + NumGradients - 1))) {
10003 !
Op.getNode()->hasAnyUseOfValue(0))
10005 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
10015 ResultTypes.erase(&ResultTypes[0]);
10017 bool IsD16 =
false;
10018 bool IsG16 =
false;
10019 bool IsA16 =
false;
10021 int NumVDataDwords = 0;
10022 bool AdjustRetType =
false;
10023 bool IsAtomicPacked16Bit =
false;
10026 const unsigned ArgOffset = WithChain ? 2 : 1;
10029 unsigned DMaskLanes = 0;
10031 if (BaseOpcode->
Atomic) {
10032 VData =
Op.getOperand(2);
10034 IsAtomicPacked16Bit =
10035 (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
10036 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
10037 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
10038 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
10049 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
10051 DMask = Is64Bit ? 0xf : 0x3;
10052 NumVDataDwords = Is64Bit ? 4 : 2;
10054 DMask = Is64Bit ? 0x3 : 0x1;
10055 NumVDataDwords = Is64Bit ? 2 : 1;
10058 DMask =
Op->getConstantOperandVal(ArgOffset + Intr->
DMaskIndex);
10061 if (BaseOpcode->
Store) {
10062 VData =
Op.getOperand(2);
10066 if (!Subtarget->hasD16Images() || !BaseOpcode->
HasD16)
10070 VData = handleD16VData(VData, DAG,
true);
10073 NumVDataDwords = (VData.
getValueType().getSizeInBits() + 31) / 32;
10074 }
else if (!BaseOpcode->
NoReturn) {
10079 if (!Subtarget->hasD16Images() || !BaseOpcode->
HasD16)
10087 (!LoadVT.
isVector() && DMaskLanes > 1))
10093 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
10094 !(BaseOpcode->
Gather4 && Subtarget->hasImageGather4D16Bug()))
10095 NumVDataDwords = (DMaskLanes + 1) / 2;
10097 NumVDataDwords = DMaskLanes;
10099 AdjustRetType =
true;
10103 unsigned VAddrEnd = ArgOffset + Intr->
VAddrEnd;
10110 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
10111 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
10113 VAddrVT =
Op.getOperand(ArgOffset + Intr->
CoordStart).getSimpleValueType();
10115 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
10116 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
10120 if (IsA16 && (
Op.getOperand(ArgOffset +
I).getValueType() == MVT::f16)) {
10126 {
Op.getOperand(ArgOffset +
I), DAG.
getPOISON(MVT::f16)});
10130 "Bias needs to be converted to 16 bit in A16 mode");
10135 if (BaseOpcode->
Gradients && !
ST->hasG16() && (IsA16 != IsG16)) {
10139 dbgs() <<
"Failed to lower image intrinsic: 16 bit addresses "
10140 "require 16 bit args for both gradients and addresses");
10145 if (!
ST->hasA16()) {
10146 LLVM_DEBUG(
dbgs() <<
"Failed to lower image intrinsic: Target does not "
10147 "support 16 bit addresses\n");
10157 if (BaseOpcode->
Gradients && IsG16 &&
ST->hasG16()) {
10159 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
10161 IntrOpcode = G16MappingInfo->
G16;
10184 for (
unsigned I = ArgOffset + Intr->
CoordStart;
I < VAddrEnd;
I++)
10202 const unsigned NSAMaxSize =
ST->getNSAMaxSize(BaseOpcode->
Sampler);
10203 const bool HasPartialNSAEncoding =
ST->hasPartialNSAEncoding();
10204 const bool UseNSA =
ST->hasNSAEncoding() &&
10205 VAddrs.
size() >=
ST->getNSAThreshold(MF) &&
10206 (VAddrs.
size() <= NSAMaxSize || HasPartialNSAEncoding);
10207 const bool UsePartialNSA =
10208 UseNSA && HasPartialNSAEncoding && VAddrs.
size() > NSAMaxSize;
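  // With NSA encoding each address component may occupy its own VGPR; when
  // the component count exceeds what the encoding supports (partial NSA), the
  // trailing components are repacked into one contiguous vector register.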
10211 if (UsePartialNSA) {
10213 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
10214 }
else if (!UseNSA) {
10224 uint64_t UnormConst =
10225 Op.getConstantOperandVal(ArgOffset + Intr->
UnormIndex);
10227 Unorm = UnormConst ? True : False;
10233 bool IsTexFail =
false;
10234 if (!
parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
10243 NumVDataDwords = 1;
10245 NumVDataDwords += 1;
10246 AdjustRetType =
true;
10251 if (AdjustRetType) {
10254 if (DMaskLanes == 0 && !BaseOpcode->
Store) {
10263 MVT::i32, NumVDataDwords)
10266 ResultTypes[0] = NewVT;
10267 if (ResultTypes.size() == 3) {
10271 ResultTypes.erase(&ResultTypes[1]);
10285 Ops.push_back(VData);
10286 if (UsePartialNSA) {
10288 Ops.push_back(VAddr);
10292 Ops.push_back(VAddr);
10295 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
10297 Ops.push_back(Rsrc);
10302 Ops.push_back(Samp);
10307 if (!IsGFX12Plus || BaseOpcode->
Sampler || BaseOpcode->
MSAA)
10308 Ops.push_back(Unorm);
10310 Ops.push_back(IsA16 &&
10311 ST->hasFeature(AMDGPU::FeatureR128A16)
10315 Ops.push_back(IsA16 ? True : False);
10317 if (!Subtarget->hasGFX90AInsts())
10318 Ops.push_back(TFE);
10322 "TFE is not supported on this GPU",
DL.getDebugLoc()));
10325 if (!IsGFX12Plus || BaseOpcode->
Sampler || BaseOpcode->
MSAA)
10326 Ops.push_back(LWE);
10328 Ops.push_back(DimInfo->
DA ? True : False);
10330 Ops.push_back(IsD16 ? True : False);
10332 Ops.push_back(
Op.getOperand(0));
10334 int NumVAddrDwords =
10340 NumVDataDwords, NumVAddrDwords);
10341 }
else if (IsGFX11Plus) {
10343 UseNSA ? AMDGPU::MIMGEncGfx11NSA
10344 : AMDGPU::MIMGEncGfx11Default,
10345 NumVDataDwords, NumVAddrDwords);
10346 }
else if (IsGFX10Plus) {
10348 UseNSA ? AMDGPU::MIMGEncGfx10NSA
10349 : AMDGPU::MIMGEncGfx10Default,
10350 NumVDataDwords, NumVAddrDwords);
10352 if (Subtarget->hasGFX90AInsts()) {
10354 NumVDataDwords, NumVAddrDwords);
10355 if (Opcode == -1) {
10358 "requested image instruction is not supported on this GPU",
10359 DL.getDebugLoc()));
10363 for (EVT VT : OrigResultTypes) {
10364 if (VT == MVT::Other)
10365 RetValues[Idx++] =
Op.getOperand(0);
10373 if (Opcode == -1 &&
10376 NumVDataDwords, NumVAddrDwords);
10379 NumVDataDwords, NumVAddrDwords);
10386 MachineMemOperand *MemRef = MemOp->getMemOperand();
10405 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
10406 NumVDataDwords, IsAtomicPacked16Bit,
DL);
10419 MachinePointerInfo(),
10424 if (!
Offset->isDivergent()) {
10431 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
10440 !Subtarget->hasScalarDwordx3Loads()) {
10444 AMDGPUISD::SBUFFER_LOAD,
DL, DAG.
getVTList(WidenedVT),
Ops, WidenedVT,
10467 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
10469 return handleByteShortBufferLoads(DAG, VT,
DL,
Ops, MMO);
10473 unsigned NumLoads = 1;
10479 if (NumElts == 8 || NumElts == 16) {
10480 NumLoads = NumElts / 4;
10484 SDVTList VTList = DAG.
getVTList({LoadVT, MVT::Other});
10489 NumLoads > 1 ?
Align(16 * NumLoads) :
Align(4));
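  // Wide (8- or 16-element) s_buffer_load results are assembled from several
  // 4-dword loads issued at 16-byte increments and concatenated afterwards.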
10491 uint64_t InstOffset =
Ops[5]->getAsZExtVal();
10493 for (
unsigned i = 0; i < NumLoads; ++i) {
10496 Loads.
push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD,
DL, VTList,
Ops,
10497 LoadVT, LoadMMO, DAG));
10500 if (NumElts == 8 || NumElts == 16)
10508 if (!Subtarget->hasArchitectedSGPRs())
10513 return DAG.
getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
10520 unsigned Width)
const {
10522 using namespace AMDGPU::Hwreg;
10524 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
10563 auto *MFI = MF.
getInfo<SIMachineFunctionInfo>();
10565 EVT VT =
Op.getValueType();
10567 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
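  // Each intrinsic below maps to a preloaded SGPR/VGPR input
  // (getPreloadedValue / lowerWorkitemID), a kernarg load, or a target ISD
  // node; unsupported combinations are diagnosed against the subtarget.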
10571 switch (IntrinsicID) {
10572 case Intrinsic::amdgcn_implicit_buffer_ptr: {
10575 return getPreloadedValue(DAG, *MFI, VT,
10578 case Intrinsic::amdgcn_dispatch_ptr:
10579 case Intrinsic::amdgcn_queue_ptr: {
10580 if (!Subtarget->isAmdHsaOrMesa(MF.
getFunction())) {
10582 MF.
getFunction(),
"unsupported hsa intrinsic without hsa target",
10583 DL.getDebugLoc()));
10587 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
10590 return getPreloadedValue(DAG, *MFI, VT, RegID);
10592 case Intrinsic::amdgcn_implicitarg_ptr: {
10594 return getImplicitArgPtr(DAG,
DL);
10595 return getPreloadedValue(DAG, *MFI, VT,
10598 case Intrinsic::amdgcn_kernarg_segment_ptr: {
10604 return getPreloadedValue(DAG, *MFI, VT,
10607 case Intrinsic::amdgcn_dispatch_id: {
  case Intrinsic::amdgcn_rcp:
    return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_rsq:
    return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_rsq_legacy:
  case Intrinsic::amdgcn_rcp_legacy:
    return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_rsq_clamp: {
    return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
10636 case Intrinsic::r600_read_ngroups_x:
10637 if (Subtarget->isAmdHsaOS())
10640 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
10643 case Intrinsic::r600_read_ngroups_y:
10644 if (Subtarget->isAmdHsaOS())
10647 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
10650 case Intrinsic::r600_read_ngroups_z:
10651 if (Subtarget->isAmdHsaOS())
10654 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
10657 case Intrinsic::r600_read_local_size_x:
10658 if (Subtarget->isAmdHsaOS())
10661 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
10663 case Intrinsic::r600_read_local_size_y:
10664 if (Subtarget->isAmdHsaOS())
10667 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
10669 case Intrinsic::r600_read_local_size_z:
10670 if (Subtarget->isAmdHsaOS())
10673 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
10675 case Intrinsic::amdgcn_workgroup_id_x:
10676 return lowerWorkGroupId(DAG, *MFI, VT,
10680 case Intrinsic::amdgcn_workgroup_id_y:
10681 return lowerWorkGroupId(DAG, *MFI, VT,
10685 case Intrinsic::amdgcn_workgroup_id_z:
10686 return lowerWorkGroupId(DAG, *MFI, VT,
10690 case Intrinsic::amdgcn_cluster_id_x:
10691 return Subtarget->hasClusters()
10692 ? getPreloadedValue(DAG, *MFI, VT,
10694 : DAG.getPOISON(VT);
10695 case Intrinsic::amdgcn_cluster_id_y:
10696 return Subtarget->hasClusters()
10697 ? getPreloadedValue(DAG, *MFI, VT,
10700 case Intrinsic::amdgcn_cluster_id_z:
10701 return Subtarget->hasClusters()
10702 ? getPreloadedValue(DAG, *MFI, VT,
10705 case Intrinsic::amdgcn_cluster_workgroup_id_x:
10706 return Subtarget->hasClusters()
10707 ? getPreloadedValue(
10711 case Intrinsic::amdgcn_cluster_workgroup_id_y:
10712 return Subtarget->hasClusters()
10713 ? getPreloadedValue(
10717 case Intrinsic::amdgcn_cluster_workgroup_id_z:
10718 return Subtarget->hasClusters()
10719 ? getPreloadedValue(
10723 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
10724 return Subtarget->hasClusters()
10727 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
10728 return Subtarget->hasClusters()
10729 ? getPreloadedValue(
10733 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
10734 return Subtarget->hasClusters()
10735 ? getPreloadedValue(
10739 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
10740 return Subtarget->hasClusters()
10741 ? getPreloadedValue(
10745 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
10746 return Subtarget->hasClusters()
10747 ? getPreloadedValue(
10751 case Intrinsic::amdgcn_wave_id:
10752 return lowerWaveID(DAG,
Op);
10753 case Intrinsic::amdgcn_lds_kernel_id: {
10755 return getLDSKernelId(DAG,
DL);
10756 return getPreloadedValue(DAG, *MFI, VT,
10759 case Intrinsic::amdgcn_workitem_id_x:
10760 return lowerWorkitemID(DAG,
Op, 0, MFI->getArgInfo().WorkItemIDX);
10761 case Intrinsic::amdgcn_workitem_id_y:
10762 return lowerWorkitemID(DAG,
Op, 1, MFI->getArgInfo().WorkItemIDY);
10763 case Intrinsic::amdgcn_workitem_id_z:
10764 return lowerWorkitemID(DAG,
Op, 2, MFI->getArgInfo().WorkItemIDZ);
10765 case Intrinsic::amdgcn_wavefrontsize:
10767 SDLoc(
Op), MVT::i32);
10768 case Intrinsic::amdgcn_s_buffer_load: {
10769 unsigned CPol =
Op.getConstantOperandVal(3);
10776 return lowerSBuffer(VT,
DL,
Op.getOperand(1),
Op.getOperand(2),
10777 Op.getOperand(3), DAG);
10779 case Intrinsic::amdgcn_fdiv_fast:
10780 return lowerFDIV_FAST(
Op, DAG);
10781 case Intrinsic::amdgcn_sin:
10782 return DAG.
getNode(AMDGPUISD::SIN_HW,
DL, VT,
Op.getOperand(1));
10784 case Intrinsic::amdgcn_cos:
10785 return DAG.
getNode(AMDGPUISD::COS_HW,
DL, VT,
Op.getOperand(1));
10787 case Intrinsic::amdgcn_mul_u24:
10788 return DAG.
getNode(AMDGPUISD::MUL_U24,
DL, VT,
Op.getOperand(1),
10790 case Intrinsic::amdgcn_mul_i24:
10791 return DAG.
getNode(AMDGPUISD::MUL_I24,
DL, VT,
Op.getOperand(1),
10794 case Intrinsic::amdgcn_log_clamp: {
10800 case Intrinsic::amdgcn_fract:
10801 return DAG.
getNode(AMDGPUISD::FRACT,
DL, VT,
Op.getOperand(1));
10803 case Intrinsic::amdgcn_class:
10804 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, VT,
Op.getOperand(1),
10806 case Intrinsic::amdgcn_div_fmas:
10807 return DAG.
getNode(AMDGPUISD::DIV_FMAS,
DL, VT,
Op.getOperand(1),
10808 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
10810 case Intrinsic::amdgcn_div_fixup:
10811 return DAG.
getNode(AMDGPUISD::DIV_FIXUP,
DL, VT,
Op.getOperand(1),
10812 Op.getOperand(2),
Op.getOperand(3));
10814 case Intrinsic::amdgcn_div_scale: {
10820 SDValue Denominator =
Op.getOperand(2);
10827 SDValue Src0 =
Param->isAllOnes() ? Numerator : Denominator;
10829 return DAG.
getNode(AMDGPUISD::DIV_SCALE,
DL,
Op->getVTList(), Src0,
10830 Denominator, Numerator);
10832 case Intrinsic::amdgcn_icmp: {
10834 if (
Op.getOperand(1).getValueType() == MVT::i1 &&
10835 Op.getConstantOperandVal(2) == 0 &&
10840 case Intrinsic::amdgcn_fcmp: {
10843 case Intrinsic::amdgcn_ballot:
10845 case Intrinsic::amdgcn_fmed3:
10846 return DAG.
getNode(AMDGPUISD::FMED3,
DL, VT,
Op.getOperand(1),
10847 Op.getOperand(2),
Op.getOperand(3));
10848 case Intrinsic::amdgcn_fdot2:
10849 return DAG.
getNode(AMDGPUISD::FDOT2,
DL, VT,
Op.getOperand(1),
10850 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
10851 case Intrinsic::amdgcn_fmul_legacy:
10852 return DAG.
getNode(AMDGPUISD::FMUL_LEGACY,
DL, VT,
Op.getOperand(1),
10854 case Intrinsic::amdgcn_sbfe:
10855 return DAG.
getNode(AMDGPUISD::BFE_I32,
DL, VT,
Op.getOperand(1),
10856 Op.getOperand(2),
Op.getOperand(3));
10857 case Intrinsic::amdgcn_ubfe:
10858 return DAG.
getNode(AMDGPUISD::BFE_U32,
DL, VT,
Op.getOperand(1),
10859 Op.getOperand(2),
Op.getOperand(3));
10860 case Intrinsic::amdgcn_cvt_pkrtz:
10861 case Intrinsic::amdgcn_cvt_pknorm_i16:
10862 case Intrinsic::amdgcn_cvt_pknorm_u16:
10863 case Intrinsic::amdgcn_cvt_pk_i16:
10864 case Intrinsic::amdgcn_cvt_pk_u16: {
10866 EVT VT =
Op.getValueType();
10869 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10870 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
10871 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
10872 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
10873 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10874 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
10875 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10876 Opcode = AMDGPUISD::CVT_PK_I16_I32;
10878 Opcode = AMDGPUISD::CVT_PK_U16_U32;
10881 return DAG.
getNode(Opcode,
DL, VT,
Op.getOperand(1),
Op.getOperand(2));
10884 DAG.
getNode(Opcode,
DL, MVT::i32,
Op.getOperand(1),
Op.getOperand(2));
10887 case Intrinsic::amdgcn_fmad_ftz:
10888 return DAG.
getNode(AMDGPUISD::FMAD_FTZ,
DL, VT,
Op.getOperand(1),
10889 Op.getOperand(2),
Op.getOperand(3));
10891 case Intrinsic::amdgcn_if_break:
10893 Op->getOperand(1),
Op->getOperand(2)),
10896 case Intrinsic::amdgcn_groupstaticsize: {
10902 const GlobalValue *GV =
10908 case Intrinsic::amdgcn_is_shared:
10909 case Intrinsic::amdgcn_is_private: {
10916 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10920 Subtarget->hasGloballyAddressableScratch()) {
10923 AMDGPU::S_MOV_B32,
DL, MVT::i32,
10924 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10933 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10936 case Intrinsic::amdgcn_perm:
10937 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
Op.getOperand(1),
10938 Op.getOperand(2),
Op.getOperand(3));
10939 case Intrinsic::amdgcn_reloc_constant: {
10949 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10950 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10951 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10952 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10953 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10954 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10955 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10956 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10957 if (
Op.getOperand(4).getValueType() == MVT::i32)
10963 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
10964 Op.getOperand(3), IndexKeyi32);
10966 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10967 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10968 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10969 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10970 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10971 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10972 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10973 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10974 if (
Op.getOperand(4).getValueType() == MVT::i64)
10979 Op.getOperand(4).getValueType() == MVT::v2i32
10983 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10984 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10985 Op.getOperand(6)});
10987 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10988 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10989 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10990 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10991 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10992 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10993 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10996 if (
Op.getOperand(6).getValueType() == IndexKeyTy)
11001 Op.getOperand(6).getValueType().isVector()
11005 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
11006 Op.getOperand(3),
Op.getOperand(4),
Op.getOperand(5),
11007 IndexKey,
Op.getOperand(7),
Op.getOperand(8)};
11008 if (IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8)
11009 Args.push_back(
Op.getOperand(9));
11012 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
11013 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
11014 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
11015 if (
Op.getOperand(6).getValueType() == MVT::i32)
11021 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
11022 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
11023 IndexKeyi32, Op.getOperand(7)});
11025 case Intrinsic::amdgcn_addrspacecast_nonnull:
11026 return lowerADDRSPACECAST(
Op, DAG);
11027 case Intrinsic::amdgcn_readlane:
11028 case Intrinsic::amdgcn_readfirstlane:
11029 case Intrinsic::amdgcn_writelane:
11030 case Intrinsic::amdgcn_permlane16:
11031 case Intrinsic::amdgcn_permlanex16:
11032 case Intrinsic::amdgcn_permlane64:
11033 case Intrinsic::amdgcn_set_inactive:
11034 case Intrinsic::amdgcn_set_inactive_chain_arg:
11035 case Intrinsic::amdgcn_mov_dpp8:
11036 case Intrinsic::amdgcn_update_dpp:
11038 case Intrinsic::amdgcn_dead: {
11040 for (
const EVT ValTy :
Op.getNode()->values())
11044 case Intrinsic::amdgcn_wave_shuffle:
11047 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11049 return lowerImage(
Op, ImageDimIntr, DAG,
false);
11059 if (Subtarget->hasRestrictedSOffset() &&
isNullConstant(SOffset))
11060 return DAG.
getRegister(AMDGPU::SGPR_NULL, MVT::i32);
11066 unsigned NewOpcode)
const {
11070 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
11071 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
11089 M->getMemOperand());
11094 unsigned NewOpcode)
const {
11098 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
11099 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
11117 M->getMemOperand());
11122 unsigned IntrID =
Op.getConstantOperandVal(1);
11126 case Intrinsic::amdgcn_ds_ordered_add:
11127 case Intrinsic::amdgcn_ds_ordered_swap: {
11132 unsigned IndexOperand =
M->getConstantOperandVal(7);
11133 unsigned WaveRelease =
M->getConstantOperandVal(8);
11134 unsigned WaveDone =
M->getConstantOperandVal(9);
11136 unsigned OrderedCountIndex = IndexOperand & 0x3f;
11137 IndexOperand &= ~0x3f;
11138 unsigned CountDw = 0;
11141 CountDw = (IndexOperand >> 24) & 0xf;
11142 IndexOperand &= ~(0xf << 24);
11144 if (CountDw < 1 || CountDw > 4) {
11147 Fn,
"ds_ordered_count: dword count must be between 1 and 4",
11148 DL.getDebugLoc()));
11153 if (IndexOperand) {
11156 Fn,
"ds_ordered_count: bad index operand",
DL.getDebugLoc()));
11159 if (WaveDone && !WaveRelease) {
11163 Fn,
"ds_ordered_count: wave_done requires wave_release",
11164 DL.getDebugLoc()));
11167 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
11168 unsigned ShaderType =
11170 unsigned Offset0 = OrderedCountIndex << 2;
11171 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
11174 Offset1 |= (CountDw - 1) << 6;
11177 Offset1 |= ShaderType << 2;
11179 unsigned Offset = Offset0 | (Offset1 << 8);
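    // The ds_ordered_count offset operand packs the dword index in the low
    // byte (OrderedCountIndex << 2) and the wave_release / wave_done /
    // instruction / shader-type / (count-1) flags in the second byte, as
    // assembled in Offset0 and Offset1 above.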
11186 M->getVTList(),
Ops,
M->getMemoryVT(),
11187 M->getMemOperand());
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
    const bool IsFormat =
        IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
        IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
    return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
11216 case Intrinsic::amdgcn_struct_buffer_load:
11217 case Intrinsic::amdgcn_struct_ptr_buffer_load:
11218 case Intrinsic::amdgcn_struct_buffer_load_format:
11219 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
11220 case Intrinsic::amdgcn_struct_atomic_buffer_load:
11221 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
11222 const bool IsFormat =
11223 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
11224 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
11226 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
11227 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
11242 case Intrinsic::amdgcn_raw_tbuffer_load:
11243 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
11245 EVT LoadVT = Op.getValueType();
11246 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11247 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
11263 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
11265 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
11266 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
11269 case Intrinsic::amdgcn_struct_tbuffer_load:
11270 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
11272 EVT LoadVT = Op.getValueType();
11273 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11274 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11290 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
11292 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
11293 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
11296 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
11297 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
11298 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
11299 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
11300 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
11301 return lowerStructBufferAtomicIntrin(Op, DAG,
11302 AMDGPUISD::BUFFER_ATOMIC_FADD);
11303 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
11304 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
11305 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
11306 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
11307 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
11308 return lowerStructBufferAtomicIntrin(Op, DAG,
11309 AMDGPUISD::BUFFER_ATOMIC_FMIN);
11310 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
11311 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
11312 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
11313 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
11314 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
11315 return lowerStructBufferAtomicIntrin(Op, DAG,
11316 AMDGPUISD::BUFFER_ATOMIC_FMAX);
11317 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
11318 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
11319 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
11320 case Intrinsic::amdgcn_raw_buffer_atomic_add:
11321 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
11322 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
11323 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
11324 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
11325 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
11326 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
11327 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
11328 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
11329 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
11330 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
11331 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
11332 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
11333 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
11334 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
11335 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
11336 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
11337 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
11338 case Intrinsic::amdgcn_raw_buffer_atomic_and:
11339 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
11340 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
11341 case Intrinsic::amdgcn_raw_buffer_atomic_or:
11342 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
11343 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
11344 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
11345 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
11346 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
11347 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
11348 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
11349 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
11350 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
11351 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
11352 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
11353 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
11354 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
11355 return lowerStructBufferAtomicIntrin(Op, DAG,
11356 AMDGPUISD::BUFFER_ATOMIC_SWAP);
11357 case Intrinsic::amdgcn_struct_buffer_atomic_add:
11358 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
11359 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
11360 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
11361 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
11362 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
11363 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
11364 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
11365 return lowerStructBufferAtomicIntrin(Op, DAG,
11366 AMDGPUISD::BUFFER_ATOMIC_SMIN);
11367 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
11368 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
11369 return lowerStructBufferAtomicIntrin(Op, DAG,
11370 AMDGPUISD::BUFFER_ATOMIC_UMIN);
11371 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
11372 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
11373 return lowerStructBufferAtomicIntrin(Op, DAG,
11374 AMDGPUISD::BUFFER_ATOMIC_SMAX);
11375 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
11376 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
11377 return lowerStructBufferAtomicIntrin(Op, DAG,
11378 AMDGPUISD::BUFFER_ATOMIC_UMAX);
11379 case Intrinsic::amdgcn_struct_buffer_atomic_and:
11380 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
11381 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
11382 case Intrinsic::amdgcn_struct_buffer_atomic_or:
11383 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
11384 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
11385 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
11386 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
11387 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
11388 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
11389 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
11390 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
11391 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
11392 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
11393 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
11394 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
11395 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
11396 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB);
11397 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
11398 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
11399 return lowerStructBufferAtomicIntrin(Op, DAG,
11400 AMDGPUISD::BUFFER_ATOMIC_CSUB);
11401 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
11402 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
11403 return lowerRawBufferAtomicIntrin(Op, DAG,
11404 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
11405 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
11406 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
11407 return lowerStructBufferAtomicIntrin(Op, DAG,
11408 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
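// Every raw/struct buffer atomic above (including the *_ptr_* variants that
// take a buffer resource pointer rather than a v4i32 descriptor) maps 1:1
// onto an AMDGPUISD::BUFFER_ATOMIC_* node through the two shared helpers:
// lowerRawBufferAtomicIntrin and lowerStructBufferAtomicIntrin differ only in
// whether a vindex operand is present; both normalize the resource, split the
// combined offset, and re-emit the node with the original memory operand.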
11409 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
11410 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
11411 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
11412 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11426 EVT VT = Op.getValueType();
11430 Op->getVTList(), Ops, VT,
11431 M->getMemOperand());
11433 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
11434 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
11435 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
11436 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
11450 EVT VT = Op.getValueType();
11454 Op->getVTList(), Ops, VT,
11455 M->getMemOperand());
11457 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
11458 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
11460 SDValue NodePtr = M->getOperand(2);
11461 SDValue RayExtent = M->getOperand(3);
11462 SDValue InstanceMask = M->getOperand(4);
11463 SDValue RayOrigin = M->getOperand(5);
11464 SDValue RayDir = M->getOperand(6);
11466 SDValue TDescr = M->getOperand(8);
11471 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
11476 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
11477 const unsigned NumVDataDwords = 10;
11478 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
11480 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
11481 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
11482 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
11486 Ops.push_back(NodePtr);
11489 {DAG.getBitcast(MVT::i32, RayExtent),
11490 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
11491 Ops.push_back(RayOrigin);
11492 Ops.push_back(RayDir);
11493 Ops.push_back(Offsets);
11494 Ops.push_back(TDescr);
11495 Ops.push_back(M->getChain());
11498 MachineMemOperand *MemRef = M->getMemOperand();
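// For the BVH8/dual intersect-ray forms the ray payload is marshalled into a
// fixed VADDR layout: the node pointer, the ray extent packed into one dword
// with the any-extended instance mask, then origin, direction, offsets and
// the texture descriptor. NumVDataDwords/NumVAddrDwords select the gfx12
// MIMG opcode variant via an opcode-table lookup (the lookup call itself is
// elided in this listing).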
11502 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
11504 SDValue NodePtr = M->getOperand(2);
11505 SDValue RayExtent = M->getOperand(3);
11506 SDValue RayOrigin = M->getOperand(4);
11507 SDValue RayDir = M->getOperand(5);
11508 SDValue RayInvDir = M->getOperand(6);
11509 SDValue TDescr = M->getOperand(7);
11516 if (!Subtarget->hasGFX10_AEncoding()) {
11526 const unsigned NumVDataDwords = 4;
11527 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
11528 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
11529 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
11532 const unsigned BaseOpcodes[2][2] = {
11533 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
11534 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
11535 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
11539 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
11540 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
11541 : AMDGPU::MIMGEncGfx10NSA,
11542 NumVDataDwords, NumVAddrDwords);
11546 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
11547 : AMDGPU::MIMGEncGfx10Default,
11548 NumVDataDwords, NumVAddrDwords);
11554 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
11557 if (Lanes[0].getValueSizeInBits() == 32) {
11558 for (unsigned I = 0; I < 3; ++I)
11565 Ops.push_back(Lanes[2]);
11577 if (UseNSA && IsGFX11Plus) {
11578 Ops.push_back(NodePtr);
11580 Ops.push_back(RayOrigin);
11585 for (unsigned I = 0; I < 3; ++I) {
11588 {DirLanes[I], InvDirLanes[I]})));
11592 Ops.push_back(RayDir);
11593 Ops.push_back(RayInvDir);
11600 Ops.push_back(NodePtr);
11603 packLanes(RayOrigin, true);
11604 packLanes(RayDir, true);
11605 packLanes(RayInvDir, false);
11610 if (NumVAddrDwords > 12) {
11618 Ops.push_back(MergedOps);
11621 Ops.push_back(TDescr);
11623 Ops.push_back(M->getChain());
11626 MachineMemOperand *MemRef = M->getMemOperand();
11630 case Intrinsic::amdgcn_global_atomic_fmin_num:
11631 case Intrinsic::amdgcn_global_atomic_fmax_num:
11632 case Intrinsic::amdgcn_flat_atomic_fmin_num:
11633 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11640 unsigned Opcode = 0;
11642 case Intrinsic::amdgcn_global_atomic_fmin_num:
11643 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
11647 case Intrinsic::amdgcn_global_atomic_fmax_num:
11648 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11655 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
11656 Ops, M->getMemOperand());
11658 case Intrinsic::amdgcn_s_alloc_vgpr: {
11666 ReadFirstLaneID, NumVGPRs);
11669 Op.getOperand(0), Op.getOperand(1), NumVGPRs);
11671 case Intrinsic::amdgcn_s_get_barrier_state:
11672 case Intrinsic::amdgcn_s_get_named_barrier_state: {
11679 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
11680 BarID = (BarID >> 4) & 0x3F;
11681 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
11684 Ops.push_back(Chain);
11686 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
11687 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
11695 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11703 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
11704 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
11705 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
11709 EVT VT = Op->getValueType(0);
11713 case Intrinsic::amdgcn_flat_load_monitor_b32:
11714 case Intrinsic::amdgcn_flat_load_monitor_b64:
11715 case Intrinsic::amdgcn_flat_load_monitor_b128: {
11720 Op->getVTList(), {Chain, Ptr},
11723 case Intrinsic::amdgcn_global_load_monitor_b32:
11724 case Intrinsic::amdgcn_global_load_monitor_b64:
11725 case Intrinsic::amdgcn_global_load_monitor_b128: {
11730 Op->getVTList(), {Chain, Ptr},
11735 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11737 return lowerImage(Op, ImageDimIntr, DAG, true);
11745 SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
11752 EVT VT = VTList.VTs[0];
11755 bool IsTFE = VTList.NumVTs == 3;
11758 unsigned NumOpDWords = NumValueDWords + 1;
11760 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
11761 MachineMemOperand *OpDWordsMMO =
11763 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
11764 OpDWordsVT, OpDWordsMMO, DAG);
11769 NumValueDWords == 1
11778 if (!Subtarget->hasDwordx3LoadStores() &&
11779 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
11783 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
11785 WidenedMemVT, WidenedMMO);
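// getMemIntrinsicNode papers over result types the hardware cannot return
// directly: with TFE the value gains one extra status dword (NumOpDWords =
// NumValueDWords + 1) and is split back apart afterwards, and 3-dword results
// are widened to 4 dwords on subtargets without dwordx3 load/store support
// before the extra lane is dropped again (the re-splitting code is elided
// above).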
11795 bool ImageStore) const {
11805 if (Subtarget->hasUnpackedD16VMem()) {
11819 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
11830 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
11836 if ((NumElements % 2) == 1) {
11838 unsigned I = Elts.size() / 2;
11854 if (NumElements == 3) {
11873 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
11874 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
11875 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
11876 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
11877 case Intrinsic::amdgcn_load_async_to_lds:
11878 case Intrinsic::amdgcn_global_load_async_lds:
11888 unsigned IntrinsicID = Op.getConstantOperandVal(1);
11890 switch (IntrinsicID) {
11891 case Intrinsic::amdgcn_exp_compr: {
11892 if (!Subtarget->hasCompressedExport()) {
11895 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
11917 unsigned Opc =
Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
11921 case Intrinsic::amdgcn_struct_tbuffer_store:
11922 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
11924 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11926 VData = handleD16VData(VData, DAG);
11927 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11928 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11942 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11943 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11946 M->getMemoryVT(), M->getMemOperand());
11949 case Intrinsic::amdgcn_raw_tbuffer_store:
11950 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11952 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11954 VData = handleD16VData(VData, DAG);
11955 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11956 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11970 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11971 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11974 M->getMemoryVT(), M->getMemOperand());
11977 case Intrinsic::amdgcn_raw_buffer_store:
11978 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11979 case Intrinsic::amdgcn_raw_buffer_store_format:
11980 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11981 const bool IsFormat =
11982 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11983 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11990 VData = handleD16VData(VData, DAG);
12000 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
12001 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
12015 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
12016 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
12021 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
12024 M->getMemoryVT(), M->getMemOperand());
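// Buffer stores mirror the load path: f16 payloads are repacked by
// handleD16VData, the opcode is upgraded to the *_FORMAT_D16 form when
// needed, and i8/i16 element types are routed to handleByteShortBufferStores,
// which extends the value to i32 and emits BUFFER_STORE_BYTE/SHORT (see the
// helper further down).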
12027 case Intrinsic::amdgcn_struct_buffer_store:
12028 case Intrinsic::amdgcn_struct_ptr_buffer_store:
12029 case Intrinsic::amdgcn_struct_buffer_store_format:
12030 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
12031 const bool IsFormat =
12032 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
12033 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
12041 VData = handleD16VData(VData, DAG);
12051 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
12052 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
12066 !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
12067 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
12071 EVT VDataType = VData.getValueType().getScalarType();
12073 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
12076 M->getMemoryVT(), M->getMemOperand());
12078 case Intrinsic::amdgcn_raw_buffer_load_lds:
12079 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
12080 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
12081 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
12082 case Intrinsic::amdgcn_struct_buffer_load_lds:
12083 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
12084 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
12085 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
12086 if (!Subtarget->hasVMemToLDSLoad())
12090 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
12091 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_async_lds ||
12092 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds ||
12093 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds;
12094 unsigned OpOffset = HasVIndex ? 1 : 0;
12095 SDValue VOffset = Op.getOperand(5 + OpOffset);
12097 unsigned Size = Op->getConstantOperandVal(4);
12103 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
12104 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
12105 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
12106 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
12109 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
12110 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
12111 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
12112 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
12115 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
12116 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
12117 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
12118 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
12121 if (!Subtarget->hasLDSLoadB96_B128())
12123 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
12124 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
12125 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
12126 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
12129 if (!Subtarget->hasLDSLoadB96_B128())
12131 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
12132 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
12133 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
12134 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
12142 if (HasVIndex && HasVOffset)
12146 else if (HasVIndex)
12147 Ops.push_back(Op.getOperand(5));
12148 else if (HasVOffset)
12149 Ops.push_back(VOffset);
12151 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
12152 Ops.push_back(Rsrc);
12153 Ops.push_back(Op.getOperand(6 + OpOffset));
12154 Ops.push_back(Op.getOperand(7 + OpOffset));
12156 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
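// The *_load_lds intrinsics copy from a buffer straight into LDS, so they are
// selected directly to machine opcodes here. The opcode is picked along two
// axes: the transfer size from operand 4 (1, 2, 4 and, where supported, 12 or
// 16 bytes) chooses UBYTE/USHORT/DWORD/DWORDX3/DWORDX4, and the presence of a
// vindex and/or voffset chooses the BOTHEN/IDXEN/OFFEN/OFFSET addressing form.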
12179 case Intrinsic::amdgcn_load_to_lds:
12180 case Intrinsic::amdgcn_load_async_to_lds:
12181 case Intrinsic::amdgcn_global_load_lds:
12182 case Intrinsic::amdgcn_global_load_async_lds: {
12183 if (!Subtarget->hasVMemToLDSLoad())
12187 unsigned Size = Op->getConstantOperandVal(4);
12192 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
12195 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
12198 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
12201 if (!Subtarget->hasLDSLoadB96_B128())
12203 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
12206 if (!Subtarget->hasLDSLoadB96_B128())
12208 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
12224 if (LHS->isDivergent())
12228 RHS.getOperand(0).getValueType() == MVT::i32) {
12231 VOffset = RHS.getOperand(0);
12235 Ops.push_back(Addr);
12243 Ops.push_back(VOffset);
12246 Ops.push_back(Op.getOperand(5));
12248 unsigned Aux = Op.getConstantOperandVal(6);
12263 case Intrinsic::amdgcn_end_cf:
12265 Op->getOperand(2), Chain),
12267 case Intrinsic::amdgcn_s_barrier_signal_var: {
12274 if (CntC && CntC->isZero()) {
12279 std::optional<uint64_t> BarVal;
12281 BarVal = C->getZExtValue();
12285 BarVal = *Addr + GA->getOffset();
12288 unsigned BarID = (*BarVal >> 4) & 0x3F;
12290 Ops.push_back(Chain);
12292 Op->getVTList(), Ops);
12298 case Intrinsic::amdgcn_s_barrier_init: {
12305 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
12306 ? AMDGPU::S_BARRIER_INIT_M0
12307 : AMDGPU::S_BARRIER_SIGNAL_M0;
12322 constexpr unsigned ShAmt = 16;
12329 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
12334 case Intrinsic::amdgcn_s_wakeup_barrier: {
12335 if (!Subtarget->hasSWakeupBarrier())
12339 case Intrinsic::amdgcn_s_barrier_join: {
12348 switch (IntrinsicID) {
12351 case Intrinsic::amdgcn_s_barrier_join:
12352 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
12354 case Intrinsic::amdgcn_s_wakeup_barrier:
12355 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
12359 unsigned BarID = (BarVal >> 4) & 0x3F;
12362 Ops.push_back(Chain);
12364 switch (IntrinsicID) {
12367 case Intrinsic::amdgcn_s_barrier_join:
12368 Opc = AMDGPU::S_BARRIER_JOIN_M0;
12370 case Intrinsic::amdgcn_s_wakeup_barrier:
12371 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
12382 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
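// Named-barrier intrinsics carry the barrier ID in bits [9:4] of their
// operand, hence the (BarVal >> 4) & 0x3F extraction used both for
// S_GET_BARRIER_STATE and for the join/wakeup paths when the operand is a
// compile-time constant (the *_IMM opcode forms). Otherwise the value is
// copied into M0 via copyToM0 and the *_M0 opcode forms are used instead.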
12388 case Intrinsic::amdgcn_s_prefetch_data: {
12391 return Op.getOperand(0);
12394 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
12396 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
12403 Op->getVTList(), Ops, M->getMemoryVT(),
12404 M->getMemOperand());
12406 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
12407 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
12408 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
12417 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
12419 return lowerImage(Op, ImageDimIntr, DAG, true);
12435 return PtrVT == MVT::i64;
12449 std::pair<SDValue, SDValue>
12462 bool CheckNUW = Subtarget->hasGFX1250Insts();
12479 unsigned Overflow = ImmOffset & ~MaxImm;
12480 ImmOffset -= Overflow;
12481 if ((int32_t)Overflow < 0) {
12482 Overflow += ImmOffset;
12487 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
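// splitBufferOffsets folds as much of a constant offset as possible into the
// instruction's immediate field and returns the remainder as a register
// offset. Bits above the maximum encodable immediate become Overflow; for
// example, with a 12-bit field (MaxImm = 4095 -- the exact width is
// subtarget-dependent) a combined offset of 5000 splits into ImmOffset = 904
// plus an Overflow of 4096 that is added back onto the voffset operand.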
12506 void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
12508 Align Alignment) const {
12510 SDLoc DL(CombinedOffset);
12512 uint32_t Imm = C->getZExtValue();
12513 uint32_t SOffset, ImmOffset;
12514 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
12525 bool CheckNUW = Subtarget->hasGFX1250Insts();
12528 uint32_t SOffset, ImmOffset;
12531 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
12539 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
12548 SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
12551 return MaybePointer;
12565 SDValue NumRecords = Op->getOperand(3);
12571 if (Subtarget->has45BitNumRecordsBufferResource()) {
12590 SDValue ExtShiftedStrideVec =
12602 DAG.getNode(ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
12604 DAG.getNode(ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
12609 auto [LowHalf, HighHalf] =
12610 DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
12620 NumRecords, Flags);
12632 bool IsTFE) const {
12637 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
12638 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
12641 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
12653 ? AMDGPUISD::BUFFER_LOAD_UBYTE
12654 : AMDGPUISD::BUFFER_LOAD_USHORT;
12656 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
12670 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
12674 Ops[1] = BufferStoreExt;
12675 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
12676 : AMDGPUISD::BUFFER_STORE_SHORT;
12679 M->getMemOperand());
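// handleByteShortBufferLoads/Stores cover the i8/i16 element sizes that the
// generic buffer path cannot express: loads become BUFFER_LOAD_UBYTE/USHORT
// (the _TFE variants returning v2i32 of data plus status), and stores first
// extend or bitcast the value to i32 before emitting BUFFER_STORE_BYTE/SHORT
// with the original memory operand.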
12704 DAGCombinerInfo &DCI) const {
12705 SelectionDAG &DAG = DCI.DAG;
12720 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
12727 "unexpected vector extload");
12740 "unexpected fp extload");
12758 DCI.AddToWorklist(Cvt.getNode());
12763 DCI.AddToWorklist(Cvt.getNode());
12774 if (Info.isEntryFunction())
12775 return Info.getUserSGPRInfo().hasFlatScratchInit();
12783 EVT MemVT = Load->getMemoryVT();
12784 MachineMemOperand *MMO = Load->getMemOperand();
12796 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
12824 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
12825 "Custom lowering for non-i32 vectors hasn't been implemented.");
12828 unsigned AS = Load->getAddressSpace();
12829 if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
12836 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12840 !Subtarget->hasMultiDwordFlatScratchAddressing())
12850 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
12853 Alignment >= Align(4) && NumElements < 32) {
12855 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
12867 if (NumElements > 4)
12870 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12880 switch (Subtarget->getMaxPrivateElementSize()) {
12886 if (NumElements > 2)
12891 if (NumElements > 4)
12894 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12903 auto Flags = Load->getMemOperand()->getFlags();
12905 Load->getAlign(), Flags, &Fast) &&
12914 MemVT, *Load->getMemOperand())) {
12923 EVT VT = Op.getValueType();
12960 EVT VT = Op.getValueType();
12961 const SDNodeFlags Flags = Op->getFlags();
12963 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
12969 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
12972 if (CLHS->isExactlyValue(1.0)) {
12985 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12989 if (CLHS->isExactlyValue(-1.0)) {
12992 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
12998 if (!AllowInaccurateRcp &&
12999 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
13013 EVT VT = Op.getValueType();
13014 const SDNodeFlags Flags = Op->getFlags();
13016 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
13017 if (!AllowInaccurateDiv)
13038 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
13048 Opcode = AMDGPUISD::FMUL_W_CHAIN;
13052 return DAG.getNode(Opcode, SL, VTList,
13061 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
13071 Opcode = AMDGPUISD::FMA_W_CHAIN;
13075 return DAG.getNode(Opcode, SL, VTList,
13081 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
13082 return FastLowered;
13085 EVT VT = Op.getValueType();
13092 if (VT == MVT::bf16) {
13115 unsigned FMADOpCode =
13119 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
13122 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
13124 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
13125 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
13135 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
13141 SDNodeFlags Flags = Op->getFlags();
13151 const APFloat K0Val(0x1p+96f);
13154 const APFloat K1Val(0x1p-32f);
13181 assert(ST->hasDenormModeInst() &&
"Requires S_DENORM_MODE");
13182 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
13183 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
13188 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
13189 return FastLowered;
13195 SDNodeFlags Flags = Op->getFlags();
13196 Flags.setNoFPExcept(true);
13204 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
13213 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
13217 using namespace AMDGPU::Hwreg;
13218 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
13222 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13223 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
13226 const bool HasDynamicDenormals =
13232 if (!PreservesDenormals) {
13237 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
13240 if (HasDynamicDenormals) {
13244 SavedDenormMode = SDValue(GetReg, 0);
13250 SDNode *EnableDenorm;
13251 if (Subtarget->hasDenormModeInst()) {
13252 const SDValue EnableDenormValue =
13255 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
13259 const SDValue EnableDenormValue =
13261 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
13262 {EnableDenormValue, BitField, Glue});
13272 ApproxRcp, One, NegDivScale0, Flags);
13275 ApproxRcp, Fma0, Flags);
13281 NumeratorScaled, Mul, Flags);
13287 NumeratorScaled, Fma3, Flags);
13289 if (!PreservesDenormals) {
13290 SDNode *DisableDenorm;
13291 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
13295 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
13297 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
13301 assert(HasDynamicDenormals == (bool)SavedDenormMode);
13302 const SDValue DisableDenormValue =
13303 HasDynamicDenormals
13308 AMDGPU::S_SETREG_B32, SL, MVT::Other,
13319 {Fma4, Fma1, Fma3, Scale}, Flags);
13321 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
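// LowerFDIV32 follows the usual AMDGPU expansion: the operands are pre-scaled,
// RCP supplies an initial approximation that a chain of FMAs refines
// (Fma0..Fma4 above), DIV_FMAS applies the scale bit, and DIV_FIXUP restores
// infinities, NaNs and signed zeros. The refinement only converges with FP32
// denormals enabled, so unless the function already preserves denormals the
// sequence is bracketed by DENORM_MODE / S_SETREG_B32 writes that enable the
// mode and then restore the previous (possibly dynamically saved) setting.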
13325 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
13326 return FastLowered;
13334 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
13340 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
13358 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
13388 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
13390 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
13394 EVT VT = Op.getValueType();
13396 if (VT == MVT::f32)
13397 return LowerFDIV32(Op, DAG);
13399 if (VT == MVT::f64)
13400 return LowerFDIV64(Op, DAG);
13402 if (VT == MVT::f16 || VT == MVT::bf16)
13403 return LowerFDIV16(Op, DAG);
13412 EVT ResultExpVT = Op->getValueType(1);
13413 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
13423 if (Subtarget->hasFractBug()) {
13441 EVT VT = Store->getMemoryVT();
13443 if (VT == MVT::i1) {
13447 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
13451 Store->getValue().getValueType().getScalarType() == MVT::i32);
13453 unsigned AS = Store->getAddressSpace();
13454 if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
13462 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
13466 !Subtarget->hasMultiDwordFlatScratchAddressing())
13473 if (NumElements > 4)
13476 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
13480 VT, *Store->getMemOperand()))
13486 switch (Subtarget->getMaxPrivateElementSize()) {
13490 if (NumElements > 2)
13494 if (NumElements > 4 ||
13495 (NumElements == 3 && !Subtarget->hasFlatScratchEnabled()))
13503 auto Flags = Store->getMemOperand()->getFlags();
13522 assert(!Subtarget->has16BitInsts());
13523 SDNodeFlags Flags = Op->getFlags();
13537 SDNodeFlags Flags = Op->getFlags();
13538 MVT VT = Op.getValueType().getSimpleVT();
13646 SDNodeFlags Flags = Op->getFlags();
13655 if (!Flags.hasApproximateFuncs()) {
13687 if (!Flags.hasApproximateFuncs()) {
13696 ScaleDownFactor, ZeroInt);
13703 if (Flags.hasNoInfs()) {
13719 EVT VT = Op.getValueType();
13730 if (!V.getValueType().isVector())
13738 if (Subtarget->hasTrigReducedRange()) {
13740 TrigVal = UnrollIfVec(DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags));
13745 switch (Op.getOpcode()) {
13747 TrigVal = DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
13750 TrigVal = DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
13756 return UnrollIfVec(TrigVal);
13776 EVT VT = Op.getValueType();
13784 Op->getVTList(), Ops, VT,
13793 SITargetLowering::performUCharToFloatCombine(SDNode *N,
13794 DAGCombinerInfo &DCI) const {
13795 EVT VT = N->getValueType(0);
13797 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
13800 SelectionDAG &DAG = DCI.DAG;
13804 EVT SrcVT = Src.getValueType();
13810 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
13813 DCI.AddToWorklist(Cvt.getNode());
13816 if (ScalarVT != MVT::f32) {
13828 DAGCombinerInfo &DCI) const {
13839 SelectionDAG &DAG = DCI.DAG;
13858 for (unsigned I = 0; I != NumElts; ++I) {
13882 if (NewElts.size() == 1)
13904 for (unsigned I = 0; I != NumElts; ++I) {
13939 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
13941 DAGCombinerInfo &DCI) const {
13958 SelectionDAG &DAG = DCI.DAG;
13971 AM.BaseOffs = Offset.getSExtValue();
13976 EVT VT = N->getValueType(0);
13982 Flags.setNoUnsignedWrap(
13983 N->getFlags().hasNoUnsignedWrap() &&
13995 switch (N->getOpcode()) {
14006 DAGCombinerInfo &DCI) const {
14007 SelectionDAG &DAG = DCI.DAG;
14014 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
14015 N->getMemoryVT(), DCI);
14019 NewOps[PtrIdx] = NewPtr;
14028 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
14029 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
14038 SDValue SITargetLowering::splitBinaryBitConstantOp(
14042 uint32_t ValLo = Lo_32(Val);
14043 uint32_t ValHi = Hi_32(Val);
14050 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
14064 if (V.getValueType() != MVT::i1)
14066 switch (V.getOpcode()) {
14071 case AMDGPUISD::FP_CLASS:
14083 return V.getResNo() == 1;
14085 unsigned IntrinsicID = V.getConstantOperandVal(0);
14086 switch (IntrinsicID) {
14087 case Intrinsic::amdgcn_is_shared:
14088 case Intrinsic::amdgcn_is_private:
14105 if (!(C & 0x000000ff))
14106 ZeroByteMask |= 0x000000ff;
14107 if (!(C & 0x0000ff00))
14108 ZeroByteMask |= 0x0000ff00;
14109 if (!(C & 0x00ff0000))
14110 ZeroByteMask |= 0x00ff0000;
14111 if (!(C & 0xff000000))
14112 ZeroByteMask |= 0xff000000;
14113 uint32_t NonZeroByteMask = ~ZeroByteMask;
14114 if ((NonZeroByteMask & C) != NonZeroByteMask)
14127 assert(V.getValueSizeInBits() == 32);
14129 if (V.getNumOperands() != 2)
14138 switch (V.getOpcode()) {
14143 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
14148 return (0x03020100 & ~ConstMask) | ConstMask;
14155 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
14161 return uint32_t(0x0c0c0c0c03020100ull >> C);
14168 DAGCombinerInfo &DCI) const {
14169 if (DCI.isBeforeLegalize())
14172 SelectionDAG &DAG = DCI.DAG;
14173 EVT VT = N->getValueType(0);
14178 if (VT == MVT::i64 && CRHS) {
14180 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
14184 if (CRHS && VT == MVT::i32) {
14194 unsigned Shift = CShift->getZExtValue();
14196 unsigned Offset = NB + Shift;
14197 if ((Offset & (Bits - 1)) == 0) {
14200 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
14221 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
14223 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14236 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
14241 if (X != LHS.getOperand(1))
14245 const ConstantFPSDNode *C1 =
14262 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
14268 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
14271 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
14279 (RHS.getOperand(0) == LHS.getOperand(0) &&
14280 LHS.getOperand(0) == LHS.getOperand(1))) {
14282 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
14283 : Mask->getZExtValue() & OrdMask;
14286 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
14304 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14307 if (LHSMask != ~0u && RHSMask != ~0u) {
14310 if (LHSMask > RHSMask) {
14317 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14318 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14321 if (!(LHSUsedLanes & RHSUsedLanes) &&
14324 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14330 uint32_t Mask = LHSMask & RHSMask;
14331 for (unsigned I = 0; I < 32; I += 8) {
14332 uint32_t ByteSel = 0xff << I;
14333 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
14334 Mask &= (0x0c << I) & 0xffffffff;
14339 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
14342 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
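// The combine above merges two byte-permutes feeding a bitwise op into one
// AMDGPUISD::PERM. In a V_PERM_B32 selector each of the four selector bytes
// picks a single byte out of the two source registers, and the value 0x0c
// yields a constant zero byte; unused lanes are therefore marked 0x0c0c0c0c,
// and the masks are only merged when the two operands select disjoint source
// bytes.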
14392 static const std::optional<ByteProvider<SDValue>>
14394 unsigned Depth = 0) {
14397 return std::nullopt;
14399 if (Op.getValueSizeInBits() < 8)
14400 return std::nullopt;
14402 if (Op.getValueType().isVector())
14405 switch (Op->getOpcode()) {
14418 NarrowVT = VTSign->getVT();
14421 return std::nullopt;
14424 if (SrcIndex >= NarrowByteWidth)
14425 return std::nullopt;
14433 return std::nullopt;
14435 uint64_t BitShift = ShiftOp->getZExtValue();
14437 if (BitShift % 8 != 0)
14438 return std::nullopt;
14440 SrcIndex += BitShift / 8;
14458 static const std::optional<ByteProvider<SDValue>>
14460 unsigned StartingIndex = 0) {
14464 return std::nullopt;
14466 unsigned BitWidth = Op.getScalarValueSizeInBits();
14468 return std::nullopt;
14470 return std::nullopt;
14472 bool IsVec = Op.getValueType().isVector();
14473 switch (Op.getOpcode()) {
14476 return std::nullopt;
14481 return std::nullopt;
14485 return std::nullopt;
14488 if (!LHS->isConstantZero() && !RHS->isConstantZero())
14489 return std::nullopt;
14490 if (!LHS || LHS->isConstantZero())
14492 if (!RHS || RHS->isConstantZero())
14494 return std::nullopt;
14499 return std::nullopt;
14503 return std::nullopt;
14505 uint32_t BitMask = BitMaskOp->getZExtValue();
14507 uint32_t IndexMask = 0xFF << (Index * 8);
14509 if ((IndexMask & BitMask) != IndexMask) {
14512 if (IndexMask & BitMask)
14513 return std::nullopt;
14522 return std::nullopt;
14526 if (!ShiftOp || Op.getValueType().isVector())
14527 return std::nullopt;
14529 uint64_t BitsProvided = Op.getValueSizeInBits();
14530 if (BitsProvided % 8 != 0)
14531 return std::nullopt;
14533 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
14535 return std::nullopt;
14537 uint64_t ConcatSizeInBytes = BitsProvided / 4;
14538 uint64_t ByteShift = BitShift / 8;
14540 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
14541 uint64_t BytesProvided = BitsProvided / 8;
14542 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
14543 NewIndex %= BytesProvided;
14550 return std::nullopt;
14554 return std::nullopt;
14556 uint64_t BitShift = ShiftOp->getZExtValue();
14558 return std::nullopt;
14560 auto BitsProvided = Op.getScalarValueSizeInBits();
14561 if (BitsProvided % 8 != 0)
14562 return std::nullopt;
14564 uint64_t BytesProvided = BitsProvided / 8;
14565 uint64_t ByteShift = BitShift / 8;
14570 return BytesProvided - ByteShift > Index
14578 return std::nullopt;
14582 return std::nullopt;
14584 uint64_t BitShift = ShiftOp->getZExtValue();
14585 if (BitShift % 8 != 0)
14586 return std::nullopt;
14587 uint64_t ByteShift = BitShift / 8;
14593 return Index < ByteShift
14596 Depth + 1, StartingIndex);
14605 return std::nullopt;
14613 NarrowBitWidth = VTSign->getVT().getSizeInBits();
14615 if (NarrowBitWidth % 8 != 0)
14616 return std::nullopt;
14617 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
14619 if (Index >= NarrowByteWidth)
14621 ? std::optional<ByteProvider<SDValue>>(
14629 return std::nullopt;
14633 if (NarrowByteWidth >= Index) {
14638 return std::nullopt;
14645 return std::nullopt;
14651 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
14652 if (NarrowBitWidth % 8 != 0)
14653 return std::nullopt;
14654 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
14659 if (Index >= NarrowByteWidth) {
14661 ? std::optional<ByteProvider<SDValue>>(
14666 if (NarrowByteWidth > Index) {
14670 return std::nullopt;
14675 return std::nullopt;
14678 Depth + 1, StartingIndex);
14684 return std::nullopt;
14685 auto VecIdx = IdxOp->getZExtValue();
14686 auto ScalarSize = Op.getScalarValueSizeInBits();
14687 if (ScalarSize < 32)
14688 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
14690 StartingIndex, Index);
14693 case AMDGPUISD::PERM: {
14695 return std::nullopt;
14699 return std::nullopt;
14702 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
14703 if (IdxMask > 0x07 && IdxMask != 0x0c)
14704 return std::nullopt;
14706 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
14707 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
14709 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
14715 return std::nullopt;
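// calculateByteProvider (with calculateSrcByte above) walks the DAG backwards
// from a 32-bit value and records, for each result byte, which byte of which
// source node ultimately supplies it, or whether it is known to be zero. The
// PERM-matching combine below uses those four answers to decide whether the
// whole expression can be replaced by a single PERM of at most two 32-bit
// sources.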
14730 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
14737 auto MemVT = L->getMemoryVT();
14740 return L->getMemoryVT().getSizeInBits() == 16;
14750 int Low8 = Mask & 0xff;
14751 int Hi8 = (Mask & 0xff00) >> 8;
14753 assert(Low8 < 8 && Hi8 < 8);
14755 bool IsConsecutive = (Hi8 - Low8 == 1);
14760 bool Is16Aligned = !(Low8 % 2);
14762 return IsConsecutive && Is16Aligned;
14770 int Low16 = PermMask & 0xffff;
14771 int Hi16 = (PermMask & 0xffff0000) >> 16;
14781 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
14783 if (!OtherOpIs16Bit)
14791 unsigned DWordOffset) {
14796 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
14801 if (Src.getValueType().isVector()) {
14802 auto ScalarTySize = Src.getScalarValueSizeInBits();
14803 auto ScalarTy = Src.getValueType().getScalarType();
14804 if (ScalarTySize == 32) {
14808 if (ScalarTySize > 32) {
14811 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
14812 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
14819 assert(ScalarTySize < 32);
14820 auto NumElements = TypeSize / ScalarTySize;
14821 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
14822 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
14823 auto NumElementsIn32 = 32 / ScalarTySize;
14824 auto NumAvailElements = DWordOffset < Trunc32Elements
14826 : NumElements - NormalizedTrunc;
14839 auto ShiftVal = 32 * DWordOffset;
14847 [[maybe_unused]] EVT VT = N->getValueType(0);
14852 for (int i = 0; i < 4; i++) {
14854 std::optional<ByteProvider<SDValue>> P =
14857 if (!P || P->isConstantZero())
14862 if (PermNodes.size() != 4)
14865 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
14866 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
14868 for (size_t i = 0; i < PermNodes.size(); i++) {
14869 auto PermOp = PermNodes[i];
14872 int SrcByteAdjust = 4;
14876 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
14877 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
14879 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
14880 ((PermOp.SrcOffset / 4) != SecondSrc->second))
14884 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
14885 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
14888 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
14890 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
14893 SDValue Op = *PermNodes[FirstSrc.first].Src;
14895 assert(Op.getValueSizeInBits() == 32);
14899 int Low16 = PermMask & 0xffff;
14900 int Hi16 = (PermMask & 0xffff0000) >> 16;
14902 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
14903 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
14906 if (WellFormedLow && WellFormedHi)
14910 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
14919 (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) &&
14920 (N->getOperand(1) == Op || N->getOperand(1) == OtherOp))
14925 assert(Op.getValueType().isByteSized() &&
14936 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
14943 DAGCombinerInfo &DCI) const {
14944 SelectionDAG &DAG = DCI.DAG;
14948 EVT VT = N->getValueType(0);
14949 if (VT == MVT::i1) {
14951 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
14952 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
14954 if (Src != RHS.getOperand(0))
14959 if (!CLHS || !CRHS)
14963 static const uint32_t MaxMask = 0x3ff;
14968 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
14977 LHS.getOpcode() == AMDGPUISD::PERM &&
14983 Sel |= LHS.getConstantOperandVal(2);
14985 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14992 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14996 auto usesCombinedOperand = [](SDNode *OrUse) {
14999 !OrUse->getValueType(0).isVector())
15003 for (auto *VUser : OrUse->users()) {
15004 if (!VUser->getValueType(0).isVector())
15011 if (VUser->getOpcode() == VectorwiseOp)
15017 if (!any_of(N->users(), usesCombinedOperand))
15023 if (LHSMask != ~0u && RHSMask != ~0u) {
15026 if (LHSMask > RHSMask) {
15033 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
15034 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
15037 if (!(LHSUsedLanes & RHSUsedLanes) &&
15040 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
15042 LHSMask &= ~RHSUsedLanes;
15043 RHSMask &= ~LHSUsedLanes;
15045 LHSMask |= LHSUsedLanes & 0x04040404;
15047 uint32_t Sel = LHSMask | RHSMask;
15050 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
15055 if (LHSMask == ~0u || RHSMask == ~0u) {
15096 return IdentitySrc;
15102 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
15117 if (SrcVT == MVT::i32) {
15122 DCI.AddToWorklist(LowOr.getNode());
15123 DCI.AddToWorklist(HiBits.getNode());
15134 N->getOperand(0), CRHS))
15142 DAGCombinerInfo &DCI) const {
15143 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
15150 SelectionDAG &DAG = DCI.DAG;
15152 EVT VT = N->getValueType(0);
15153 if (CRHS && VT == MVT::i64) {
15155 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
15162 unsigned Opc = LHS.getOpcode();
15192 LHS->getOperand(0), FNegLHS, FNegRHS);
15201 SITargetLowering::performZeroOrAnyExtendCombine(SDNode *N,
15202 DAGCombinerInfo &DCI) const {
15203 if (!Subtarget->has16BitInsts() ||
15207 EVT VT = N->getValueType(0);
15208 if (VT != MVT::i32)
15212 if (Src.getValueType() != MVT::i16)
15215 if (!Src->hasOneUse())
15222 std::optional<ByteProvider<SDValue>> BP0 =
15224 if (!BP0 || BP0->SrcOffset >= 4 || !BP0->Src)
15228 std::optional<ByteProvider<SDValue>> BP1 =
15230 if (!BP1 || BP1->SrcOffset >= 4 || !BP1->Src)
15238 SelectionDAG &DAG = DCI.DAG;
15240 uint32_t PermMask = 0x0c0c0c0c;
15243 PermMask = (PermMask & ~0xFF) | (BP0->SrcOffset + 4);
15248 PermMask = (PermMask & ~(0xFF << 8)) | (BP1->SrcOffset << 8);
15251 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, V0, V1,
15256 SITargetLowering::performSignExtendInRegCombine(SDNode *N,
15257 DAGCombinerInfo &DCI) const {
15263 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
15264 VTSign->getVT() == MVT::i8) ||
15265 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
15266 VTSign->getVT() == MVT::i16))) {
15267 assert(Subtarget->hasScalarSubwordLoads() &&
15268 "s_buffer_load_{u8, i8} are supported "
15269 "in GFX12 (or newer) architectures.");
15270 EVT VT = Src.getValueType();
15271 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
15272 ? AMDGPUISD::SBUFFER_LOAD_BYTE
15273 : AMDGPUISD::SBUFFER_LOAD_SHORT;
15275 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
15282 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
15283 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
15287 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
15288 VTSign->getVT() == MVT::i8) ||
15289 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
15290 VTSign->getVT() == MVT::i16)) &&
15299 Src.getOperand(6), Src.getOperand(7)};
15302 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
15303 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
15304 ? AMDGPUISD::BUFFER_LOAD_BYTE
15305 : AMDGPUISD::BUFFER_LOAD_SHORT;
15306 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
15307 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
15308 return DCI.DAG.getMergeValues(
15309 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
15315 DAGCombinerInfo &DCI) const {
15316 SelectionDAG &DAG = DCI.DAG;
15323 if (N->getOperand(0).isUndef())
15330 DAGCombinerInfo &DCI) const {
15331 EVT VT = N->getValueType(0);
15341 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
15348 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
15357 unsigned MaxDepth) const {
15358 unsigned Opcode = Op.getOpcode();
15363 const auto &F = CFP->getValueAPF();
15364 if (F.isNaN() && F.isSignaling())
15366 if (!F.isDenormal())
15398 case AMDGPUISD::FMUL_LEGACY:
15399 case AMDGPUISD::FMAD_FTZ:
15400 case AMDGPUISD::RCP:
15401 case AMDGPUISD::RSQ:
15402 case AMDGPUISD::RSQ_CLAMP:
15403 case AMDGPUISD::RCP_LEGACY:
15404 case AMDGPUISD::RCP_IFLAG:
15405 case AMDGPUISD::LOG:
15406 case AMDGPUISD::EXP:
15407 case AMDGPUISD::DIV_SCALE:
15408 case AMDGPUISD::DIV_FMAS:
15409 case AMDGPUISD::DIV_FIXUP:
15410 case AMDGPUISD::FRACT:
15411 case AMDGPUISD::CVT_PKRTZ_F16_F32:
15412 case AMDGPUISD::CVT_F32_UBYTE0:
15413 case AMDGPUISD::CVT_F32_UBYTE1:
15414 case AMDGPUISD::CVT_F32_UBYTE2:
15415 case AMDGPUISD::CVT_F32_UBYTE3:
15416 case AMDGPUISD::FP_TO_FP16:
15417 case AMDGPUISD::SIN_HW:
15418 case AMDGPUISD::COS_HW:
15429 if (Op.getValueType() == MVT::i32) {
15435 if (RHS->getZExtValue() == 0xffff0000) {
15445 return Op.getValueType().getScalarType() != MVT::f16;
15455 case AMDGPUISD::CLAMP:
15456 case AMDGPUISD::FMED3:
15457 case AMDGPUISD::FMAX3:
15458 case AMDGPUISD::FMIN3:
15459 case AMDGPUISD::FMAXIMUM3:
15460 case AMDGPUISD::FMINIMUM3: {
15466 if (Subtarget->supportsMinMaxDenormModes() ||
15476 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
15488 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
15515 if (Op.getValueType() == MVT::i16) {
15526 unsigned IntrinsicID = Op.getConstantOperandVal(0);
15528 switch (IntrinsicID) {
15529 case Intrinsic::amdgcn_cvt_pkrtz:
15530 case Intrinsic::amdgcn_cubeid:
15531 case Intrinsic::amdgcn_frexp_mant:
15532 case Intrinsic::amdgcn_fdot2:
15533 case Intrinsic::amdgcn_rcp:
15534 case Intrinsic::amdgcn_rsq:
15535 case Intrinsic::amdgcn_rsq_clamp:
15536 case Intrinsic::amdgcn_rcp_legacy:
15537 case Intrinsic::amdgcn_rsq_legacy:
15538 case Intrinsic::amdgcn_trig_preop:
15539 case Intrinsic::amdgcn_tanh:
15540 case Intrinsic::amdgcn_log:
15541 case Intrinsic::amdgcn_exp2:
15542 case Intrinsic::amdgcn_sqrt:
15560 unsigned MaxDepth) const {
15563 unsigned Opcode = MI->getOpcode();
15565 if (Opcode == AMDGPU::G_FCANONICALIZE)
15568 std::optional<FPValueAndVReg> FCR;
15571 if (FCR->Value.isSignaling())
15573 if (!FCR->Value.isDenormal())
15584 case AMDGPU::G_FADD:
15585 case AMDGPU::G_FSUB:
15586 case AMDGPU::G_FMUL:
15587 case AMDGPU::G_FCEIL:
15588 case AMDGPU::G_FFLOOR:
15589 case AMDGPU::G_FRINT:
15590 case AMDGPU::G_FNEARBYINT:
15591 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
15592 case AMDGPU::G_INTRINSIC_TRUNC:
15593 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
15594 case AMDGPU::G_FMA:
15595 case AMDGPU::G_FMAD:
15596 case AMDGPU::G_FSQRT:
15597 case AMDGPU::G_FDIV:
15598 case AMDGPU::G_FREM:
15599 case AMDGPU::G_FPOW:
15600 case AMDGPU::G_FPEXT:
15601 case AMDGPU::G_FLOG:
15602 case AMDGPU::G_FLOG2:
15603 case AMDGPU::G_FLOG10:
15604 case AMDGPU::G_FPTRUNC:
15605 case AMDGPU::G_AMDGPU_RCP_IFLAG:
15606 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
15607 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
15608 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
15609 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
15611 case AMDGPU::G_FNEG:
15612 case AMDGPU::G_FABS:
15613 case AMDGPU::G_FCOPYSIGN:
15615 case AMDGPU::G_FMINNUM:
15616 case AMDGPU::G_FMAXNUM:
15617 case AMDGPU::G_FMINNUM_IEEE:
15618 case AMDGPU::G_FMAXNUM_IEEE:
15619 case AMDGPU::G_FMINIMUM:
15620 case AMDGPU::G_FMAXIMUM:
15621 case AMDGPU::G_FMINIMUMNUM:
15622 case AMDGPU::G_FMAXIMUMNUM: {
15623 if (Subtarget->supportsMinMaxDenormModes() ||
15630 case AMDGPU::G_BUILD_VECTOR:
15635 case AMDGPU::G_INTRINSIC:
15636 case AMDGPU::G_INTRINSIC_CONVERGENT:
15638 case Intrinsic::amdgcn_fmul_legacy:
15639 case Intrinsic::amdgcn_fmad_ftz:
15640 case Intrinsic::amdgcn_sqrt:
15641 case Intrinsic::amdgcn_fmed3:
15642 case Intrinsic::amdgcn_sin:
15643 case Intrinsic::amdgcn_cos:
15644 case Intrinsic::amdgcn_log:
15645 case Intrinsic::amdgcn_exp2:
15646 case Intrinsic::amdgcn_log_clamp:
15647 case Intrinsic::amdgcn_rcp:
15648 case Intrinsic::amdgcn_rcp_legacy:
15649 case Intrinsic::amdgcn_rsq:
15650 case Intrinsic::amdgcn_rsq_clamp:
15651 case Intrinsic::amdgcn_rsq_legacy:
15652 case Intrinsic::amdgcn_div_scale:
15653 case Intrinsic::amdgcn_div_fmas:
15654 case Intrinsic::amdgcn_div_fixup:
15655 case Intrinsic::amdgcn_fract:
15656 case Intrinsic::amdgcn_cvt_pkrtz:
15657 case Intrinsic::amdgcn_cubeid:
15658 case Intrinsic::amdgcn_cubema:
15659 case Intrinsic::amdgcn_cubesc:
15660 case Intrinsic::amdgcn_cubetc:
15661 case Intrinsic::amdgcn_frexp_mant:
15662 case Intrinsic::amdgcn_fdot2:
15663 case Intrinsic::amdgcn_trig_preop:
15664 case Intrinsic::amdgcn_tanh:
15683 if (C.isDenormal()) {
15697 if (C.isSignaling()) {
15720 SITargetLowering::performFCanonicalizeCombine(SDNode *N,
15721 DAGCombinerInfo &DCI) const {
15722 SelectionDAG &DAG = DCI.DAG;
15724 EVT VT = N->getValueType(0);
15733 EVT VT = N->getValueType(0);
15734 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
15750 EVT EltVT = Lo.getValueType();
15753 for (unsigned I = 0; I != 2; ++I) {
15757 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
15758 } else if (Op.isUndef()) {
15794 return AMDGPUISD::FMAX3;
15796 return AMDGPUISD::FMAXIMUM3;
15798 return AMDGPUISD::SMAX3;
15800 return AMDGPUISD::UMAX3;
15804 return AMDGPUISD::FMIN3;
15806 return AMDGPUISD::FMINIMUM3;
15808 return AMDGPUISD::SMIN3;
15810 return AMDGPUISD::UMIN3;
15831 if (!MinK || !MaxK)
15843 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
15844 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
15845 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
15869 bool IsKnownNoNaNs) const {
15905 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15911 if (Info->getMode().DX10Clamp) {
15920 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
15952 case AMDGPUISD::FMIN_LEGACY:
15953 case AMDGPUISD::FMAX_LEGACY:
15954 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
15955 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
15958 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
15959 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
15960 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
15965 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
15974 DAGCombinerInfo &DCI) const {
15975 SelectionDAG &DAG = DCI.DAG;
15986 auto IsTreeWithCombinableChildren = [Opc](SDValue Op) {
15987 return Op.getOperand(0).getOpcode() == Opc &&
15988 Op.getOperand(1).getOpcode() == Opc &&
15989 (Op.getOperand(0).hasOneUse() || Op.getOperand(1).hasOneUse());
16000 Op1.hasOneUse() && !IsTreeWithCombinableChildren(Op0) &&
16001 !IsTreeWithCombinableChildren(Op1)) {
16013 !IsTreeWithCombinableChildren(Op0)) {
16024 !IsTreeWithCombinableChildren(Op1)) {
16033 uint64_t Clamp = 0;
16049 if (SDValue Med3 = performIntMed3ImmCombine(
16054 if (SDValue Med3 = performIntMed3ImmCombine(
16060 if (SDValue Med3 = performIntMed3ImmCombine(
16065 if (SDValue Med3 = performIntMed3ImmCombine(
16078 (Opc == AMDGPUISD::FMIN_LEGACY &&
16079 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
16080 (VT == MVT::f32 || VT == MVT::f64 ||
16081 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
16082 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
16083 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
16084 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
16086 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1,
16087 N->getFlags().hasNoNaNs()))
16094 const SDNodeFlags Flags = N->getFlags();
16096 !Subtarget->hasIEEEMinimumMaximumInsts() &&
16100 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
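// performMinMaxCombine handles two pattern families: chains of three min/max
// operands become the MIN3/MAX3-style nodes returned by the opcode mapping
// above, and clamp shapes such as min(max(x, K0), K1) with constant K0 <= K1
// fold into a single SMED3/UMED3/FMED3 via performIntMed3ImmCombine and
// performFPMed3ImmCombine (the FP form additionally requires that NaNs are
// known not to matter).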
16110 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
16111 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
16120 DAGCombinerInfo &DCI) const {
16121 EVT VT = N->getValueType(0);
16125 SelectionDAG &DAG = DCI.DAG;
16136 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
16140 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16144 if (Info->getMode().DX10Clamp) {
16157 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
16164 DAGCombinerInfo &DCI) const {
16168 return DCI.DAG.getUNDEF(N->getValueType(0));
16176 bool IsDivergentIdx,
16181 unsigned VecSize = EltSize * NumElem;
16184 if (VecSize <= 64 && EltSize < 32)
16193 if (IsDivergentIdx)
16197 unsigned NumInsts = NumElem +
16198 ((EltSize + 31) / 32) * NumElem ;
16202 if (Subtarget->useVGPRIndexMode())
16203 return NumInsts <= 16;
16207 if (Subtarget->hasMovrel())
16208 return NumInsts <= 15;
16214 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
16229 SITargetLowering::performExtractVectorEltCombine(SDNode *N,
16230 DAGCombinerInfo &DCI) const {
16236 EVT ResVT = N->getValueType(0);
16260 if (!C || C->getZExtValue() != 0x1f)
16276 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
16304 DCI.AddToWorklist(Elt0.getNode());
16305 DCI.AddToWorklist(Elt1.getNode());
16336 if (KImm && KImm->getValueType(0).getSizeInBits() == 64) {
16337 uint64_t KImmValue = KImm->getZExtValue();
16339 (KImmValue >> (32 * Idx->getZExtValue())) & 0xffffffff, SL, MVT::i32);
16342 if (KFPImm && KFPImm->getValueType(0).getSizeInBits() == 64) {
16343 uint64_t KFPImmValue =
16344 KFPImm->getValueAPF().bitcastToAPInt().getZExtValue();
16345 return DAG.getConstant((KFPImmValue >> (32 * Idx->getZExtValue())) &
16351 if (!DCI.isBeforeLegalize())
16358 VecSize > 32 && VecSize % 32 == 0 && Idx) {
16361 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
16362 unsigned EltIdx = BitIndex / 32;
16363 unsigned LeftoverBitIdx = BitIndex % 32;
16367 DCI.AddToWorklist(Cast.getNode());
16371 DCI.AddToWorklist(Elt.getNode());
16374 DCI.AddToWorklist(Srl.getNode());
16378 DCI.AddToWorklist(Trunc.getNode());
16380 if (VecEltVT == ResVT) {
16392 SITargetLowering::performInsertVectorEltCombine(SDNode *N,
16393 DAGCombinerInfo &DCI) const {
16404 SelectionDAG &DAG = DCI.DAG;
16424 Src.getOperand(0).getValueType() == MVT::f16) {
16425 return Src.getOperand(0);
16429 APFloat Val = CFP->getValueAPF();
16430 bool LosesInfo = true;
16440 DAGCombinerInfo &DCI) const {
16441 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
16442 "combine only useful on gfx8");
16444 SDValue TruncSrc = N->getOperand(0);
16445 EVT VT = N->getValueType(0);
16446 if (VT != MVT::f16)
16449 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
16453 SelectionDAG &DAG = DCI.DAG;
16484 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
16486 const SDNode *N1) const {
16491 if (((VT == MVT::f32 &&
16493 (VT == MVT::f16 && Subtarget->hasMadF16() &&
16513 EVT VT = N->getValueType(0);
16514 if (VT != MVT::i32 && VT != MVT::i64)
16520 unsigned Opc = N->getOpcode();
16575 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
16594 DAGCombinerInfo &DCI) const {
16597 SelectionDAG &DAG = DCI.DAG;
16598 EVT VT = N->getValueType(0);
16608 if (!N->isDivergent() && Subtarget->hasSMulHi())
16612 if (NumBits <= 32 || NumBits > 64)
16623 if (!Subtarget->hasFullRate64Ops()) {
16624 unsigned NumUsers = 0;
16625 for (SDNode *User : LHS->users()) {
16628 if (!User->isAnyAdd())
16652 bool MulSignedLo = false;
16653 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
16662 if (VT != MVT::i64) {
16685 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
16687 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
16688 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
16690 if (!MulLHSUnsigned32) {
16697 if (!MulRHSUnsigned32) {
16708 if (VT != MVT::i64)
16714 SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
16715 DAGCombinerInfo &DCI) const {
16725 SelectionDAG &DAG = DCI.DAG;
16740 unsigned Opcode = N->getOpcode();
16744 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
16755static std::optional<ByteProvider<SDValue>>
16758 if (!Byte0 || Byte0->isConstantZero()) {
16759 return std::nullopt;
16762 if (Byte1 && !Byte1->isConstantZero()) {
16763 return std::nullopt;
16769 unsigned FirstCs = First & 0x0c0c0c0c;
16770 unsigned SecondCs = Second & 0x0c0c0c0c;
16771 unsigned FirstNoCs = First & ~0x0c0c0c0c;
16772 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
16774 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
16775 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
16776 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
16777 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
16779 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
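// Illustrative aside: the merge rule above can be reproduced in isolation.
// Each byte of a v_perm-style select mask is either a source-byte selector or
// 0x0c ("pick constant zero"); merging keeps the real selectors from either
// side and keeps 0x0c only where both sides requested it. Standalone sketch
// (not the LLVM helper itself; the function name is invented):
#include <cassert>
#include <cstdint>
#include <cstdio>

static uint32_t addPermMasksSketch(uint32_t First, uint32_t Second) {
  uint32_t FirstCs = First & 0x0c0c0c0c;
  uint32_t SecondCs = Second & 0x0c0c0c0c;
  uint32_t FirstNoCs = First & ~0x0c0c0c0c;
  uint32_t SecondNoCs = Second & ~0x0c0c0c0c;
  return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
}

int main() {
  // One mask selects the low two bytes, the other the high two bytes; the
  // merged mask selects all four (0x07060100 in byte-selector numbering).
  uint32_t Merged = addPermMasksSketch(0x0c0c0100u, 0x07060c0cu);
  assert(Merged == 0x07060100u);
  std::printf("merged mask = 0x%08x\n", Merged);
  return 0;
}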
16803 for (int BPI = 0; BPI < 2; BPI++) {
16806 BPP = {Src1, Src0};
16808 unsigned ZeroMask = 0x0c0c0c0c;
16809 unsigned FMask = 0xFF << (8 * (3 - Step));
16811 unsigned FirstMask =
16812 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
16813 unsigned SecondMask =
16814 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
16818 int FirstGroup = -1;
16819 for (int I = 0; I < 2; I++) {
16821 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
16822 return IterElt.SrcOp == *BPP.first.Src &&
16823 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
16827 if (Match != Srcs.end()) {
16828 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
16833 if (FirstGroup != -1) {
16835 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
16836 return IterElt.SrcOp == *BPP.second.Src &&
16837 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
16840 if (Match != Srcs.end()) {
16841 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
16843 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
16851 unsigned ZeroMask = 0x0c0c0c0c;
16852 unsigned FMask = 0xFF << (8 * (3 - Step));
16856 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
16860 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
16869 if (Srcs.size() == 1) {
16870 auto *Elt = Srcs.begin();
16874 if (Elt->PermMask == 0x3020100)
16877 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
16881 auto *FirstElt = Srcs.begin();
16882 auto *SecondElt = std::next(FirstElt);
16889 auto FirstMask = FirstElt->PermMask;
16890 auto SecondMask = SecondElt->PermMask;
16892 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
16893 unsigned FirstPlusFour = FirstMask | 0x04040404;
16896 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
16908 FirstElt = std::next(SecondElt);
16909 if (FirstElt == Srcs.end())
16912 SecondElt = std::next(FirstElt);
16915 if (SecondElt == Srcs.end()) {
16920 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
16921 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
16927 return Perms.size() == 2
16933 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
16934 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
16935 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
16936 EntryMask += ZeroMask;
16941 auto Opcode = Op.getOpcode();
16943 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
16944 Opcode == AMDGPUISD::MUL_I24);
16947 static std::optional<bool>
16958 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
16961 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
16963 assert(!(S0IsUnsigned && S0IsSigned));
16964 assert(!(S1IsUnsigned && S1IsSigned));
16972 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
16978 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
16979 return std::nullopt;
16991 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
16992 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
16997 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
17003 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
17004 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
17005 return std::nullopt;
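// Illustrative aside: the signedness choice above comes from KnownBits.
// At least one known leading one proves the operand is negative (use the
// signed dot variant); at least one known leading zero proves it is
// non-negative (the unsigned variant is safe); anything else makes the
// combine return std::nullopt. Standalone sketch with plain integers standing
// in for KnownBits (struct and field names invented for illustration):
#include <cassert>
#include <cstdint>

struct KnownBits8 {
  uint8_t One;  // bits proven to be 1
  uint8_t Zero; // bits proven to be 0
  bool signBitKnownOne() const { return One & 0x80; }
  bool signBitKnownZero() const { return Zero & 0x80; }
};

int main() {
  KnownBits8 SignExt = {0xF0, 0x00}; // top bits proven 1: must treat as signed
  KnownBits8 ZeroExt = {0x00, 0xF0}; // top bits proven 0: unsigned is safe
  assert(SignExt.signBitKnownOne());
  assert(ZeroExt.signBitKnownZero());
  return 0;
}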
17011 DAGCombinerInfo &DCI) const {
17012 SelectionDAG &DAG = DCI.DAG;
17013 EVT VT = N->getValueType(0);
17019 if (Subtarget->hasMad64_32()) {
17020 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
17025 if (SDValue V = reassociateScalarOps(N, DAG)) {
17029 if (VT == MVT::i64) {
17030 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
17035 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
17037 std::optional<bool> IsSigned;
17043 int ChainLength = 0;
17044 for (int I = 0; I < 4; I++) {
17048 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
17051 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
17056 TempNode->getOperand(MulIdx), *Src0, *Src1,
17057 TempNode->getOperand(MulIdx)->getOperand(0),
17058 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
17062 IsSigned = *IterIsSigned;
17063 if (*IterIsSigned != *IsSigned)
17066 auto AddIdx = 1 - MulIdx;
17069 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
17070 Src2s.push_back(TempNode->getOperand(AddIdx));
17080 TempNode->getOperand(AddIdx), *Src0, *Src1,
17081 TempNode->getOperand(AddIdx)->getOperand(0),
17082 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
17086 if (*IterIsSigned != *IsSigned)
17090 ChainLength = I + 2;
17094 TempNode = TempNode->getOperand(AddIdx);
17096 ChainLength = I + 1;
17097 if (TempNode->getNumOperands() < 2)
17099 LHS = TempNode->getOperand(0);
17100 RHS = TempNode->getOperand(1);
17103 if (ChainLength < 2)
17109 if (ChainLength < 4) {
17119 bool UseOriginalSrc = false;
17120 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
17121 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
17122 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
17123 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
17124 SmallVector<unsigned, 4> SrcBytes;
17125 auto Src0Mask = Src0s.begin()->PermMask;
17126 SrcBytes.push_back(Src0Mask & 0xFF000000);
17127 bool UniqueEntries = true;
17128 for (auto I = 1; I < 4; I++) {
17129 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
17132 UniqueEntries = false;
17138 if (UniqueEntries) {
17139 UseOriginalSrc = true;
17141 auto *FirstElt = Src0s.begin();
17145 auto *SecondElt = Src1s.begin();
17147 SecondElt->DWordOffset);
17156 if (!UseOriginalSrc) {
17163 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
17166 : Intrinsic::amdgcn_udot4,
17176 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
17181 unsigned Opc = LHS.getOpcode();
17193 auto Cond = RHS.getOperand(0);
17198 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
17215 DAGCombinerInfo &DCI) const {
17216 SelectionDAG &DAG = DCI.DAG;
17218 EVT VT = N->getValueType(0);
17231 SDNodeFlags ShlFlags = N1->getFlags();
17235 SDNodeFlags NewShlFlags =
17240 DCI.AddToWorklist(Inner.getNode());
17247 if (Subtarget->hasMad64_32()) {
17248 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
17257 if (VT == MVT::i64) {
17258 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
17271 if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
17272 Y->isDivergent() != Z->isDivergent()) {
17281 if (Y->isDivergent())
17284 SDNodeFlags ReassocFlags =
17287 DCI.AddToWorklist(UniformInner.getNode());
17299 DAGCombinerInfo &DCI) const {
17300 SelectionDAG &DAG = DCI.DAG;
17301 EVT VT = N->getValueType(0);
17303 if (VT == MVT::i64) {
17304 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
17308 if (VT != MVT::i32)
17317 unsigned Opc = RHS.getOpcode();
17324 auto Cond = RHS.getOperand(0);
17329 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
17355 ConstantSDNode *ShiftAmt =
17357 unsigned BitWidth = X.getValueType().getScalarSizeInBits();
17368 SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
17369 DAGCombinerInfo &DCI) const {
17371 if (N->getValueType(0) != MVT::i32)
17377 SelectionDAG &DAG = DCI.DAG;
17382 unsigned LHSOpc = LHS.getOpcode();
17383 unsigned Opc = N->getOpcode();
17387 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
17393 DAGCombinerInfo &DCI) const {
17397 SelectionDAG &DAG = DCI.DAG;
17398 EVT VT = N->getValueType(0);
17410 if (A == LHS.getOperand(1)) {
17411 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
17412 if (FusedOp != 0) {
17414 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
17422 if (A == RHS.getOperand(1)) {
17423 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
17424 if (FusedOp != 0) {
17426 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
17435 DAGCombinerInfo &DCI) const {
17439 SelectionDAG &DAG = DCI.DAG;
17441 EVT VT = N->getValueType(0);
17454 if (A == LHS.getOperand(1)) {
17455 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
17456 if (FusedOp != 0) {
17460 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
17469 if (A == RHS.getOperand(1)) {
17470 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
17471 if (FusedOp != 0) {
17473 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
17482 DAGCombinerInfo &DCI) const {
17483 SelectionDAG &DAG = DCI.DAG;
17485 EVT VT = N->getValueType(0);
17494 SDNodeFlags Flags = N->getFlags();
17495 SDNodeFlags RHSFlags = RHS->getFlags();
17501 bool IsNegative = false;
17502 if (CLHS->isExactlyValue(1.0) ||
17503 (IsNegative = CLHS->isExactlyValue(-1.0))) {
17509 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
17519 DAGCombinerInfo &DCI) const {
17520 SelectionDAG &DAG = DCI.DAG;
17521 EVT VT = N->getValueType(0);
17525 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
17526 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
17541 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
17546 const ConstantFPSDNode *FalseNode =
17556 if (ScalarVT == MVT::f32 &&
17562 if (TrueNodeExpVal == INT_MIN)
17565 if (FalseNodeExpVal == INT_MIN)
17585 DAGCombinerInfo &DCI) const {
17586 SelectionDAG &DAG = DCI.DAG;
17587 EVT VT = N->getValueType(0);
17590 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
17608 (N->getFlags().hasAllowContract() &&
17609 FMA->getFlags().hasAllowContract())) {
17643 if (Vec1 == Vec2 || Vec3 == Vec4)
17649 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
17650 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
17693 EVT VT = LHS.getValueType();
17694 assert(VT == MVT::f64 && "Incorrect operand type!");
17726 if (CC == ISD::SETOEQ && LHSMaybeNaN && RHSMaybeNaN)
17730 if (CC == ISD::SETUEQ && (LHSMaybeNaN || RHSMaybeNaN))
17734 if (CC == ISD::SETONE && (LHSMaybeNaN || RHSMaybeNaN))
17738 if (CC == ISD::SETUNE && LHSMaybeNaN && RHSMaybeNaN)
17741 const std::optional<bool> KnownEq =
17770 if (CC == ISD::SETULT && (LHSMaybeNaN || RHSMaybeNaN))
17774 if (CC == ISD::SETOGE && (LHSMaybeNaN || RHSMaybeNaN))
17782 const std::optional<bool> KnownUge =
17807 if (CC == ISD::SETOLE && (LHSMaybeNaN || RHSMaybeNaN))
17821 if (CC == ISD::SETUGT && (LHSMaybeNaN || RHSMaybeNaN))
17824 const std::optional<bool> KnownUle =
17847 DAGCombinerInfo &DCI) const {
17848 SelectionDAG &DAG = DCI.DAG;
17853 EVT VT = LHS.getValueType();
17882 return LHS.getOperand(0);
17896 const APInt &CT = LHS.getConstantOperandAPInt(1);
17897 const APInt &CF = LHS.getConstantOperandAPInt(2);
17902 return DAG.getNOT(SL, LHS.getOperand(0), MVT::i1);
17905 return LHS.getOperand(0);
17926 if (VT == MVT::i64) {
17938 const std::optional<bool> KnownEq =
17946 const std::optional<bool> KnownEq =
17957 const std::optional<bool> KnownUge =
17977 const std::optional<bool> KnownUle =
18028 DAG.getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
18033 {Op0Hi, Op1Hi, CarryInHi});
18043 DCI.CombineTo(LHS.getNode(), Result);
18047 if (VT != MVT::f32 && VT != MVT::f64 &&
18048 (!Subtarget->has16BitInsts() || VT != MVT::f16))
18063 const unsigned IsInfMask =
18065 const unsigned IsFiniteMask =
18070 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
18075 if (VT == MVT::f64) {
18086 SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
18087 DAGCombinerInfo &DCI) const {
18088 SelectionDAG &DAG = DCI.DAG;
18090 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
18109 unsigned ShiftOffset = 8 * Offset;
18111 ShiftOffset -= C->getZExtValue();
18113 ShiftOffset += C->getZExtValue();
18115 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
18116 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
18117 MVT::f32, Shifted);
18128 DCI.AddToWorklist(N);
18135 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
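// Illustrative aside: the ShiftOffset arithmetic above folds a right shift by
// a whole number of bytes into the cvt_f32_ubyteN byte selector, i.e.
// byte N of (x >> 8*k) equals byte N+k of x. A standalone sketch of that
// identity (plain integer math, not the DAG combine itself):
#include <cassert>
#include <cstdint>

static uint8_t byteOf(uint32_t X, unsigned N) { return (X >> (8 * N)) & 0xff; }

int main() {
  uint32_t X = 0xAABBCCDDu;
  for (unsigned N = 0; N < 4; ++N) {
    for (unsigned K = 0; N + K < 4; ++K) {
      unsigned ShiftOffset = 8 * N + 8 * K; // selector index plus folded srl
      assert(ShiftOffset < 32 && ShiftOffset % 8 == 0);
      assert(byteOf(X >> (8 * K), N) == byteOf(X, ShiftOffset / 8));
    }
  }
  return 0;
}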
18141 DAGCombinerInfo &DCI) const {
18146 const MachineFunction &MF = DCI.DAG.getMachineFunction();
18150 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
18151 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
18154 APFloat One(F.getSemantics(), "1.0");
18156 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
18162 DAGCombinerInfo &DCI) const {
18183 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
18184 bool isInteger = LHS.getValueType().isInteger();
18187 if (!isFloatingPoint && !isInteger)
18192 if (!isEquality && !isNonEquality)
18209 if (isFloatingPoint) {
18211 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
18222 if (!(isEquality && TrueVal == ConstVal) &&
18223 !(isNonEquality && FalseVal == ConstVal))
18230 SelectLHS, SelectRHS);
18235 switch (N->getOpcode()) {
18251 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
18261 switch (N->getOpcode()) {
18263 return performAddCombine(N, DCI);
18265 return performPtrAddCombine(N, DCI);
18267 return performSubCombine(N, DCI);
18270 return performAddCarrySubCarryCombine(N, DCI);
18272 return performFAddCombine(N, DCI);
18274 return performFSubCombine(N, DCI);
18276 return performFDivCombine(N, DCI);
18278 return performFMulCombine(N, DCI);
18280 return performSetCCCombine(N, DCI);
18282 if (auto Res = performSelectCombine(N, DCI))
18297 case AMDGPUISD::FMIN_LEGACY:
18298 case AMDGPUISD::FMAX_LEGACY:
18299 return performMinMaxCombine(N, DCI);
18301 return performFMACombine(N, DCI);
18303 return performAndCombine(N, DCI);
18305 return performOrCombine(N, DCI);
18308 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
18309 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
18315 return performXorCombine(N, DCI);
18318 return performZeroOrAnyExtendCombine(N, DCI);
18320 return performSignExtendInRegCombine(N, DCI);
18321 case AMDGPUISD::FP_CLASS:
18322 return performClassCombine(N, DCI);
18324 return performFCanonicalizeCombine(N, DCI);
18325 case AMDGPUISD::RCP:
18326 return performRcpCombine(N, DCI);
18328 case AMDGPUISD::FRACT:
18329 case AMDGPUISD::RSQ:
18330 case AMDGPUISD::RCP_LEGACY:
18331 case AMDGPUISD::RCP_IFLAG:
18332 case AMDGPUISD::RSQ_CLAMP: {
18341 return performUCharToFloatCombine(N, DCI);
18343 return performFCopySignCombine(N, DCI);
18344 case AMDGPUISD::CVT_F32_UBYTE0:
18345 case AMDGPUISD::CVT_F32_UBYTE1:
18346 case AMDGPUISD::CVT_F32_UBYTE2:
18347 case AMDGPUISD::CVT_F32_UBYTE3:
18348 return performCvtF32UByteNCombine(N, DCI);
18349 case AMDGPUISD::FMED3:
18350 return performFMed3Combine(N, DCI);
18351 case AMDGPUISD::CVT_PKRTZ_F16_F32:
18352 return performCvtPkRTZCombine(N, DCI);
18353 case AMDGPUISD::CLAMP:
18354 return performClampCombine(N, DCI);
18357 EVT VT = N->getValueType(0);
18360 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
18363 EVT EltVT = Src.getValueType();
18364 if (EltVT != MVT::i16)
18374 return performExtractVectorEltCombine(N, DCI);
18376 return performInsertVectorEltCombine(N, DCI);
18378 return performFPRoundCombine(N, DCI);
18387 return performMemSDNodeCombine(MemNode, DCI);
18418 unsigned Opcode = Node->getMachineOpcode();
18421 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
18422 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
18425 SDNode *Users[5] = {nullptr};
18427 unsigned DmaskIdx =
18428 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
18429 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
18430 unsigned NewDmask = 0;
18431 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
18432 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
18433 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
18434 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
18435 unsigned TFCLane = 0;
18436 bool HasChain = Node->getNumValues() > 1;
18438 if (OldDmask == 0) {
18446 TFCLane = OldBitsSet;
18450 for (SDUse &Use : Node->uses()) {
18453 if (Use.getResNo() != 0)
18456 SDNode *User = Use.getUser();
18459 if (!User->isMachineOpcode() ||
18460 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
18472 if (UsesTFC && Lane == TFCLane) {
18477 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
18479 Dmask &= ~(1 << Comp);
18487 NewDmask |= 1 << Comp;
18492 bool NoChannels = !NewDmask;
18499 if (OldBitsSet == 1)
18505 if (NewDmask == OldDmask)
18514 unsigned NewChannels = BitsSet + UsesTFC;
18518 assert(NewOpcode != -1 &&
18519 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
18520 "failed to find equivalent MIMG op");
18528 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
18530 MVT ResultVT = NewChannels == 1
18533 : NewChannels == 5 ? 8
18535 SDVTList NewVTList =
18538 MachineSDNode *NewNode =
18547 if (NewChannels == 1) {
18557 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
18562 if (i || !NoChannels)
18567 if (NewUser != User) {
18577 Idx = AMDGPU::sub1;
18580 Idx = AMDGPU::sub2;
18583 Idx = AMDGPU::sub3;
18586 Idx = AMDGPU::sub4;
18597 Op = Op.getOperand(0);
18622 Node->getOperand(0), SL, VReg, SrcVal,
18628 return ToResultReg.getNode();
18633 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
18635 Ops.push_back(Node->getOperand(i));
18641 Node->getOperand(i).getValueType(),
18642 Node->getOperand(i)),
18654 unsigned Opcode = Node->getMachineOpcode();
18656 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
18657 !TII->isGather4(Opcode) &&
18659 return adjustWritemask(Node, DAG);
18662 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
18668 case AMDGPU::V_DIV_SCALE_F32_e64:
18669 case AMDGPU::V_DIV_SCALE_F64_e64: {
18679 (Src0 == Src1 || Src0 == Src2))
18735 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
18736 unsigned InitIdx = 0;
18738 if (TII->isImage(MI)) {
18746 unsigned TFEVal = TFE ? TFE->getImm() : 0;
18747 unsigned LWEVal = LWE ? LWE->getImm() : 0;
18748 unsigned D16Val = D16 ? D16->getImm() : 0;
18750 if (!TFEVal && !LWEVal)
18761 assert(MO_Dmask && "Expected dmask operand in instruction");
18763 unsigned dmask = MO_Dmask->getImm();
18768 bool Packed = !Subtarget->hasUnpackedD16VMem();
18770 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
18777 uint32_t DstSize = TRI.getRegSizeInBits(*DstRC) / 32;
18778 if (DstSize < InitIdx)
18782 InitIdx = TRI.getRegSizeInBits(*DstRC) / 32;
18791 unsigned NewDst = 0;
18796 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
18797 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
18800 for (; SizeLeft; SizeLeft--, CurrIdx++) {
18821 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
18833 if (TII->isVOP3(MI.getOpcode())) {
18835 TII->legalizeOperandsVOP3(MRI, MI);
18837 if (TII->isMAI(MI)) {
18842 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
18843 AMDGPU::OpName::scale_src0);
18844 if (Src0Idx != -1) {
18845 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
18846 AMDGPU::OpName::scale_src1);
18847 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
18848 TII->usesConstantBus(MRI, MI, Src1Idx))
18849 TII->legalizeOpWithMove(MI, Src1Idx);
18856 if (TII->isImage(MI))
18857 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
18931 std::pair<unsigned, const TargetRegisterClass *>
18938 if (Constraint.size() == 1) {
18942 if (VT == MVT::Other)
18945 switch (Constraint[0]) {
18952 RC = &AMDGPU::SReg_32RegClass;
18955 RC = &AMDGPU::SGPR_64RegClass;
18960 return std::pair(0U, nullptr);
18967 return std::pair(0U, nullptr);
18969 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
18970 : &AMDGPU::VGPR_32_Lo256RegClass;
18973 RC = Subtarget->has1024AddressableVGPRs()
18974 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
18977 return std::pair(0U, nullptr);
18982 if (!Subtarget->hasMAIInsts())
18986 return std::pair(0U, nullptr);
18988 RC = &AMDGPU::AGPR_32RegClass;
18993 return std::pair(0U, nullptr);
18998 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
19002 RC = &AMDGPU::AV_32RegClass;
19005 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
19007 return std::pair(0U, nullptr);
19016 return std::pair(0U, RC);
19019 if (Kind != '\0') {
19021 RC = &AMDGPU::VGPR_32_Lo256RegClass;
19022 } else if (Kind == 's') {
19023 RC = &AMDGPU::SGPR_32RegClass;
19024 } else if (Kind == 'a') {
19025 RC = &AMDGPU::AGPR_32RegClass;
19031 return std::pair(0U, nullptr);
19037 return std::pair(0U, nullptr);
19041 RC = TRI->getVGPRClassForBitWidth(Width);
19043 RC = TRI->getSGPRClassForBitWidth(Width);
19045 RC = TRI->getAGPRClassForBitWidth(Width);
19047 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
19052 return std::pair(0U, nullptr);
19054 return std::pair(Reg, RC);
19060 return std::pair(0U, nullptr);
19061 if (RC && Idx < RC->getNumRegs())
19063 return std::pair(0U, nullptr);
19069 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
19075 if (Constraint.size() == 1) {
19076 switch (Constraint[0]) {
19086 } else if (Constraint == "DA" || Constraint == "DB") {
19094 if (Constraint.size() == 1) {
19095 switch (Constraint[0]) {
19103 } else if (Constraint.size() == 2) {
19104 if (Constraint == "VA")
19122 std::vector<SDValue> &Ops,
19137 unsigned Size = Op.getScalarValueSizeInBits();
19141 if (Size == 16 && !Subtarget->has16BitInsts())
19145 Val = C->getSExtValue();
19149 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
19153 if (Size != 16 || Op.getNumOperands() != 2)
19155 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
19158 Val = C->getSExtValue();
19162 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
19172 if (Constraint.size() == 1) {
19173 switch (Constraint[0]) {
19188 } else if (Constraint.size() == 2) {
19189 if (Constraint == "DA") {
19190 int64_t HiBits = static_cast<int32_t>(Val >> 32);
19191 int64_t LoBits = static_cast<int32_t>(Val);
19195 if (Constraint == "DB") {
19203 unsigned MaxSize) const {
19204 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
19205 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
19207 MVT VT = Op.getSimpleValueType();
19232 switch (UnalignedClassID) {
19233 case AMDGPU::VReg_64RegClassID:
19234 return AMDGPU::VReg_64_Align2RegClassID;
19235 case AMDGPU::VReg_96RegClassID:
19236 return AMDGPU::VReg_96_Align2RegClassID;
19237 case AMDGPU::VReg_128RegClassID:
19238 return AMDGPU::VReg_128_Align2RegClassID;
19239 case AMDGPU::VReg_160RegClassID:
19240 return AMDGPU::VReg_160_Align2RegClassID;
19241 case AMDGPU::VReg_192RegClassID:
19242 return AMDGPU::VReg_192_Align2RegClassID;
19243 case AMDGPU::VReg_224RegClassID:
19244 return AMDGPU::VReg_224_Align2RegClassID;
19245 case AMDGPU::VReg_256RegClassID:
19246 return AMDGPU::VReg_256_Align2RegClassID;
19247 case AMDGPU::VReg_288RegClassID:
19248 return AMDGPU::VReg_288_Align2RegClassID;
19249 case AMDGPU::VReg_320RegClassID:
19250 return AMDGPU::VReg_320_Align2RegClassID;
19251 case AMDGPU::VReg_352RegClassID:
19252 return AMDGPU::VReg_352_Align2RegClassID;
19253 case AMDGPU::VReg_384RegClassID:
19254 return AMDGPU::VReg_384_Align2RegClassID;
19255 case AMDGPU::VReg_512RegClassID:
19256 return AMDGPU::VReg_512_Align2RegClassID;
19257 case AMDGPU::VReg_1024RegClassID:
19258 return AMDGPU::VReg_1024_Align2RegClassID;
19259 case AMDGPU::AReg_64RegClassID:
19260 return AMDGPU::AReg_64_Align2RegClassID;
19261 case AMDGPU::AReg_96RegClassID:
19262 return AMDGPU::AReg_96_Align2RegClassID;
19263 case AMDGPU::AReg_128RegClassID:
19264 return AMDGPU::AReg_128_Align2RegClassID;
19265 case AMDGPU::AReg_160RegClassID:
19266 return AMDGPU::AReg_160_Align2RegClassID;
19267 case AMDGPU::AReg_192RegClassID:
19268 return AMDGPU::AReg_192_Align2RegClassID;
19269 case AMDGPU::AReg_256RegClassID:
19270 return AMDGPU::AReg_256_Align2RegClassID;
19271 case AMDGPU::AReg_512RegClassID:
19272 return AMDGPU::AReg_512_Align2RegClassID;
19273 case AMDGPU::AReg_1024RegClassID:
19274 return AMDGPU::AReg_1024_Align2RegClassID;
19290 if (Info->isEntryFunction()) {
19297 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
19299 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
19300 : TRI->getAlignedHighSGPRForRC(MF, 2,
19301 &AMDGPU::SGPR_64RegClass);
19302 Info->setSGPRForEXECCopy(SReg);
19304 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
19305 Info->getStackPtrOffsetReg()));
19306 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
19307 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
19311 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
19312 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
19314 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
19317 Info->limitOccupancy(MF);
19319 if (ST.isWave32() && !MF.empty()) {
19320 for (auto &MBB : MF) {
19321 for (auto &MI : MBB) {
19322 TII->fixImplicitOperands(MI);
19332 if (ST.needsAlignedVGPRs()) {
19339 if (NewClassID != -1)
19349 const APInt &DemandedElts,
19351 unsigned Depth) const {
19353 unsigned Opc = Op.getOpcode();
19356 unsigned IID = Op.getConstantOperandVal(0);
19358 case Intrinsic::amdgcn_mbcnt_lo:
19359 case Intrinsic::amdgcn_mbcnt_hi: {
19365 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
19375 Op, Known, DemandedElts, DAG, Depth);
19391 unsigned MaxValue =
19398 unsigned BFEWidth, bool SExt, unsigned Depth) {
19402 unsigned Src1Cst = 0;
19403 if (Src1.isImm()) {
19404 Src1Cst = Src1.getImm();
19405 } else if (Src1.isReg()) {
19409 Src1Cst = Cst->Value.getZExtValue();
19420 if (Width >= BFEWidth)
19429 Known = Known.sext(BFEWidth);
19431 Known = Known.zext(BFEWidth);
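// Illustrative aside: the sext/zext split above mirrors the difference
// between the signed (S_BFE_I*) and unsigned (S_BFE_U*) bitfield extracts.
// Standalone sketch in plain C++ (not the KnownBits API; Width > 0 assumed
// for the signed variant):
#include <cassert>
#include <cstdint>

static uint32_t bfeU32(uint32_t Src, unsigned Offset, unsigned Width) {
  if (Width == 0)
    return 0;
  return (Src >> Offset) & ((Width < 32) ? ((1u << Width) - 1) : ~0u);
}

static int32_t bfeI32(uint32_t Src, unsigned Offset, unsigned Width) {
  uint32_t Field = bfeU32(Src, Offset, Width);
  uint32_t SignBit = 1u << (Width - 1);
  return (int32_t)((Field ^ SignBit) - SignBit); // sign-extend Width bits
}

int main() {
  uint32_t Src = 0x0000F500; // field of interest: bits [15:8] = 0xF5
  assert(bfeU32(Src, 8, 8) == 0xF5); // zero-extended: 245
  assert(bfeI32(Src, 8, 8) == -11);  // sign-extended: 0xF5 -> -11
  return 0;
}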
19437 unsigned Depth) const {
19440 switch (MI->getOpcode()) {
19441 case AMDGPU::S_BFE_I32:
19444 case AMDGPU::S_BFE_U32:
19447 case AMDGPU::S_BFE_I64:
19450 case AMDGPU::S_BFE_U64:
19453 case AMDGPU::G_INTRINSIC:
19454 case AMDGPU::G_INTRINSIC_CONVERGENT: {
19457 case Intrinsic::amdgcn_workitem_id_x:
19460 case Intrinsic::amdgcn_workitem_id_y:
19463 case Intrinsic::amdgcn_workitem_id_z:
19466 case Intrinsic::amdgcn_mbcnt_lo:
19467 case Intrinsic::amdgcn_mbcnt_hi: {
19479 case Intrinsic::amdgcn_groupstaticsize: {
19490 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
19493 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
19496 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
19501 case AMDGPU::G_AMDGPU_SMED3:
19502 case AMDGPU::G_AMDGPU_UMED3: {
19503 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
19530 unsigned Depth) const {
19537 AttributeList Attrs =
19539 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
19557 if (Header->getAlignment() != PrefAlign)
19558 return Header->getAlignment();
19559 if (needsFetchWindowAlignment(*Header))
19580 if (Header->getAlignment() != PrefAlign)
19581 return Header->getAlignment();
19583 unsigned LoopSize = 0;
19588 LoopSize += MBB->getAlignment().value() / 2;
19591 LoopSize += TII->getInstSizeInBytes(MI);
19592 if (LoopSize > 192)
19597 if (LoopSize <= 64)
19600 if (LoopSize <= 128)
19601 return CacheLineAlign;
19607 auto I = Exit->getFirstNonDebugInstr();
19608 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
19609 return CacheLineAlign;
19618 if (PreTerm == Pre->begin() ||
19619 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
19623 auto ExitHead = Exit->getFirstNonDebugInstr();
19624 if (ExitHead == Exit->end() ||
19625 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
19630 return CacheLineAlign;
19638 if (needsFetchWindowAlignment(*MBB))
19643 bool SITargetLowering::needsFetchWindowAlignment(
19645 if (!getSubtarget()->hasLoopHeadInstSplitSensitivity())
19649 if (MI.isMetaInstruction())
19652 return TII->getInstSizeInBytes(MI) > 4;
19662 N = N->getOperand(0).getNode();
19672 switch (N->getOpcode()) {
19680 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
19681 return !TRI->isSGPRReg(MRI, Reg);
19687 return !TRI->isSGPRReg(MRI, Reg);
19691 unsigned AS = L->getAddressSpace();
19701 case AMDGPUISD::ATOMIC_CMP_SWAP:
19702 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
19703 case AMDGPUISD::BUFFER_ATOMIC_ADD:
19704 case AMDGPUISD::BUFFER_ATOMIC_SUB:
19705 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
19706 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
19707 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
19708 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
19709 case AMDGPUISD::BUFFER_ATOMIC_AND:
19710 case AMDGPUISD::BUFFER_ATOMIC_OR:
19711 case AMDGPUISD::BUFFER_ATOMIC_XOR:
19712 case AMDGPUISD::BUFFER_ATOMIC_INC:
19713 case AMDGPUISD::BUFFER_ATOMIC_DEC:
19714 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
19715 case AMDGPUISD::BUFFER_ATOMIC_FADD:
19716 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
19717 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
19723 return A->readMem() && A->writeMem();
19744 switch (Ty.getScalarSizeInBits()) {
19756 const APInt &DemandedElts,
19759 unsigned Depth) const {
19760 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
19764 if (Info->getMode().DX10Clamp)
19776 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
19796 << "Hardware instruction generated for atomic "
19798 << " operation at memory scope " << MemScope;
19803 Type *EltTy = VT->getElementType();
19804 return VT->getNumElements() == 2 &&
19824 unsigned BW = IT->getBitWidth();
19825 return BW == 32 || BW == 64;
19839 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
19840 return BW == 32 || BW == 64;
19843 if (Ty->isFloatTy() || Ty->isDoubleTy())
19847 return VT->getNumElements() == 2 &&
19848 VT->getElementType()->getPrimitiveSizeInBits() == 16;
19858 bool HasSystemScope) {
19865 if (HasSystemScope) {
19866 if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics() &&
19869 if (Subtarget.hasEmulatedSystemScopeAtomics())
19871 } else if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics())
19874 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
19887 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
19895 return STI.hasGloballyAddressableScratch()
19913 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
19926 bool HasSystemScope =
19958 if (!IT || IT->getBitWidth() != 32)
19964 if (Subtarget->hasEmulatedSystemScopeAtomics())
19980 if (!HasSystemScope &&
19981 Subtarget->hasAgentScopeFineGrainedRemoteMemoryAtomics())
19993 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
20001 ConstVal && ConstVal->isNullValue())
20039 if (Ty->isFloatTy()) {
20044 if (Ty->isDoubleTy()) {
20065 if (Ty->isFloatTy() &&
20066 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
20079 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
20083 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
20087 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
20092 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
20097 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
20101 if (Ty->isFloatTy()) {
20104 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
20107 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
20112 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
20120 if (Subtarget->hasFlatAtomicFaddF32Inst())
20129 if (Subtarget->hasLDSFPAtomicAddF32()) {
20130 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
20132 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
20160 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
20162 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
20166 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
20168 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
20222 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
20223 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
20224 : &AMDGPU::SReg_32RegClass;
20225 if (!TRI->isSGPRClass(RC) && !isDivergent)
20226 return TRI->getEquivalentSGPRClass(RC);
20227 if (TRI->isSGPRClass(RC) && isDivergent) {
20228 if (Subtarget->hasGFX90AInsts())
20229 return TRI->getEquivalentAVClass(RC);
20230 return TRI->getEquivalentVGPRClass(RC);
20243 unsigned WaveSize) {
20248 if (!IT || IT->getBitWidth() != WaveSize)
20253 if (!Visited.insert(V).second)
20255 bool Result = false;
20256 for (const auto *U : V->users()) {
20258 if (V == U->getOperand(1)) {
20263 case Intrinsic::amdgcn_if_break:
20264 case Intrinsic::amdgcn_if:
20265 case Intrinsic::amdgcn_else:
20270 if (V == U->getOperand(0)) {
20275 case Intrinsic::amdgcn_end_cf:
20276 case Intrinsic::amdgcn_loop:
20282 Result = hasCFUser(U, Visited, WaveSize);
20291 const Value *V) const {
20293 if (CI->isInlineAsm()) {
20302 for (auto &TC : TargetConstraints) {
20316 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
20351 if (I.getMetadata("amdgpu.noclobber"))
20353 if (I.getMetadata("amdgpu.last.use"))
20417 Alignment = RMW->getAlign();
20430 bool FullFlatEmulation =
20432 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
20433 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
20434 RMW->getType()->isDoubleTy()));
20437 bool ReturnValueIsUsed = !AI->use_empty();
20446 if (FullFlatEmulation) {
20457 std::prev(BB->end())->eraseFromParent();
20458 Builder.SetInsertPoint(BB);
20460 Value *LoadedShared = nullptr;
20461 if (FullFlatEmulation) {
20462 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
20463 {Addr}, nullptr, "is.shared");
20464 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
20465 Builder.SetInsertPoint(SharedBB);
20466 Value *CastToLocal = Builder.CreateAddrSpaceCast(
20472 LoadedShared = Clone;
20474 Builder.CreateBr(PhiBB);
20475 Builder.SetInsertPoint(CheckPrivateBB);
20478 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
20479 {Addr}, nullptr, "is.private");
20480 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
20482 Builder.SetInsertPoint(PrivateBB);
20484 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
20487 Value *LoadedPrivate;
20489 LoadedPrivate = Builder.CreateAlignedLoad(
20490 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
20493 LoadedPrivate, RMW->getValOperand());
20495 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
20497 auto [ResultLoad, Equal] =
20503 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
20506 Builder.CreateBr(PhiBB);
20508 Builder.SetInsertPoint(GlobalBB);
20512 if (FullFlatEmulation) {
20513 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
20522 if (!FullFlatEmulation) {
20527 MDNode *RangeNotPrivate =
20530 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
20534 Builder.CreateBr(PhiBB);
20536 Builder.SetInsertPoint(PhiBB);
20538 if (ReturnValueIsUsed) {
20541 if (FullFlatEmulation)
20542 Loaded->addIncoming(LoadedShared, SharedBB);
20543 Loaded->addIncoming(LoadedPrivate, PrivateBB);
20544 Loaded->addIncoming(LoadedGlobal, GlobalBB);
20545 Loaded->takeName(AI);
20548 Builder.CreateBr(ExitBB);
20552 unsigned PtrOpIdx) {
20553 Value *PtrOp = I->getOperand(PtrOpIdx);
20560 I->setOperand(PtrOpIdx, ASCast);
20572 ConstVal && ConstVal->isNullValue()) {
20602 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
20610 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
20625 LoadInst *LI = Builder.CreateAlignedLoad(
static bool isMul(MachineInstr *MI)
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
static bool isCtlzOpc(unsigned Opc)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static bool isAsyncLDSDMA(Intrinsic::ID Intr)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
@ DEFAULT
Default weight is used in cases when there is no dedicated execution weight set.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
iv Induction Variable Users
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Contains matchers for matching SSA Machine Instructions.
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static MachineBasicBlock * expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static std::tuple< unsigned, unsigned > getDPPOpcForWaveReduction(unsigned Opc, const GCNSubtarget &ST)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool isCopyFromRegOfInlineAsm(const SDNode *N)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isFloatingPointWaveReduceOperation(unsigned Opc)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static std::pair< Register, Register > ExtractSubRegs(MachineInstr &MI, MachineOperand &Op, const TargetRegisterClass *SrcRC, const GCNSubtarget &ST, MachineRegisterInfo &MRI)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static unsigned parseSyncscopeMDArg(const CallBase &CI, unsigned ArgIdx)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static AtomicOrdering parseAtomicOrderingCABIArg(const CallBase &CI, unsigned ArgIdx)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static ISD::CondCode tryReduceF64CompareToHiHalf(const ISD::CondCode CC, const SDValue LHS, const SDValue RHS, const SelectionDAG &DAG)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static void expand64BitV_CNDMASK(MachineInstr &MI, MachineBasicBlock *BB)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
static uint64_t getIdentityValueForWaveReduction(unsigned Opc)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static constexpr int Concat[]
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
void setUsesDynamicLDS(bool DynLDS)
bool isBottomOfStack() const
uint32_t getLDSSize() const
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
bool isEntryFunction() const
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array, into the specified DAG.
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was registered to use 'custom' lowering for that result type.
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target has registered to invoke it for.
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
const unsigned XorTermOpc
const unsigned AndSaveExecOpc
static constexpr roundingMode rmNearestTiesToEven
static const fltSemantics & IEEEhalf()
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
LLVM_READONLY int getExactLog2Abs() const
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
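A minimal sketch (assuming it is built inside an LLVM tree) of how the APFloat factories and convert() above fit together; IEEEdouble() is the matching f64 semantics accessor, not listed above:

#include "llvm/ADT/APFloat.h"
using namespace llvm;

// Build an f64 quiet NaN, then narrow it to half precision with
// round-to-nearest-even, recording whether information was lost.
static APFloat makeHalfQNaN() {
  APFloat Val = APFloat::getQNaN(APFloat::IEEEdouble());
  bool LosesInfo = false;
  Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
  return Val;
}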
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
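As an illustration of the APInt helpers above (a sketch, not code from this file), a check that a 64-bit immediate only uses its high half:

#include "llvm/ADT/APInt.h"
using namespace llvm;

// Returns true when Imm is 64 bits wide and bits [31:0] are all clear.
static bool usesOnlyHighHalf(const APInt &Imm) {
  if (Imm.getBitWidth() != 64)
    return false;
  APInt LowMask = APInt::getBitsSet(64, 0, 32); // bits [31:0]
  return (Imm & LowMask).isZero();
}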
This class represents an incoming formal argument to a Function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
const Function * getParent() const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location, and, if it is, stores a new value there.
Value * getNewValOperand()
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getCompareOperand()
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
An instruction that atomically reads a memory location, combines it with another value, and then stores the result back.
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ USubSat
*p = usub.sat(old, v); usub.sat matches the behavior of llvm.usub.sat.
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v); minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v); maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents a cmpxchg atomic operation, false otherwise.
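A small sketch of the kind of query a target hook might make on an atomicrmw before choosing an expansion (illustrative only; not a hook from this file):

#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

// True for a system-scope atomicrmw fmax, using the accessors above.
static bool isSystemScopeFMax(const AtomicRMWInst *RMW) {
  return RMW->getOperation() == AtomicRMWInst::FMax &&
         RMW->getSyncScopeID() == SyncScope::System;
}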
This class holds the attributes for a particular argument, parameter, function, or return value.
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents the known origin of an individual byte in a combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set, or Regs.size() if they are all allocated.
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values into this state.
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed without sret-demotion and false otherwise.
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values into this state.
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed values into this state.
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals into this state.
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
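Rough shape of the argument-analysis loop these CCState/CCValAssign helpers support (a sketch; CC_SomeTarget is a placeholder CCAssignFn, and ArgLocs is the vector the CCState was constructed with):

#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/TargetCallingConv.h"
using namespace llvm;

static void analyzeFormals(CCState &CCInfo,
                           const SmallVectorImpl<ISD::InputArg> &Ins,
                           const SmallVectorImpl<CCValAssign> &ArgLocs,
                           CCAssignFn *CC_SomeTarget) {
  CCInfo.AnalyzeFormalArguments(Ins, CC_SomeTarget);
  for (const CCValAssign &VA : ArgLocs) {
    if (VA.isRegLoc())
      (void)VA.getLocReg();       // argument assigned to a register
    else
      (void)VA.getLocMemOffset(); // argument assigned to a stack slot
  }
}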
Base class for all callable instructions (InvokeInst and CallInst); holds everything related to calling a function.
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
static bool isFPPredicate(Predicate P)
static bool isIntPredicate(Predicate P)
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly not equal, like -0.0 and 0.0.
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
A parsed version of the target data layout string, and methods for querying it.
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lowering a region of the function.
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to the hidden sret parameter.
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowering::isSDNodeSourceOfDivergence to get the Value corresponding to the live-in virtual register.
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this function.
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Argument * getArg(unsigned i) const
const SIInstrInfo * getInstrInfo() const override
unsigned getInstCacheLineSize() const
Instruction cache line size in bytes (64 for pre-GFX11, 128 for GFX11+).
const SIRegisterInfo * getRegisterInfo() const override
bool hasMin3Max3_16() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool supportsWaveWideBPermute() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
int64_t getOffset() const
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Type * getValueType() const
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted instruction.
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
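Constructing the low-level types described above (a sketch; the include path assumes a recent LLVM tree, and address space 1 is just an example value):

#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;

static void lltExamples() {
  LLT S32 = LLT::scalar(32);    // 32-bit scalar
  LLT P1 = LLT::pointer(1, 64); // 64-bit pointer in address space 1
  (void)S32.getSizeInBits();    // TypeSize of 32 bits
  (void)P1.getSizeInBits();     // TypeSize of 64 bits
}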
This is an important class for using LLVM in a threaded context.
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location information.
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
const MDOperand & getOperand(unsigned I) const
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
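The MVT queries listed above, exercised on a hand-built vector type (illustrative sketch; the include path assumes a recent LLVM tree):

#include "llvm/CodeGenTypes/MachineValueType.h"
using namespace llvm;

static void mvtExamples() {
  MVT V4I32 = MVT::getVectorVT(MVT::i32, 4);
  (void)V4I32.getVectorNumElements(); // 4
  (void)V4I32.getScalarType();        // MVT::i32
  (void)V4I32.getStoreSize();         // 16 bytes
}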
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor blocks to refer to this block instead of FromMBB.
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before 'Where'.
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do so.
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual register for it.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with.
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
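Typical shape of the builder chain these MachineInstrBuilder helpers support (a sketch; the instruction description and registers are placeholders, not taken from this file):

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;

// Emit Desc as "Dst = <op> Src" right before I in MBB.
static void emitCopyLike(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator I, const DebugLoc &DL,
                         const MCInstrDesc &Desc, Register Dst, Register Src) {
  BuildMI(MBB, I, DL, Desc, Dst).addReg(Src);
}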
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
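The Flags enumerators above are bitmask values and may be or'd together, e.g. (sketch):

#include "llvm/CodeGen/MachineMemOperand.h"
using namespace llvm;

// Flags describing a volatile, invariant load.
static MachineMemOperand::Flags volatileInvariantLoadFlags() {
  return MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile |
         MachineMemOperand::MOInvariant;
}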
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the MachineOperand.
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is found.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified register class.
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
LLVM_ABI bool isLiveIn(Register Reg) const
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
LLVM_ABI Register getLiveInVirtReg(MCRegister PReg) const
getLiveInVirtReg - If PReg is a live-in physical register, return the corresponding live-in virtual register.
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
void setSimpleHint(Register VReg, Register PrefReg)
Specify the preferred (target independent) register allocation hint for the specified virtual register.
LLVM_ABI Register cloneVirtualRegister(Register VReg, StringRef Name="")
Create and return a new virtual register in the function with the same attributes as the given register.
unsigned getNumVirtRegs() const
getNumVirtRegs - Return the number of virtual registers created.
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
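The create-and-query pattern for virtual registers supported by the MachineRegisterInfo methods above (a sketch; RC is whatever register class the caller picked):

#include <cassert>
#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;

static Register makeTmpVReg(MachineRegisterInfo &MRI,
                            const TargetRegisterClass *RC) {
  Register Tmp = MRI.createVirtualRegister(RC);
  assert(MRI.getRegClass(Tmp) == RC && "fresh vreg keeps its class");
  return Tmp;
}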
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation functions.
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
Get the SDNode which holds the desired result.
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
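Shape of a typical operand walk over an SDValue, as DAG combines use these accessors (illustrative sketch, not a combine from this file):

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// True for a single-use ISD::ADD whose second operand is a constant.
static bool isSingleUseAddWithConstant(SDValue V) {
  if (V.getOpcode() != ISD::ADD || !V.hasOneUse())
    return false;
  return isa<ConstantSDNode>(V.getOperand(1));
}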
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
bool isWholeWaveFunction() const
bool hasWorkGroupIDZ() const
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to use the type for the given node type.
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows the target to decide about the register class of the specific value that is live outside the defining block.
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node N can be combined with another node to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g. INSERT_SUBREG).
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, SDNodeFlags UserFlags={}, unsigned MaxDepth=5) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array, into the specified DAG.
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand flags to them.
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g. {edx}), return the register number and the register class for the register.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store using a target-specific method.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations, those with specific masks.
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load using a target-specific method.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset, memcpy, and memmove lowering.
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was registered to use 'custom' lowering for that result type.
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of the specified type.
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void getTgtMemIntrinsic(SmallVectorImpl< IntrinsicInfo > &, const CallBase &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (touches memory).
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in the Known bitset.
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can be turned into a fence followed by an atomic load.
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the address.
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds intermediate results.
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' flag.
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array, into the specified DAG.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target has registered to invoke it for.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance, because half-precision floating-point numbers are implicitly extended to float-precision) for an FMA instruction.
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations and not for other operations.
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled; the TID (Thread ID) is multiplied by the stride value and added to the base address.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion using a target-specific method.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp/select instructions.
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled explicitly via copies.
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with this index.
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns whether it is reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in the Known bitset.
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(const AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scalars in some contexts.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fit into the return registers.
unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const override
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representation.