#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "si-lower"

    cl::desc("Do not align and prefetch loops"),

    "amdgpu-use-divergent-register-indexing", cl::Hidden,
    cl::desc("Use indirect register addressing for divergent indexes"),
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
      return AMDGPU::SGPR0 + Reg;
      TRI->getDefaultVectorSuperClassForBitWidth(32);
      TRI->getDefaultVectorSuperClassForBitWidth(64);

                   TRI->getDefaultVectorSuperClassForBitWidth(320));
                   TRI->getDefaultVectorSuperClassForBitWidth(352));
                   TRI->getDefaultVectorSuperClassForBitWidth(384));
                   TRI->getDefaultVectorSuperClassForBitWidth(512));
                   TRI->getDefaultVectorSuperClassForBitWidth(1024));

  if (Subtarget->has16BitInsts()) {
    if (Subtarget->useRealTrue16Insts()) {

                     TRI->getDefaultVectorSuperClassForBitWidth(1024));
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},

                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},

                     {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);

                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},

                     {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
                      MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
                      MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},

                     {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
                      MVT::v3i16, MVT::v4i16, MVT::Other},

                     {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);

                {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
                 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
                 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
                 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
                 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
                 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
                 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
                 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {

  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {

  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {

  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {

  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
                     {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
                      MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},

  if (Subtarget->hasPkMovB32()) {

                     {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},

                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);

                     {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
  if (Subtarget->hasSMemRealTime() ||

  if (Subtarget->has16BitInsts()) {

  if (Subtarget->hasMadMacF32Insts())

  if (Subtarget->hasIntClamp())

  if (Subtarget->hasAddNoCarry())

                     {MVT::f32, MVT::f64}, Custom);

                     {MVT::f32, MVT::f64}, Legal);

  if (Subtarget->haveRoundOpsF64())

  if (Subtarget->has16BitInsts()) {

  if (Subtarget->hasBF16TransInsts())

                {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
                 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
                 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
                     {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
                      MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
                      MVT::v32f16, MVT::v32bf16},

                     {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

                     {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

                {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
                 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {

  if (Subtarget->hasVOP3PInsts()) {

                       {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);

                       {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
                        MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
                        MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},

    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})

    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})

                       {MVT::v2f16, MVT::v4f16}, Custom);
  if (Subtarget->hasBF16PackedInsts()) {
    for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})

  if (Subtarget->hasPackedFP32Ops()) {
                       {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},

  if (Subtarget->has16BitInsts()) {

                     {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
                      MVT::v32f16, MVT::v32bf16},

  if (Subtarget->hasVectorMulU64())
  else if (Subtarget->hasScalarSMulU64())

  if (Subtarget->hasMad64_32())

  if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())

  if (Subtarget->hasIEEEMinimumMaximumInsts()) {
                       {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);

  if (Subtarget->hasMinimum3Maximum3F32())

  if (Subtarget->hasMinimum3Maximum3PKF16()) {
    if (!Subtarget->hasMinimum3Maximum3F16())

  if (Subtarget->hasVOP3PInsts()) {
                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

  if (Subtarget->hasIntMinMax64())

                     {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
                      MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,

                     {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
                      MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
                      MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
                      MVT::i16, MVT::bf16, MVT::i8, MVT::i128},

                     {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
                      MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
                      MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},

  if (Subtarget->hasBF16ConversionInsts()) {
  if (Subtarget->hasBF16PackedInsts()) {
  if (Subtarget->hasBF16TransInsts()) {
  if (Subtarget->hasCvtPkF16F32Inst()) {
                       {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
  if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())

  static const MCPhysReg RCRegs[] = {AMDGPU::MODE};

                                      EVT DestVT, EVT SrcVT) const {
  return ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
            (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
          (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&

                                      LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
         SrcTy.getScalarSizeInBits() == 16 &&
  if (Subtarget->has16BitInsts()) {
      return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
    return VT.isInteger() ? MVT::i32 : MVT::f32;

    return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;

    if (Size == 16 && Subtarget->has16BitInsts())
      return (NumElts + 1) / 2;

    return NumElts * ((Size + 31) / 32);

                                       unsigned &NumIntermediates,
                                       MVT &RegisterVT) const {
    if (Size == 16 && Subtarget->has16BitInsts()) {
      if (ScalarVT == MVT::bf16) {
        RegisterVT = MVT::i32;
        IntermediateVT = MVT::v2bf16;
        RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
        IntermediateVT = RegisterVT;
      NumIntermediates = (NumElts + 1) / 2;
      return NumIntermediates;

      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

      RegisterVT = MVT::i16;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

      RegisterVT = MVT::i32;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

    RegisterVT = MVT::i32;
    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts * ((Size + 31) / 32);
    return NumIntermediates;

      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
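// For the AMDGPU calling convention, wide vectors are broken down into
// 32-bit registers: with 16-bit instructions available, pairs of 16-bit
// elements travel as one v2i16/v2f16 (or i32 for bf16) per register, so a
// v4f16 argument presumably becomes NumIntermediates = 2 parts of
// IntermediateVT = v2f16, while a v3i32 argument becomes three i32 registers.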
                                 unsigned MaxNumLanes) {
  assert(MaxNumLanes != 0);
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());

                                 unsigned MaxNumLanes) {
  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));

    return MVT::amdgpuBufferFatPointer;
      DL.getPointerSizeInBits(AS) == 192)
    return MVT::amdgpuBufferStridedPointer;
         DL.getPointerSizeInBits(AS) == 160) ||
         DL.getPointerSizeInBits(AS) == 192))
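// Buffer fat pointers (a 128-bit resource descriptor plus a 32-bit offset,
// 160 bits total) and strided buffer pointers (192 bits) are returned as the
// dedicated MVT::amdgpuBufferFatPointer / MVT::amdgpuBufferStridedPointer
// types rather than as plain integer-width pointers; they are expected to be
// rewritten away before instruction selection ever sees them.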
  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:

  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:

  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:

  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
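// The case groups above appear to bucket the async LDS and cooperative-atomic
// intrinsic variants by access width: the _b8 forms correspond to 1-byte
// accesses, _b32/32x4B to 4 bytes, _b64/16x8B to 8 bytes, and _b128/8x16B to
// 16 bytes.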
                                          unsigned IntrID) const {
  if (CI.hasMetadata(LLVMContext::MD_invariant_load))

    if (RsrcIntr->IsImage) {
    Info.ptrVal = RsrcArg;

    bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
      if (RsrcIntr->IsImage) {
        unsigned MaxNumLanes = 4;
                                       std::numeric_limits<unsigned>::max());
      if (RsrcIntr->IsImage) {
      if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
        Info.memVT = MVT::i32;
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {

  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
                                   std::numeric_limits<unsigned>::max());

  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {

  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.ptrVal = nullptr;

  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {

  case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
  case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
    Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
    Info.memVT = MVT::i64;

  case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
  case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
        MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
                       ->getElementType(0));
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num: {

  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128:
  case Intrinsic::amdgcn_cluster_load_b32:
  case Intrinsic::amdgcn_cluster_load_b64:
  case Intrinsic::amdgcn_cluster_load_b128:
  case Intrinsic::amdgcn_ds_load_tr6_b96:
  case Intrinsic::amdgcn_ds_load_tr4_b64:
  case Intrinsic::amdgcn_ds_load_tr8_b64:
  case Intrinsic::amdgcn_ds_load_tr16_b128:
  case Intrinsic::amdgcn_global_load_tr6_b96:
  case Intrinsic::amdgcn_global_load_tr4_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64: {

  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {

  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {

  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
    Info.memVT = MVT::i32;
    Info.align = Align(4);
    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {

  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128: {

  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_global_load_lds: {

  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
    Info.memVT = MVT::i32;
    Info.align = Align(4);

  case Intrinsic::amdgcn_s_prefetch_data:
  case Intrinsic::amdgcn_flat_prefetch:
  case Intrinsic::amdgcn_global_prefetch: {

  case Intrinsic::amdgcn_addrspacecast_nonnull: {
    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();
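// getTgtMemIntrinsic fills in an IntrinsicInfo record (opcode, memVT, ptrVal,
// alignment and load/store/volatile flags) for each target memory intrinsic,
// so that alias analysis, MachineMemOperand creation and the scheduler can
// reason about these accesses the same way they do for ordinary loads and
// stores.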
                                            Type *&AccessTy) const {
  Value *Ptr = nullptr;
  switch (II->getIntrinsicID()) {
  case Intrinsic::amdgcn_cluster_load_b128:
  case Intrinsic::amdgcn_cluster_load_b64:
  case Intrinsic::amdgcn_cluster_load_b32:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_load_tr8_b64:
  case Intrinsic::amdgcn_ds_load_tr16_b128:
  case Intrinsic::amdgcn_ds_load_tr4_b64:
  case Intrinsic::amdgcn_ds_load_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
  case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128:
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_global_load_tr4_b64:
  case Intrinsic::amdgcn_global_load_tr6_b96:
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128:
    Ptr = II->getArgOperand(0);

  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_global_load_lds:
  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
    Ptr = II->getArgOperand(1);

  AccessTy = II->getType();
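// The split between the two case groups above reflects operand order: the
// first group passes the pointer that matters for addressing-mode queries as
// its first argument, while the second group (load_to_lds, global_load_lds
// and the *_load_async_to_lds variants) passes it as the second argument, so
// Ptr is taken from operand 1 instead of operand 0.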
                                                unsigned AddrSpace) const {
  if (!Subtarget->hasFlatInstOffsets()) {

  return AM.Scale == 0 &&
         (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
                                  AM.BaseOffs, AddrSpace, FlatVariant));

  if (Subtarget->hasFlatGlobalInsts())

  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {

    return isLegalMUBUFAddressingMode(AM);

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {

  if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))

  if (AM.HasBaseReg) {

    return isLegalMUBUFAddressingMode(AM);

  if (!Subtarget->hasScalarSubwordLoads()) {
    if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)

    return Subtarget->enableFlatScratch()
               : isLegalMUBUFAddressingMode(AM);
    unsigned Size, unsigned AddrSpace, Align Alignment,

    if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))

    Align RequiredAlignment(

    if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
        Alignment < RequiredAlignment)

      if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))

      RequiredAlignment = Align(4);
      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 64
                    : (Alignment < Align(4))         ? 32

      if (!Subtarget->hasDS96AndDS128())

      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 96
                    : (Alignment < Align(4))         ? 32

      if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())

      RequiredAlignment = Align(8);
      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 128
                    : (Alignment < Align(4))         ? 32

      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;

    return Alignment >= RequiredAlignment ||
           Subtarget->hasUnalignedDSAccessEnabled();

    bool AlignedBy4 = Alignment >= Align(4);
    if (Subtarget->hasUnalignedScratchAccessEnabled()) {
        *IsFast = AlignedBy4 ? Size : 1;

      *IsFast = AlignedBy4;

    return Alignment >= Align(4) ||
           Subtarget->hasUnalignedBufferAccessEnabled();

  if (!Subtarget->hasRelaxedBufferOOBMode() &&

  return Size >= 32 && Alignment >= Align(4);

                                                    unsigned *IsFast) const {
                                            Alignment, Flags, IsFast);
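// In the DS checks above, IsFast appears to be used as a rough throughput
// score rather than a plain boolean: a sufficiently aligned access reports
// its width in bits (64/96/128), an access the hardware has to split reports
// a smaller value such as 32, and 0 means the misaligned form should be
// avoided entirely.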
    const AttributeList &FuncAttributes) const {

  if (Op.size() >= 16 &&
  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))

                                           unsigned DestAS) const {
      Subtarget->hasGloballyAddressableScratch()) {

                                               unsigned Index) const {

  if (Subtarget->has16BitInsts() && VT == MVT::i16) {

  auto [InputPtrReg, RC, ArgTy] =
      Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

                                         const SDLoc &SL) const {

                                         const SDLoc &SL) const {
  std::optional<uint32_t> KnownSize =
  if (KnownSize.has_value())

    Val = getFPExtOrFPRound(DAG, Val, SL, VT);
SDValue SITargetLowering::lowerKernargMemParameter(
  MachinePointerInfo PtrInfo =

    int64_t OffsetDiff = Offset - AlignDownOffset;
    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);

  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
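// For sub-dword kernel arguments the offset is rounded down to a 4-byte
// boundary, a full dword is loaded from the kernarg segment, and the value is
// shifted right by 8 * OffsetDiff before being truncated or extended to the
// requested type; dword-or-larger arguments are simply loaded at their
// natural offset.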
                                              const SDLoc &SL) const {
      ExtType, SL, VA.getLocVT(), Chain, FIN,

  SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
  if (ConvertedVal == ArgValue)
    return ConvertedVal;

SDValue SITargetLowering::lowerWorkGroupId(
  if (!Subtarget->hasClusters())
    return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);

  SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
  SDLoc SL(ClusterIdXYZ);
  SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
  SDValue ClusterWorkGroupIdXYZ =
      getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
    return ClusterIdXYZ;

  using namespace AMDGPU::Hwreg;
      DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
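// On subtargets with workgroup clusters, lowerWorkGroupId appears to rebuild
// the flat workgroup ID from the cluster ID, the cluster's maximum workgroup
// ID and the workgroup's ID within the cluster, reading the cluster ID field
// with S_GETREG_B32 when it is not preloaded; without clusters it simply
// returns the preloaded workgroup ID value.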
SDValue SITargetLowering::getPreloadedValue(
  const ArgDescriptor *Reg = nullptr;
  const TargetRegisterClass *RC;

  const ArgDescriptor WorkGroupIDX =
  const ArgDescriptor WorkGroupIDZ =
  const ArgDescriptor ClusterWorkGroupIDX =
  const ArgDescriptor ClusterWorkGroupIDY =
  const ArgDescriptor ClusterWorkGroupIDZ =
  const ArgDescriptor ClusterWorkGroupMaxIDX =
  const ArgDescriptor ClusterWorkGroupMaxIDY =
  const ArgDescriptor ClusterWorkGroupMaxIDZ =
  const ArgDescriptor ClusterWorkGroupMaxFlatID =

  auto LoadConstant = [&](unsigned N) {

  if (Subtarget->hasArchitectedSGPRs() &&
      Reg = &WorkGroupIDX;
      RC = &AMDGPU::SReg_32RegClass;
      Reg = &WorkGroupIDY;
      RC = &AMDGPU::SReg_32RegClass;
      Reg = &WorkGroupIDZ;
      RC = &AMDGPU::SReg_32RegClass;

      if (HasFixedDims && ClusterDims.getDims()[0] == 1)
        return LoadConstant(0);
      Reg = &ClusterWorkGroupIDX;
      RC = &AMDGPU::SReg_32RegClass;
      if (HasFixedDims && ClusterDims.getDims()[1] == 1)
        return LoadConstant(0);
      Reg = &ClusterWorkGroupIDY;
      RC = &AMDGPU::SReg_32RegClass;
      if (HasFixedDims && ClusterDims.getDims()[2] == 1)
        return LoadConstant(0);
      Reg = &ClusterWorkGroupIDZ;
      RC = &AMDGPU::SReg_32RegClass;

        return LoadConstant(ClusterDims.getDims()[0] - 1);
      Reg = &ClusterWorkGroupMaxIDX;
      RC = &AMDGPU::SReg_32RegClass;
        return LoadConstant(ClusterDims.getDims()[1] - 1);
      Reg = &ClusterWorkGroupMaxIDY;
      RC = &AMDGPU::SReg_32RegClass;
        return LoadConstant(ClusterDims.getDims()[2] - 1);
      Reg = &ClusterWorkGroupMaxIDZ;
      RC = &AMDGPU::SReg_32RegClass;

      Reg = &ClusterWorkGroupMaxFlatID;
      RC = &AMDGPU::SReg_32RegClass;
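// When the cluster dimensions are known at compile time the query folds to a
// constant: a dimension of extent 1 means the cluster-local workgroup ID in
// that dimension is always 0, and the maximum ID in a dimension is its extent
// minus 1, so no SGPR needs to be read at all.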
  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
           "vector type argument should have been split");

    bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
             "unexpected vector split in ps argument type");

      Info->markPSInputAllocated(PSInputNum);
        Info->markPSInputEnabled(PSInputNum);

  if (Info.hasWorkItemIDX()) {
        (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;

  if (Info.hasWorkItemIDY()) {
    assert(Info.hasWorkItemIDX());
    if (Subtarget->hasPackedTID()) {
      Info.setWorkItemIDY(
      unsigned Reg = AMDGPU::VGPR1;

  if (Info.hasWorkItemIDZ()) {
    assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
    if (Subtarget->hasPackedTID()) {
      Info.setWorkItemIDZ(
      unsigned Reg = AMDGPU::VGPR2;
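// With packed TIDs the hardware delivers all three workitem IDs in a single
// VGPR (X in bits [9:0], Y in [19:10], Z in [29:20]), which is why the X mask
// above narrows to 0x3ff once Y is also needed; without packed TIDs the IDs
// arrive in separate registers, conventionally VGPR0/VGPR1/VGPR2.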
  if (RegIdx == ArgVGPRs.size()) {
  unsigned Reg = ArgVGPRs[RegIdx];

                                   unsigned NumArgRegs) {
  if (RegIdx == ArgSGPRs.size())
  unsigned Reg = ArgSGPRs[RegIdx];

  const unsigned Mask = 0x3ff;
  if (Info.hasWorkItemIDX()) {
    Info.setWorkItemIDX(Arg);
  if (Info.hasWorkItemIDY()) {
    Info.setWorkItemIDY(Arg);
  if (Info.hasWorkItemIDZ())

  const unsigned Mask = 0x3ff;

  auto &ArgInfo = Info.getArgInfo();

  if (Info.hasImplicitArgPtr())
  if (Info.hasWorkGroupIDX())
  if (Info.hasWorkGroupIDY())
  if (Info.hasWorkGroupIDZ())
  if (Info.hasLDSKernelId())

    Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);

    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);

    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);

    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);

    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);

    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);

    Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
    MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
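// The HSA user SGPRs are claimed here in their fixed ABI order: private
// segment buffer, dispatch pointer, queue pointer, kernarg segment pointer,
// dispatch ID, flat scratch init, and private segment size. Each one is both
// recorded in the function info and marked live-in so later passes know the
// registers are occupied on entry.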
  unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();

  bool InPreloadSequence = true;
  bool AlignedForImplictArgs = false;
  unsigned ImplicitArgOffset = 0;
  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())

    unsigned ArgIdx = Arg.getArgNo();
    if (InIdx < Ins.size() &&
        (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))

    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           Ins[InIdx].getOrigArgIndex() == ArgIdx;
      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];
      unsigned ArgOffset = ArgLoc.getLocMemOffset();
      unsigned NumAllocSGPRs =
          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;

      if (Arg.hasAttribute("amdgpu-hidden-argument")) {
        if (!AlignedForImplictArgs) {
              alignTo(LastExplicitArgOffset,
                      Subtarget->getAlignmentForImplicitArgPtr()) -
              LastExplicitArgOffset;
          AlignedForImplictArgs = true;
        ArgOffset += ImplicitArgOffset;

      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        assert(InIdx >= 1 && "No previous SGPR");
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);

      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
        InPreloadSequence = false;

          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);

      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {

      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
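// Kernarg preloading: arguments marked inreg are assigned consecutive user
// SGPRs that are filled from the kernarg segment before the kernel body runs.
// Sub-dword arguments share the SGPR of the previous argument, gaps in the
// argument layout are covered by padding SGPRs, and the walk stops at the
// first argument that cannot be preloaded, since preloading has to cover a
// contiguous prefix of the segment.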
  if (Info.hasLDSKernelId()) {
    Register Reg = Info.addLDSKernelId();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

                                           bool IsShader) const {
  bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
  if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
    assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");

    unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
    unsigned NumRequiredSystemSGPRs =
        Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
        Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
    for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
      Register Reg = Info.addReservedUserSGPR();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {
      Register Reg = Info.addWorkGroupIDX();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDY()) {
      Register Reg = Info.addWorkGroupIDY();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDZ()) {
      Register Reg = Info.addWorkGroupIDZ();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupInfo()) {
    Register Reg = Info.addWorkGroupInfo();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    unsigned PrivateSegmentWaveByteOffsetReg;
      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);

  assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
         Info.getNumPreloadedSGPRs() >= 16);
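// hasUserSGPRInit16Bug(): on the affected subtargets the hardware appears to
// only initialize the first 16 user/system SGPRs reliably, so for compute
// entry points the count is padded with reserved user SGPRs until user plus
// required system SGPRs reach 16, which the trailing assert re-checks.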
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);

    HasStackObjects = true;

  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
      Info.setScratchRSrcReg(ReservedBufferReg);

    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);

      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
        if (!MRI.isLiveIn(Reg)) {
          Info.setStackPtrOffsetReg(Reg);

  if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)

  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);

  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());

    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;

    Entry->addLiveIn(*I);

    for (auto *Exit : Exits)
              TII->get(TargetOpcode::COPY), *I)
  bool IsError = false;
        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));

          !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
          !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());

    if (!Subtarget->enableFlatScratch())

        !Subtarget->hasArchitectedSGPRs())
      assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
             !Info->hasWorkGroupIDZ());

  bool IsWholeWaveFunc = Info->isWholeWaveFunction();
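// Pixel shaders must have at least one PS input enabled or the hardware can
// hang, so when nothing else is live the code below force-allocates and
// enables input 0; on PAL the same check is repeated against the final
// PSInputAddr/PSInputEnable masks.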
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);

    if (Subtarget->isAmdPalOS()) {
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
  } else if (IsKernel) {
    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());

    if (IsKernel && Subtarget->hasKernargPreload())
  } else if (!IsGraphics) {
    if (!Subtarget->enableFlatScratch())

    Info->setNumWaveDispatchSGPRs(
    Info->setNumWaveDispatchVGPRs(
  } else if (Info->getNumKernargPreloadedSGPRs()) {
    Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());

  if (IsWholeWaveFunc) {
                                  {MVT::i1, MVT::Other}, Chain);
  for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;

    if (IsEntryFunc && VA.isMemLoc()) {

      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
          int64_t OffsetDiff = Offset - AlignDownOffset;
              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];

          NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;

          if (PreloadRegs.size() == 1) {
            Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
                                   TRI->getRegSizeInBits(*RC)));

            for (auto Reg : PreloadRegs) {
                                             PreloadRegs.size()),

          NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

              "hidden argument in kernel signature was not preloaded",

          lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
                                   Alignment, Ins[i].Flags.isSExt(), &Ins[i]);

    if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);

      if (AMDGPU::VGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::VGPR_32RegClass;
      else if (AMDGPU::SGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::SGPR_32RegClass;

      Val = convertABITypeToValueType(DAG, Val, VA, DL);

    auto &ArgUsageInfo =
  } else if (auto *MFAM = DAG.getMFAM()) {
    auto *ArgUsageInfo =
            .getCachedResult<AMDGPUArgumentUsageAnalysis>(M);
      ArgUsageInfo->setFuncArgInfo(Fn, Info->getArgInfo());

  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain
                                          const Type *RetTy) const {
  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);

    unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
    unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
    for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
      if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))

  Info->setIfReturnsVoid(Outs.empty());
  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {
    SDValue Arg = OutVals[RealRVLocIdx];
                        ReadFirstLane, Arg);

  if (!Info->isEntryFunction()) {
      if (AMDGPU::SReg_64RegClass.contains(*I))
      else if (AMDGPU::SReg_32RegClass.contains(*I))

  unsigned Opc = AMDGPUISD::ENDPGM;
    Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
          : IsShader                  ? AMDGPUISD::RETURN_TO_EPILOG
                                      : AMDGPUISD::RET_GLUE;
    auto &ArgUsageInfo =
        &ArgUsageInfo.getArgUsageInfo().lookupFuncArgInfo(*CalleeFunc);
  } else if (auto *MFAM = DAG.getMFAM()) {
    auto *ArgUsageInfo =
    CalleeArgInfo = &ArgUsageInfo->lookupFuncArgInfo(*CalleeFunc);

    const auto [OutgoingArg, ArgRC, ArgTy] =
    const auto [IncomingArg, IncomingArgRC, Ty] =
    assert(IncomingArgRC == ArgRC);

    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;

      InputReg = getImplicitArgPtr(DAG, DL);
      std::optional<uint32_t> Id =
      if (Id.has_value()) {

    if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
      unsigned SpecialArgOffset =

  auto [OutgoingArg, ArgRC, Ty] =
    std::tie(OutgoingArg, ArgRC, Ty) =
    std::tie(OutgoingArg, ArgRC, Ty) =

  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");

    if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
      NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
      NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {

  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
                         : IncomingArgY ? *IncomingArgY

  if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);

  if (Callee->isDivergent())

  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (!CallerPreserved)

  bool CCMatch = CallerCC == CalleeCC;

    if (Arg.hasByValAttr())

    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))

    CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

    for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
      if (!CCVA.isRegLoc())
      if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
          dbgs() << "Cannot tail call due to divergent outgoing argument in "
enum ChainCallArgIdx {

  bool UsesDynamicVGPRs = false;
  if (IsChainCallConv) {
    auto RequestedExecIt =
          return Arg.OrigArgIndex == 2;
    assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");

    size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
    CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
           "Haven't popped all the special args");

        CLI.Args[ChainCallArgIdx::Exec];
    if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))

            ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
        ChainCallSpecialArgs.push_back(Arg.Node);

    PushNodeOrTargetConstant(RequestedExecArg);

      if (FlagsValue.isZero()) {
        if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
                             "no additional args allowed if flags == 0");
        if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {

        if (!Subtarget->isWave32()) {
              CLI, InVals, "dynamic VGPR mode is only supported for wave32");

        UsesDynamicVGPRs = true;
        std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
                      CLI.Args.end(), PushNodeOrTargetConstant);

  bool IsSibCall = false;

        "unsupported call to variadic function ");

        "unsupported required tail call to function ");

                                                   Outs, OutVals, Ins, DAG);
          "site marked musttail or on llvm.amdgcn.cs.chain");

    if (!TailCallOpt && IsTailCall)
  auto *TRI = Subtarget->getRegisterInfo();

  if (!IsSibCall || IsChainCallConv) {
    if (!Subtarget->enableFlatScratch()) {
      RegsToPass.emplace_back(IsChainCallConv
                                  ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                                  : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,

  const unsigned NumSpecialInputs = RegsToPass.size();

  MVT PtrVT = MVT::i32;

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
      RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));

      int32_t Offset = LocMemOffset;

      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
                            ? Flags.getNonZeroByValAlign()

      if (Outs[i].Flags.isByVal()) {
            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
                                 Outs[i].Flags.getNonZeroByValAlign(),
                                 nullptr, std::nullopt, DstInfo,

            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);

  if (!MemOpChains.empty())

  unsigned ArgIdx = 0;
  for (auto [Reg, Val] : RegsToPass) {
    if (ArgIdx++ >= NumSpecialInputs &&
        (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {

  if (IsTailCall && !IsSibCall) {

  std::vector<SDValue> Ops({Chain});
    Ops.push_back(Callee);
    Ops.push_back(Callee);

  if (IsChainCallConv)

  for (auto &[Reg, Val] : RegsToPass)

  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");

                                 MVT::Glue, GlueOps),
    Ops.push_back(InGlue);

  unsigned OPC = AMDGPUISD::TC_RETURN;
    OPC = AMDGPUISD::TC_RETURN_GFX;
    OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
                           : AMDGPUISD::TC_RETURN_CHAIN;
    if (Info->isWholeWaveFunction())
      OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;

  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);

  uint64_t CalleePopBytes = NumBytes;
  EVT VT = Op.getValueType();
         "Stack grows upwards for AMDGPU");

  Chain = BaseAddr.getValue(1);
  if (Alignment > StackAlign) {
                                << Subtarget->getWavefrontSizeLog2();
    uint64_t StackAlignMask = ScaledAlignment - 1;

  assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
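// Dynamic stack allocation operates on the swizzled per-wave scratch offset,
// so both the requested size and any extra alignment are scaled by the
// wavefront size (shifted left by getWavefrontSizeLog2()): every lane's slice
// of the allocation sits at the same unswizzled offset while the wave-level
// stack pointer advances by size * wavefront_size bytes.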
  if (Op.getValueType() != MVT::i32)

  assert(Op.getValueType() == MVT::i32);
                              Op.getOperand(0), IntrinID, GetRoundBothImm);

  SDValue RoundModeTimesNumBits =
                                   TableEntry, EnumOffset);

          static_cast<uint32_t>(ConstMode->getZExtValue()),

  if (UseReducedTable) {
    SDValue RoundModeTimesNumBits =

    SDValue RoundModeTimesNumBits =

    NewMode = TruncTable;
                          ReadFirstLaneID, NewMode);

                     IntrinID, RoundBothImm, NewMode);

  if (Op->isDivergent() &&
      (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))

  if (Subtarget->hasSafeSmemPrefetch())

  if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))

  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = Src.getValueType();
  EVT DstVT = Op.getValueType();

  if (Op.getValueType() != MVT::i64)

                              Op.getOperand(0), IntrinID, ModeHwRegImm);
                              Op.getOperand(0), IntrinID, TrapHwRegImm);

  if (Op.getOperand(1).getValueType() != MVT::i64)

                          ReadFirstLaneID, NewModeReg);
                          ReadFirstLaneID, NewTrapReg);

  unsigned ModeHwReg =
  unsigned TrapHwReg =
                     IntrinID, ModeHwRegImm, NewModeReg);
                     IntrinID, TrapHwRegImm, NewTrapReg);

          .Case("m0", AMDGPU::M0)
          .Case("exec", AMDGPU::EXEC)
          .Case("exec_lo", AMDGPU::EXEC_LO)
          .Case("exec_hi", AMDGPU::EXEC_HI)
          .Case("flat_scratch", AMDGPU::FLAT_SCR)
          .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
          .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)

  if (!Subtarget->hasFlatScrRegister() &&
      Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
                               "\" for subtarget."));
  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:

  case AMDGPU::FLAT_SCR:

  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));

static std::pair<MachineBasicBlock *, MachineBasicBlock *>
  auto Next = std::next(I);

  MBB.addSuccessor(LoopBB);

  return std::pair(LoopBB, RemainderBB);

  auto I = MI.getIterator();
  auto E = std::next(I);

  Src->setIsKill(false);

  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))

  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)

                                 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
                                 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,

  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
      MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)

  MRI.setSimpleHint(NewExec, CondReg);

  if (UseGPRIdxMode) {
      SGPRIdxReg = CurrentIdxReg;
      SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)

    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
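// The sequence above is the body of a waterfall loop over a divergent index:
// each trip reads the index of the first still-active lane with
// V_READFIRSTLANE_B32, compares it against every lane's index (V_CMP_EQ_U32)
// to find the lanes that share that value, restricts EXEC to that subset,
// performs the indexed access (via M0 or the GPR-index-mode SGPR), and loops
// until every lane has been serviced.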
                                 unsigned InitResultReg, unsigned PhiReg, int Offset,
                                 bool UseGPRIdxMode, Register &SGPRIdxReg) {

  const auto *BoolXExecRC = TRI->getWaveMaskRegClass();

  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);

                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode, SGPRIdxReg);

  LoopBB->removeSuccessor(RemainderBB);
  LoopBB->addSuccessor(LandingPad);
static std::pair<unsigned, int>
  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;

    return std::pair(AMDGPU::sub0, Offset);

  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)

  MI.eraseFromParent();

  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (Idx->getReg() == AMDGPU::NoRegister) {
    MI.eraseFromParent();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);

      const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
          TRI.getRegSizeInBits(*VecRC), 32, false);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(VecRC);

                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
        TRI.getRegSizeInBits(*VecRC), 32, false);
    BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)

  MI.eraseFromParent();
  bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
  if (ST.hasScalarAddSub64()) {
    unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  MI.eraseFromParent();
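// Without a native 64-bit scalar add/sub, S_ADD_U64_PSEUDO is expanded into a
// carry chain: the low halves go through S_ADD_U32/S_SUB_U32 (which set SCC),
// the high halves through S_ADDC_U32/S_SUBB_U32 (which consume SCC), and the
// two 32-bit results are stitched back together with a REG_SEQUENCE.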
  case AMDGPU::S_MIN_U32:
    return std::numeric_limits<uint32_t>::max();
  case AMDGPU::S_MIN_I32:
    return std::numeric_limits<int32_t>::max();
  case AMDGPU::S_MAX_U32:
    return std::numeric_limits<uint32_t>::min();
  case AMDGPU::S_MAX_I32:
    return std::numeric_limits<int32_t>::min();
  case AMDGPU::V_ADD_F32_e64:

  case AMDGPU::V_SUB_F32_e64:

  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_SUB_I32:
  case AMDGPU::S_OR_B32:
  case AMDGPU::S_XOR_B32:
    return std::numeric_limits<uint32_t>::min();
  case AMDGPU::S_AND_B32:
    return std::numeric_limits<uint32_t>::max();
  case AMDGPU::V_MIN_F32_e64:
  case AMDGPU::V_MAX_F32_e64:

        "Unexpected opcode in getIdentityValueFor32BitWaveReduction");

  case AMDGPU::V_CMP_LT_U64_e64:
    return std::numeric_limits<uint64_t>::max();
  case AMDGPU::V_CMP_LT_I64_e64:
    return std::numeric_limits<int64_t>::max();
  case AMDGPU::V_CMP_GT_U64_e64:
    return std::numeric_limits<uint64_t>::min();
  case AMDGPU::V_CMP_GT_I64_e64:
    return std::numeric_limits<int64_t>::min();
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO:
  case AMDGPU::S_OR_B64:
  case AMDGPU::S_XOR_B64:
    return std::numeric_limits<uint64_t>::min();
  case AMDGPU::S_AND_B64:
    return std::numeric_limits<uint64_t>::max();

        "Unexpected opcode in getIdentityValueFor64BitWaveReduction");

  return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
         Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
         Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
         Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
         Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
         Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
         Opc == AMDGPU::V_SUB_F32_e64;

  return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
         Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64;
  bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));

    case AMDGPU::S_MIN_U32:
    case AMDGPU::S_MIN_I32:
    case AMDGPU::V_MIN_F32_e64:
    case AMDGPU::S_MAX_U32:
    case AMDGPU::S_MAX_I32:
    case AMDGPU::V_MAX_F32_e64:
    case AMDGPU::S_AND_B32:
    case AMDGPU::S_OR_B32: {

    case AMDGPU::V_CMP_LT_U64_e64:
    case AMDGPU::V_CMP_LT_I64_e64:
    case AMDGPU::V_CMP_GT_U64_e64:
    case AMDGPU::V_CMP_GT_I64_e64:
    case AMDGPU::S_AND_B64:
    case AMDGPU::S_OR_B64: {

    case AMDGPU::S_XOR_B32:
    case AMDGPU::S_XOR_B64:
    case AMDGPU::S_ADD_I32:
    case AMDGPU::S_ADD_U64_PSEUDO:
    case AMDGPU::V_ADD_F32_e64:
    case AMDGPU::S_SUB_I32:
    case AMDGPU::S_SUB_U64_PSEUDO:
    case AMDGPU::V_SUB_F32_e64: {

      Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

      bool IsWave32 = ST.isWave32();
      unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
      unsigned BitCountOpc =
          IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;

      auto NewAccumulator =

      case AMDGPU::S_XOR_B32:
      case AMDGPU::S_XOR_B64: {
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            .addReg(NewAccumulator->getOperand(0).getReg())

        if (Opc == AMDGPU::S_XOR_B32) {
              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
              TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
              MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
              MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);

          BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)

      case AMDGPU::S_SUB_I32: {
        Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
            .addReg(NewAccumulator->getOperand(0).getReg());

      case AMDGPU::S_ADD_I32: {
            .addReg(NewAccumulator->getOperand(0).getReg());

      case AMDGPU::S_ADD_U64_PSEUDO:
      case AMDGPU::S_SUB_U64_PSEUDO: {
        Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

            TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);

        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
              .addReg(NewAccumulator->getOperand(0).getReg())

        Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
                                 : NewAccumulator->getOperand(0).getReg();

        Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;

        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {

        BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)

      case AMDGPU::V_ADD_F32_e64:
      case AMDGPU::V_SUB_F32_e64: {
            MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
        Register DstVreg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
            .addReg(NewAccumulator->getOperand(0).getReg())

        unsigned srcMod = Opc == AMDGPU::V_SUB_F32_e64 ? 1 : 0;

        BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
  Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
  Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
  Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
  Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
  Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
  Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);

  bool IsWave32 = ST.isWave32();
  unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)

  I = ComputeLoop->begin();
      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)

  I = ComputeLoop->end();

  unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;

    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),

        MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
    Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));

    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32),

    NewAccumulator = BuildMI(*ComputeLoop, I, DL,
                             TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)

        MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
        MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

        TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
        MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
        MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);

    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),

    auto LaneValue = BuildMI(*ComputeLoop, I, DL,
                             TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)

    case AMDGPU::S_OR_B64:
    case AMDGPU::S_AND_B64:
    case AMDGPU::S_XOR_B64: {
          .addReg(LaneValue->getOperand(0).getReg())

    case AMDGPU::V_CMP_GT_I64_e64:
    case AMDGPU::V_CMP_GT_U64_e64:
    case AMDGPU::V_CMP_LT_I64_e64:
    case AMDGPU::V_CMP_LT_U64_e64: {
      Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
          MRI.createVirtualRegister(WaveMaskRegClass);
          TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
      Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
                              VregClass, AMDGPU::sub0, VSubRegClass);
                              VregClass, AMDGPU::sub1, VSubRegClass);
      BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),

          .addReg(LaneValue->getOperand(0).getReg())
          .addReg(AccumulatorVReg);

      unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
      BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)

      NewAccumulator = BuildMI(*ComputeLoop, I, DL,
                               TII->get(AMDGPU::S_CSELECT_B64), DstReg)
                           .addReg(LaneValue->getOperand(0).getReg())

    case AMDGPU::S_ADD_U64_PSEUDO:
    case AMDGPU::S_SUB_U64_PSEUDO: {
          .addReg(LaneValue->getOperand(0).getReg());

  unsigned BITSETOpc =
      IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
  BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)

  ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);

  unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
      .addReg(NewActiveBitsReg)
  BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))

  MI.eraseFromParent();
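// The ComputeLoop block above implements the general (divergent-input) wave
// reduction: starting from the identity value and a copy of EXEC, each
// iteration uses S_FF1 to pick the lowest still-set lane bit, V_READLANE_B32
// to fetch that lane's value, folds it into the accumulator with the
// reduction operation, clears the lane bit with S_BITSET0, and branches back
// via S_CBRANCH_SCC1 while S_CMP_LG reports that active bits remain.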
5992 switch (
MI.getOpcode()) {
5993 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5995 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
5997 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5999 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
6001 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
6003 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
6005 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
6007 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
6009 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
6011 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
6013 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
6015 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
6017 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
6019 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
6021 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
6023 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
6025 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
6027 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
6029 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
6031 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
6033 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
6035 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
6037 case AMDGPU::S_UADDO_PSEUDO:
6038 case AMDGPU::S_USUBO_PSEUDO: {
6044 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO) ? AMDGPU::S_ADD_U32
6046 : AMDGPU::S_SUB_U32;
6054 Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6057 MI.eraseFromParent();
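// Illustrative sketch (not part of this file): what the S_UADDO/S_USUBO
// expansion above computes, in plain C++. The overflow/borrow bit plays the
// role of SCC, which the following S_CSELECT turns into the second result.
#include <cstdint>
#include <utility>

static std::pair<uint32_t, bool> uaddoSketch(uint32_t a, uint32_t b, bool isAdd) {
  uint32_t res = isAdd ? a + b : a - b;      // S_ADD_U32 / S_SUB_U32
  bool carry = isAdd ? (res < a)             // unsigned overflow on add
                     : (b > a);              // unsigned borrow on sub
  return {res, carry};                       // carry is materialized via S_CSELECT
}
// End of sketch.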
6060 case AMDGPU::S_ADD_U64_PSEUDO:
6061 case AMDGPU::S_SUB_U64_PSEUDO: {
6064 case AMDGPU::V_ADD_U64_PSEUDO:
6065 case AMDGPU::V_SUB_U64_PSEUDO: {
6066 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
6072 if (ST.hasAddSubU64Insts()) {
6074 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
6075 : AMDGPU::V_SUB_U64_e64),
6080 TII->legalizeOperands(*I);
6081 MI.eraseFromParent();
6085 if (IsAdd && ST.hasLshlAddU64Inst()) {
6091 TII->legalizeOperands(*Add);
6092 MI.eraseFromParent();
6096 const auto *CarryRC = TRI->getWaveMaskRegClass();
6098 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6099 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6101 Register CarryReg = MRI.createVirtualRegister(CarryRC);
6102 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6106 : &AMDGPU::VReg_64RegClass;
6109 : &AMDGPU::VReg_64RegClass;
6112 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6114 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6117 MI,
MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6119 MI,
MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6122 MI,
MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6124 MI,
MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6127 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6134 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6148 TII->legalizeOperands(*LoHalf);
6149 TII->legalizeOperands(*HiHalf);
6150 MI.eraseFromParent();
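// Illustrative sketch (not part of this file): the V_ADD_U64/V_SUB_U64
// expansion above, i.e. a 64-bit add/sub built from two 32-bit halves with an
// explicit carry, mirroring V_ADD_CO_U32 followed by V_ADDC_U32 (and the
// SUB/SUBB pair for subtraction). Plain C++, not LLVM API.
#include <cstdint>

static uint64_t addSub64Sketch(uint64_t x, uint64_t y, bool isAdd) {
  uint32_t xLo = (uint32_t)x, xHi = (uint32_t)(x >> 32);
  uint32_t yLo = (uint32_t)y, yHi = (uint32_t)(y >> 32);
  uint32_t lo, hi;
  if (isAdd) {
    lo = xLo + yLo;                          // V_ADD_CO_U32: low half, carry out
    uint32_t carry = lo < xLo;
    hi = xHi + yHi + carry;                  // V_ADDC_U32: high half, carry in
  } else {
    lo = xLo - yLo;                          // V_SUB_CO_U32: low half, borrow out
    uint32_t borrow = xLo < yLo;
    hi = xHi - yHi - borrow;                 // V_SUBB_U32: high half, borrow in
  }
  return ((uint64_t)hi << 32) | lo;          // REG_SEQUENCE of the two halves
}
// End of sketch.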
6153 case AMDGPU::S_ADD_CO_PSEUDO:
6154 case AMDGPU::S_SUB_CO_PSEUDO: {
6165 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6166 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6171 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6172 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6176 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6178 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6183 if (ST.isWave64()) {
6184 if (ST.hasScalarCompareEq64()) {
6191 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6193 MII,
MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6195 MII,
MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6196 Register Src2_32 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6198 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::S_OR_B32), Src2_32)
6212 unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
6213 ? AMDGPU::S_ADDC_U32
6214 : AMDGPU::S_SUBB_U32;
6219 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6225 MI.eraseFromParent();
6228 case AMDGPU::SI_INIT_M0: {
6231 TII->get(M0Init.
isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6234 MI.eraseFromParent();
6237 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6240 TII->get(AMDGPU::S_CMP_EQ_U32))
6245 case AMDGPU::GET_GROUPSTATICSIZE: {
6249 .
add(
MI.getOperand(0))
6251 MI.eraseFromParent();
6254 case AMDGPU::GET_SHADERCYCLESHILO: {
6267 Register RegHi1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6269 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6270 Register RegLo1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6272 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6273 Register RegHi2 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6275 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6279 Register RegLo =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6284 .
add(
MI.getOperand(0))
6289 MI.eraseFromParent();
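// Illustrative sketch (not part of this file): the hi/lo/hi read pattern used
// above for GET_SHADERCYCLESHILO. Reading HI, then LO, then HI again lets the
// expansion produce a consistent 64-bit count even if the low 32-bit counter
// wrapped between the reads; when the two HI reads differ, the wrap just
// happened, so a low part of 0 paired with the newer HI stays consistent.
// `readHi`/`readLo` are stand-ins for the S_GETREG reads.
#include <cstdint>

static uint64_t readCycles64Sketch(uint32_t (*readHi)(), uint32_t (*readLo)()) {
  uint32_t hi1 = readHi();                   // first SHADER_CYCLES_HI read
  uint32_t lo = readLo();                    // SHADER_CYCLES read
  uint32_t hi2 = readHi();                   // second SHADER_CYCLES_HI read
  uint32_t loSel = (hi1 == hi2) ? lo : 0;    // S_CMP_EQ_U32 + S_CSELECT_B32
  return ((uint64_t)hi2 << 32) | loSel;      // REG_SEQUENCE of the two halves
}
// End of sketch.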
6292 case AMDGPU::SI_INDIRECT_SRC_V1:
6293 case AMDGPU::SI_INDIRECT_SRC_V2:
6294 case AMDGPU::SI_INDIRECT_SRC_V3:
6295 case AMDGPU::SI_INDIRECT_SRC_V4:
6296 case AMDGPU::SI_INDIRECT_SRC_V5:
6297 case AMDGPU::SI_INDIRECT_SRC_V6:
6298 case AMDGPU::SI_INDIRECT_SRC_V7:
6299 case AMDGPU::SI_INDIRECT_SRC_V8:
6300 case AMDGPU::SI_INDIRECT_SRC_V9:
6301 case AMDGPU::SI_INDIRECT_SRC_V10:
6302 case AMDGPU::SI_INDIRECT_SRC_V11:
6303 case AMDGPU::SI_INDIRECT_SRC_V12:
6304 case AMDGPU::SI_INDIRECT_SRC_V16:
6305 case AMDGPU::SI_INDIRECT_SRC_V32:
6307 case AMDGPU::SI_INDIRECT_DST_V1:
6308 case AMDGPU::SI_INDIRECT_DST_V2:
6309 case AMDGPU::SI_INDIRECT_DST_V3:
6310 case AMDGPU::SI_INDIRECT_DST_V4:
6311 case AMDGPU::SI_INDIRECT_DST_V5:
6312 case AMDGPU::SI_INDIRECT_DST_V6:
6313 case AMDGPU::SI_INDIRECT_DST_V7:
6314 case AMDGPU::SI_INDIRECT_DST_V8:
6315 case AMDGPU::SI_INDIRECT_DST_V9:
6316 case AMDGPU::SI_INDIRECT_DST_V10:
6317 case AMDGPU::SI_INDIRECT_DST_V11:
6318 case AMDGPU::SI_INDIRECT_DST_V12:
6319 case AMDGPU::SI_INDIRECT_DST_V16:
6320 case AMDGPU::SI_INDIRECT_DST_V32:
6322 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6323 case AMDGPU::SI_KILL_I1_PSEUDO:
6325 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6329 Register SrcCond = MI.getOperand(3).getReg();
6331 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6332 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6333 const auto *CondRC = TRI->getWaveMaskRegClass();
6334 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
6338 : &AMDGPU::VReg_64RegClass;
6341 : &AMDGPU::VReg_64RegClass;
6344 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6346 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6349 MI,
MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6351 MI,
MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6354 MI,
MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6356 MI,
MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6377 MI.eraseFromParent();
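// Illustrative sketch (not part of this file): the V_CNDMASK_B64_PSEUDO
// expansion above, i.e. a 64-bit per-lane select performed as two independent
// 32-bit selects on the low and high halves, then recombined. Plain C++.
#include <cstdint>

static uint64_t cndmask64Sketch(uint64_t ifFalse, uint64_t ifTrue, bool cond) {
  uint32_t lo = cond ? (uint32_t)ifTrue : (uint32_t)ifFalse;                  // V_CNDMASK_B32 on sub0
  uint32_t hi = cond ? (uint32_t)(ifTrue >> 32) : (uint32_t)(ifFalse >> 32);  // V_CNDMASK_B32 on sub1
  return ((uint64_t)hi << 32) | lo;                                           // REG_SEQUENCE
}
// End of sketch.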
6380 case AMDGPU::SI_BR_UNDEF: {
6382 .add(MI.getOperand(0));
6384 MI.eraseFromParent();
6387 case AMDGPU::ADJCALLSTACKUP:
6388 case AMDGPU::ADJCALLSTACKDOWN: {
6395 case AMDGPU::SI_CALL_ISEL: {
6396 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
6399 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6405 MI.eraseFromParent();
6408 case AMDGPU::V_ADD_CO_U32_e32:
6409 case AMDGPU::V_SUB_CO_U32_e32:
6410 case AMDGPU::V_SUBREV_CO_U32_e32: {
6412 unsigned Opc = MI.getOpcode();
6414 bool NeedClampOperand = false;
6415 if (TII->pseudoToMCOpcode(Opc) == -1) {
6417 NeedClampOperand = true;
6421 if (TII->isVOP3(*I)) {
6424 I.add(MI.getOperand(1)).add(MI.getOperand(2));
6425 if (NeedClampOperand)
6428 TII->legalizeOperands(*I);
6430 MI.eraseFromParent();
6433 case AMDGPU::V_ADDC_U32_e32:
6434 case AMDGPU::V_SUBB_U32_e32:
6435 case AMDGPU::V_SUBBREV_U32_e32:
6438 TII->legalizeOperands(MI);
6440 case AMDGPU::DS_GWS_INIT:
6441 case AMDGPU::DS_GWS_SEMA_BR:
6442 case AMDGPU::DS_GWS_BARRIER:
6443 case AMDGPU::DS_GWS_SEMA_V:
6444 case AMDGPU::DS_GWS_SEMA_P:
6445 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6453 case AMDGPU::S_SETREG_B32: {
6469 const unsigned SetMask = WidthMask <<
Offset;
6472 unsigned SetDenormOp = 0;
6473 unsigned SetRoundOp = 0;
6481 SetRoundOp = AMDGPU::S_ROUND_MODE;
6482 SetDenormOp = AMDGPU::S_DENORM_MODE;
6484 SetRoundOp = AMDGPU::S_ROUND_MODE;
6486 SetDenormOp = AMDGPU::S_DENORM_MODE;
6489 if (SetRoundOp || SetDenormOp) {
6491 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6492 unsigned ImmVal = Def->getOperand(1).getImm();
6506 MI.eraseFromParent();
6515 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
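// Illustrative sketch (not part of this file): the mask/shift arithmetic
// behind the S_SETREG_B32 handling above. A hwreg operand encodes an
// (offset, width) window into the MODE register; when the written value is a
// known immediate, the affected window can be computed directly, which is
// what allows the round/denorm cases to use S_ROUND_MODE / S_DENORM_MODE.
// Plain C++ model of the S_SETREG field update under that assumption.
#include <cstdint>

static uint32_t setregSketch(uint32_t oldMode, uint32_t immVal,
                             unsigned offset, unsigned width) {
  uint32_t widthMask = (width == 32) ? 0xffffffffu : ((1u << width) - 1);
  uint32_t setMask = widthMask << offset;             // bits of MODE being written
  return (oldMode & ~setMask) | ((immVal & widthMask) << offset);
}
// End of sketch.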
6519 case AMDGPU::S_INVERSE_BALLOT_U32:
6520 case AMDGPU::S_INVERSE_BALLOT_U64:
6523 MI.setDesc(TII->get(AMDGPU::COPY));
6525 case AMDGPU::ENDPGM_TRAP: {
6527 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
6547 MI.eraseFromParent();
6550 case AMDGPU::SIMULATED_TRAP: {
6551 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6553 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
6554 MI.eraseFromParent();
6557 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6558 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6564 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6565 Register OriginalExec = Setup->getOperand(0).getReg();
6567 MI.getOperand(0).setReg(OriginalExec);
6604 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6608 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6635 if (!Subtarget->hasMadMacF32Insts())
6636 return Subtarget->hasFastFMAF32();
6642 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6645 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6661 switch (Ty.getScalarSizeInBits()) {
6679 if (Ty.getScalarSizeInBits() == 16)
6681 if (Ty.getScalarSizeInBits() == 32)
6682 return Subtarget->hasMadMacF32Insts() &&
6692 EVT VT =
N->getValueType(0);
6694 return Subtarget->hasMadMacF32Insts() &&
6696 if (VT == MVT::f16) {
6697 return Subtarget->hasMadF16() &&
6712 unsigned Opc = Op.getOpcode();
6713 EVT VT = Op.getValueType();
6714 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6715 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6716 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6717 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6718 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6719 VT == MVT::v32bf16);
[[maybe_unused]] EVT VT = Op.getValueType();
6737 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6738 VT == MVT::v16i32) &&
6739 "Unexpected ValueType.");
6748 unsigned Opc = Op.getOpcode();
6749 EVT VT = Op.getValueType();
6750 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6751 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6752 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6753 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6754 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6755 VT == MVT::v32bf16);
6763 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
6765 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
6772 unsigned Opc = Op.getOpcode();
6773 EVT VT = Op.getValueType();
6774 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6775 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6776 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6777 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6778 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6779 VT == MVT::v32bf16);
6784 : std::pair(Op0, Op0);
6793 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
6795 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
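// Illustrative sketch (not part of this file): the lo/hi splitting performed
// by splitBinaryVectorOp / splitTernaryVectorOp above, shown on plain arrays
// instead of SDValues. A wide vector op is run as the same op on the two
// halves and the results are concatenated again.
#include <cstddef>
#include <functional>
#include <vector>

static std::vector<float>
splitBinaryOpSketch(const std::vector<float> &a, const std::vector<float> &b,
                    const std::function<float(float, float)> &op) {
  size_t half = a.size() / 2;
  std::vector<float> out(a.size());
  for (size_t i = 0; i < half; ++i)          // the op applied to (Lo0, Lo1)
    out[i] = op(a[i], b[i]);
  for (size_t i = half; i < a.size(); ++i)   // the op applied to (Hi0, Hi1)
    out[i] = op(a[i], b[i]);
  return out;                                // CONCAT_VECTORS of the halves
}
// End of sketch.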
6801 switch (Op.getOpcode()) {
6805 return LowerBRCOND(Op, DAG);
6807 return LowerRETURNADDR(Op, DAG);
6810 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6811 "Load should return a value and a chain");
6815 EVT VT = Op.getValueType();
6817 return lowerFSQRTF32(Op, DAG);
6819 return lowerFSQRTF64(Op, DAG);
6824 return LowerTrig(Op, DAG);
6826 return LowerSELECT(Op, DAG);
6828 return LowerFDIV(Op, DAG);
6830 return LowerFFREXP(Op, DAG);
6832 return LowerATOMIC_CMP_SWAP(Op, DAG);
6834 return LowerSTORE(Op, DAG);
6838 return LowerGlobalAddress(MFI, Op, DAG);
6841 return LowerExternalSymbol(Op, DAG);
6843 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6845 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6847 return LowerINTRINSIC_VOID(Op, DAG);
6849 return lowerADDRSPACECAST(Op, DAG);
6851 return lowerINSERT_SUBVECTOR(Op, DAG);
6853 return lowerINSERT_VECTOR_ELT(Op, DAG);
6855 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
6857 return lowerVECTOR_SHUFFLE(Op, DAG);
6859 return lowerSCALAR_TO_VECTOR(Op, DAG);
6861 return lowerBUILD_VECTOR(Op, DAG);
6864 return lowerFP_ROUND(Op, DAG);
6866 return lowerTRAP(Op, DAG);
6868 return lowerDEBUGTRAP(Op, DAG);
6877 return lowerFMINNUM_FMAXNUM(Op, DAG);
6880 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
6883 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
6886 return lowerFLDEXP(Op, DAG);
6892 Op.getValueType() == MVT::i16 &&
6893 Op.getOperand(0).getValueType() == MVT::f32) {
6917 return lowerFCOPYSIGN(Op, DAG);
6919 return lowerMUL(Op, DAG);
6922 return lowerXMULO(Op, DAG);
6925 return lowerXMUL_LOHI(Op, DAG);
6960 EVT FittingLoadVT = LoadVT;
6992SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6995 bool IsIntrinsic) const {
6998 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6999 EVT LoadVT = M->getValueType(0);
7001 EVT EquivLoadVT = LoadVT;
7015 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
7019 M->getMemoryVT(), M->getMemOperand());
7030 EVT LoadVT = M->getValueType(0);
7036 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
7037 bool IsTFE = M->getNumValues() == 3;
7039 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
7040 : AMDGPUISD::BUFFER_LOAD_FORMAT)
7041 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
7042 : AMDGPUISD::BUFFER_LOAD;
7045 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG,
Ops);
7050 return handleByteShortBufferLoads(DAG, LoadVT,
DL,
Ops,
M->getMemOperand(),
7054 return getMemIntrinsicNode(
Opc,
DL,
M->getVTList(),
Ops, IntVT,
7055 M->getMemOperand(), DAG);
7059 SDVTList VTList = DAG.
getVTList(CastVT, MVT::Other);
7061 M->getMemOperand(), DAG);
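// Illustrative sketch (not part of this file): the packed vs. unpacked D16
// handling behind adjustLoadValueType above. With unpacked D16 memory ops
// each 16-bit element comes back in its own 32-bit dword and must be
// truncated; with packed D16 two elements share one dword. Plain C++ on
// arrays, under that assumption.
#include <cstdint>
#include <vector>

static std::vector<uint16_t> unpackD16Sketch(const std::vector<uint32_t> &dwords,
                                             unsigned numElts, bool unpacked) {
  std::vector<uint16_t> out(numElts);
  for (unsigned i = 0; i < numElts; ++i) {
    if (unpacked)
      out[i] = (uint16_t)dwords[i];                        // one element per dword
    else
      out[i] = (uint16_t)(dwords[i / 2] >> ((i & 1) * 16)); // two elements per dword
  }
  return out;
}
// End of sketch.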
7069 EVT VT = N->getValueType(0);
7070 unsigned CondCode = N->getConstantOperandVal(3);
7081 EVT CmpVT = LHS.getValueType();
7082 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
7083 unsigned PromoteOp =
7103 EVT VT = N->getValueType(0);
7105 unsigned CondCode = N->getConstantOperandVal(3);
7114 if (CmpVT == MVT::f16 && !TLI.
isTypeLegal(CmpVT)) {
7123 SDValue SetCC = DAG.
getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7132 EVT VT =
N->getValueType(0);
7156 Exec = AMDGPU::EXEC_LO;
7158 Exec = AMDGPU::EXEC;
7175 EVT VT = N->getValueType(0);
7177 unsigned IID = N->getConstantOperandVal(0);
7178 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7179 IID == Intrinsic::amdgcn_permlanex16;
7180 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7181 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7185 unsigned SplitSize = 32;
7186 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7187 ST->hasDPALU_DPP() &&
7195 case Intrinsic::amdgcn_permlane16:
7196 case Intrinsic::amdgcn_permlanex16:
7197 case Intrinsic::amdgcn_update_dpp:
7202 case Intrinsic::amdgcn_writelane:
7205 case Intrinsic::amdgcn_readlane:
7206 case Intrinsic::amdgcn_set_inactive:
7207 case Intrinsic::amdgcn_set_inactive_chain_arg:
7208 case Intrinsic::amdgcn_mov_dpp8:
7211 case Intrinsic::amdgcn_readfirstlane:
7212 case Intrinsic::amdgcn_permlane64:
7220 std::reverse(Operands.begin(), Operands.end());
7222 if (SDNode *GL = N->getGluedNode()) {
7224 GL = GL->getOperand(0).getNode();
7234 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7235 IID == Intrinsic::amdgcn_mov_dpp8 ||
7236 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7237 Src1 = N->getOperand(2);
7238 if (IID == Intrinsic::amdgcn_writelane ||
7239 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7240 Src2 = N->getOperand(3);
7243 if (ValSize == SplitSize) {
7253 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7258 if (IID == Intrinsic::amdgcn_writelane) {
7263 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7265 return IsFloat ? DAG.
getBitcast(VT, Trunc) : Trunc;
7268 if (ValSize % SplitSize != 0)
7272 EVT VT = N->getValueType(0);
7276 unsigned NumOperands = N->getNumOperands();
7278 SDNode *GL = N->getGluedNode();
7283 for (unsigned i = 0; i != NE; ++i) {
7284 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7286 SDValue Operand = N->getOperand(j);
7295 Operands[j] = Operand;
7300 Operands[NumOperands - 1] =
7316 if (SplitSize == 32) {
7318 return unrollLaneOp(LaneOp.getNode());
7324 unsigned SubVecNumElt =
7328 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7329 for (
unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7333 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7338 if (IID == Intrinsic::amdgcn_writelane)
7343 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7344 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7345 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7346 EltIdx += SubVecNumElt;
7360 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7363 if (IID == Intrinsic::amdgcn_writelane)
7366 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7373 EVT VT =
N->getValueType(0);
7391 auto MakeIntrinsic = [&DAG, &SL](
unsigned IID,
MVT RetVT,
7395 Operands.
append(IntrinArgs);
7401 SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
7402 {ShiftedIndex, ValueI32});
7412 SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
7413 {ValueI32, PoisonVal});
7414 SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
7415 {ShiftedIndex, PoisonVal});
7418 MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});
7421 SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
7422 {WWMIndex, WWMValue});
7423 SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
7424 MVT::i32, {WWMIndex, Swapped});
7426 MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});
7434 MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
7442 DAG.
getSetCC(SL, MVT::i1, SameOrOtherHalf,
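// Illustrative sketch (not part of this file): the wave64 shuffle strategy
// visible above, modeled on arrays. ds_bpermute only routes within a 32-lane
// half, so the lowering combines a same-half bpermute with a bpermute of the
// permlane64-swapped values and selects per lane on whether the source lane
// lives in the same half or the other half. Plain C++.
#include <cstdint>

static void waveShuffleSketch(const uint32_t value[64], const uint32_t srcLane[64],
                              uint32_t out[64]) {
  uint32_t swapped[64];
  for (unsigned lane = 0; lane < 64; ++lane)        // permlane64: swap the 32-lane halves
    swapped[lane] = value[lane ^ 32];
  for (unsigned lane = 0; lane < 64; ++lane) {
    unsigned src = srcLane[lane] & 63;
    unsigned withinHalf = (lane & 32) | (src & 31); // ds_bpermute stays in this half
    bool sameHalf = ((src ^ lane) & 32) == 0;
    uint32_t sameHalfVal = value[withinHalf];       // bpermute of the original values
    uint32_t otherHalfVal = swapped[withinHalf];    // bpermute of the swapped values
    out[lane] = sameHalf ? sameHalfVal : otherHalfVal; // "same or other half" select
  }
}
// End of sketch.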
7452 switch (N->getOpcode()) {
7464 unsigned IID = N->getConstantOperandVal(0);
7466 case Intrinsic::amdgcn_make_buffer_rsrc:
7467 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
7469 case Intrinsic::amdgcn_cvt_pkrtz: {
7474 DAG.
getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
7478 case Intrinsic::amdgcn_cvt_pknorm_i16:
7479 case Intrinsic::amdgcn_cvt_pknorm_u16:
7480 case Intrinsic::amdgcn_cvt_pk_i16:
7481 case Intrinsic::amdgcn_cvt_pk_u16: {
7487 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7488 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
7489 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7490 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
7491 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7492 Opcode = AMDGPUISD::CVT_PK_I16_I32;
7494 Opcode = AMDGPUISD::CVT_PK_U16_U32;
7496 EVT VT =
N->getValueType(0);
7505 case Intrinsic::amdgcn_s_buffer_load: {
7511 if (!Subtarget->hasScalarSubwordLoads())
7517 EVT VT =
Op.getValueType();
7518 assert(VT == MVT::i8 &&
"Expected 8-bit s_buffer_load intrinsics.\n");
7530 if (!
Offset->isDivergent()) {
7549 LoadVal = handleByteShortBufferLoads(DAG, VT,
DL,
Ops, MMO);
7554 case Intrinsic::amdgcn_dead: {
7555 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
7566 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
7567 Results.push_back(Res.getOperand(I));
7571 Results.push_back(Res.getValue(1));
7580 EVT VT = N->getValueType(0);
7585 EVT SelectVT = NewVT;
7586 if (NewVT.bitsLT(MVT::i32)) {
7589 SelectVT = MVT::i32;
7595 if (NewVT != SelectVT)
7601 if (N->getValueType(0) != MVT::v2f16)
7613 if (N->getValueType(0) != MVT::v2f16)
7625 if (N->getValueType(0) != MVT::f16)
7640 if (U.get() != Value)
7643 if (U.getUser()->getOpcode() == Opcode)
7649unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7652 case Intrinsic::amdgcn_if:
7653 return AMDGPUISD::IF;
7654 case Intrinsic::amdgcn_else:
7655 return AMDGPUISD::ELSE;
7656 case Intrinsic::amdgcn_loop:
7657 return AMDGPUISD::LOOP;
7658 case Intrinsic::amdgcn_end_cf:
7678 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7705 SDNode *Intr = BRCOND.getOperand(1).getNode();
7722 Intr =
LHS.getNode();
7730 assert(BR &&
"brcond missing unconditional branch user");
7735 unsigned CFNode = isCFIntrinsic(Intr);
7755 Ops.push_back(Target);
7778 for (
unsigned i = 1, e = Intr->
getNumValues() - 1; i != e; ++i) {
7797 MVT VT =
Op.getSimpleValueType();
7800 if (
Op.getConstantOperandVal(0) != 0)
7804 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
7806 if (
Info->isEntryFunction())
7823 return Op.getValueType().bitsLE(VT)
7831 EVT DstVT = Op.getValueType();
7838 unsigned Opc = Op.getOpcode();
7850 EVT SrcVT = Src.getValueType();
7851 EVT DstVT = Op.getValueType();
7854 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
7857 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
7864 if (DstVT == MVT::f16) {
7869 if (!Subtarget->has16BitInsts()) {
7874 if (
Op->getFlags().hasApproximateFuncs()) {
7885 "custom lower FP_ROUND for f16 or bf16");
7886 assert(Subtarget->hasBF16ConversionInsts() &&
"f32 -> bf16 is legal");
7898 EVT VT = Op.getValueType();
7900 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7901 bool IsIEEEMode = Info->getMode().IEEE;
7910 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7917SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
7919 EVT VT = Op.getValueType();
7921 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7922 bool IsIEEEMode = Info->getMode().IEEE;
7927 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7935 EVT VT =
Op.getValueType();
7939 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7940 !Subtarget->hasMinimum3Maximum3F16() &&
7941 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7942 "should not need to widen f16 minimum/maximum to v2f16");
7956 DAG.
getNode(
Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7964 EVT VT =
Op.getValueType();
7968 EVT ExpVT =
Exp.getValueType();
7969 if (ExpVT == MVT::i16)
7990 {
Op.getOperand(0),
Op.getOperand(1), TruncExp});
7997 switch (Op->getOpcode()) {
8027 DAGCombinerInfo &DCI) const {
8028 const unsigned Opc = Op.getOpcode();
8036 : Op->getOperand(0).getValueType();
8037 auto &DAG = DCI.DAG;
8040 if (DCI.isBeforeLegalizeOps() ||
8048 LHS = Op->getOperand(1);
8049 RHS = Op->getOperand(2);
8051 LHS = Op->getOperand(0);
8052 RHS = Op->getOperand(1);
8091 if (MagVT == SignVT)
8108 EVT VT = Op.getValueType();
8114 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
8141 if (Op->isDivergent())
8154 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
8156 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
8159 if (Op0SignBits >= 33 && Op1SignBits >= 33)
8161 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
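// Illustrative sketch (not part of this file): the known-bits check used
// above for 64-bit scalar multiplies. When both operands are provably 32-bit
// (at least 32 leading zeros, or at least 33 sign bits for the signed form),
// the 64-bit product can be formed from a single 32x32 -> 64 multiply.
// Plain C++; the result is the same modulo 2^64 on every path.
#include <cstdint>

static uint64_t mul64Sketch(uint64_t a, uint64_t b) {
  if ((a >> 32) == 0 && (b >> 32) == 0)                // S_MUL_U64_U32 path
    return (uint64_t)(uint32_t)a * (uint32_t)b;
  int64_t sa = (int64_t)a, sb = (int64_t)b;
  if (sa == (int32_t)sa && sb == (int32_t)sb)          // S_MUL_I64_I32 path
    return (uint64_t)((int64_t)(int32_t)sa * (int64_t)(int32_t)sb);
  return a * b;                                        // general 64-bit multiply
}
// End of sketch.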
8167 EVT VT = Op.getValueType();
8174 const APInt &C = RHSC->getAPIntValue();
8176 if (C.isPowerOf2()) {
8178 bool UseArithShift = isSigned && !C.isMinSignedValue();
8205 if (Op->isDivergent()) {
8209 if (Subtarget->hasSMulHi()) {
8220 if (!Subtarget->isTrapHandlerEnabled() ||
8222 return lowerTrapEndpgm(Op, DAG);
8224 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
8225 : lowerTrapHsaQueuePtr(Op, DAG);
8231 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8235SITargetLowering::loadImplicitKernelArgument(
SelectionDAG &DAG,
MVT VT,
8237 ImplicitParameter Param)
const {
8241 MachinePointerInfo PtrInfo =
8258 loadImplicitKernelArgument(DAG, MVT::i64, SL,
Align(8),
QUEUE_PTR);
8261 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8264 if (UserSGPR == AMDGPU::NoRegister) {
8281 return DAG.
getNode(AMDGPUISD::TRAP, SL, MVT::Other,
Ops);
8290 if (Subtarget->hasPrivEnabledTrap2NopBug())
8291 return DAG.
getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
8295 return DAG.
getNode(AMDGPUISD::TRAP, SL, MVT::Other,
Ops);
8303 if (!Subtarget->isTrapHandlerEnabled() ||
8307 "debugtrap handler not supported",
8315 return DAG.
getNode(AMDGPUISD::TRAP, SL, MVT::Other,
Ops);
8318SDValue SITargetLowering::getSegmentAperture(
unsigned AS,
const SDLoc &
DL,
8320 if (Subtarget->hasApertureRegs()) {
8322 ? AMDGPU::SRC_SHARED_BASE
8323 : AMDGPU::SRC_PRIVATE_BASE;
8324 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8325 !Subtarget->hasGloballyAddressableScratch()) &&
8326 "Cannot use src_private_base with globally addressable scratch!");
8347 return loadImplicitKernelArgument(DAG, MVT::i32,
DL,
Align(4), Param);
8351 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8353 if (UserSGPR == AMDGPU::NoRegister) {
8398 const AMDGPUTargetMachine &TM =
8401 unsigned DestAS, SrcAS;
8403 bool IsNonNull =
false;
8405 SrcAS = ASC->getSrcAddressSpace();
8406 Src = ASC->getOperand(0);
8407 DestAS = ASC->getDestAddressSpace();
8410 Op.getConstantOperandVal(0) ==
8411 Intrinsic::amdgcn_addrspacecast_nonnull);
8412 Src =
Op->getOperand(1);
8413 SrcAS =
Op->getConstantOperandVal(2);
8414 DestAS =
Op->getConstantOperandVal(3);
8427 Subtarget->hasGloballyAddressableScratch()) {
8432 AMDGPU::S_MOV_B32, SL, MVT::i32,
8433 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8441 unsigned NullVal = TM.getNullPointerValue(DestAS);
8456 Subtarget->hasGloballyAddressableScratch()) {
8465 if (Subtarget->isWave64())
8471 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
8479 AMDGPU::S_MOV_B64, SL, MVT::i64,
8480 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8482 CvtPtr = DAG.
getNode(
ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8484 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8492 unsigned NullVal = TM.getNullPointerValue(SrcAS);
8504 Op.getValueType() == MVT::i64) {
8505 const SIMachineFunctionInfo *
Info =
8507 if (
Info->get32BitAddressHighBits() == 0)
8516 Src.getValueType() == MVT::i64)
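// Illustrative sketch (not part of this file): the flat <-> segment pointer
// casts handled above, using the 32-bit aperture base for LDS/private. A
// segment pointer becomes flat by pairing it with the aperture in the high
// dword; a flat pointer becomes a segment pointer by keeping its low dword.
// Null is checked explicitly so each space's null value is preserved.
// Plain C++, ignoring the globally addressable scratch special case.
#include <cstdint>

static uint64_t segmentToFlatSketch(uint32_t segPtr, uint32_t apertureHi,
                                    uint32_t segNull, uint64_t flatNull) {
  if (segPtr == segNull)                        // keep null as null
    return flatNull;
  return ((uint64_t)apertureHi << 32) | segPtr; // BUILD_PAIR(ptr, aperture)
}

static uint32_t flatToSegmentSketch(uint64_t flatPtr, uint64_t flatNull,
                                    uint32_t segNull) {
  if (flatPtr == flatNull)
    return segNull;
  return (uint32_t)flatPtr;                     // truncate to the low 32 bits
}
// End of sketch.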
8544 assert(InsNumElts % 2 == 0 &&
"expect legal vector types");
8549 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8551 MVT::i32, InsNumElts / 2);
8556 for (
unsigned I = 0;
I != InsNumElts / 2; ++
I) {
8558 if (InsNumElts == 2) {
8571 for (
unsigned I = 0;
I != InsNumElts; ++
I) {
8594 if (NumElts == 4 && EltSize == 16 && KIdx) {
8605 unsigned Idx = KIdx->getZExtValue();
8606 bool InsertLo = Idx < 2;
8610 DAG.
getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8616 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8629 assert(VecSize <= 64 &&
"Expected target vector size to be <= 64 bits");
8664 EVT ResultVT =
Op.getValueType();
8677 if (
SDValue Combined = performExtractVectorEltCombine(
Op.getNode(), DCI))
8680 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8684 if (VecSize == 128) {
8692 }
else if (VecSize == 256) {
8695 for (
unsigned P = 0;
P < 4; ++
P) {
8701 Parts[0], Parts[1]));
8703 Parts[2], Parts[3]));
8709 for (
unsigned P = 0;
P < 8; ++
P) {
8716 Parts[0], Parts[1], Parts[2], Parts[3]));
8719 Parts[4], Parts[5], Parts[6], Parts[7]));
8739 Src = DAG.
getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8754 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8764 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8769 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8770 !(Mask[Elt + 1] & 1);
8776 EVT ResultVT =
Op.getValueType();
8779 const int NewSrcNumElts = 2;
8781 int SrcNumElts =
Op.getOperand(0).getValueType().getVectorNumElements();
8797 const bool ShouldUseConsecutiveExtract = EltVT.
getSizeInBits() == 16;
8819 if (ShouldUseConsecutiveExtract &&
8822 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8823 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8835 if (Idx0 >= SrcNumElts) {
8840 if (Idx1 >= SrcNumElts) {
8845 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8846 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8854 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8855 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8860 if (SubVec0 != SubVec1) {
8861 NewMaskIdx1 += NewSrcNumElts;
8868 {NewMaskIdx0, NewMaskIdx1});
8873 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8874 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8875 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8876 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
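// Illustrative sketch (not part of this file): the index bookkeeping used by
// the VECTOR_SHUFFLE lowering above. A shuffle mask entry addresses the two
// concatenated inputs, so each index is decomposed into (which source vector,
// element within that source); negative entries mean "don't care". Plain C++.
static void decomposeShuffleIdxSketch(int maskIdx, int srcNumElts,
                                      int &vecIdx, int &eltIdx) {
  if (maskIdx < 0) {                     // undef lane: either source, any element
    vecIdx = 0;
    eltIdx = 0;
    return;
  }
  vecIdx = maskIdx < srcNumElts ? 0 : 1; // first or second input vector
  eltIdx = maskIdx < srcNumElts ? maskIdx : maskIdx - srcNumElts;
}
// End of sketch.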
8895 EVT ResultVT =
Op.getValueType();
8911 EVT VT =
Op.getValueType();
8913 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8914 assert(!Subtarget->hasVOP3PInsts() &&
"this should be legal");
8948 for (
unsigned P = 0;
P < NumParts; ++
P) {
8950 PartVT, SL, {
Op.getOperand(
P * 2),
Op.getOperand(
P * 2 + 1)});
8969 if (!Subtarget->isAmdHsaOS())
9012 return DAG.
getNode(AMDGPUISD::PC_ADD_REL_OFFSET64,
DL, PtrVT, Ptr);
9021 return DAG.
getNode(AMDGPUISD::PC_ADD_REL_OFFSET,
DL, PtrVT, PtrLo, PtrHi);
9029 EVT PtrVT =
Op.getValueType();
9031 const GlobalValue *GV = GSD->
getGlobal();
9045 assert(PtrVT == MVT::i32 &&
"32-bit pointer is expected.");
9060 return DAG.
getNode(AMDGPUISD::LDS,
DL, MVT::i32, GA);
9063 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
9064 if (Subtarget->has64BitLiterals()) {
9095 MachinePointerInfo PtrInfo =
9108 Fn,
"unsupported external symbol",
Op.getDebugLoc()));
9132 SDValue Param = lowerKernargMemParameter(
9143 "non-hsa intrinsic with hsa target",
DL.getDebugLoc()));
9151 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
9159 unsigned NumElts = Elts.
size();
9161 if (NumElts <= 12) {
9170 for (
unsigned i = 0; i < Elts.
size(); ++i) {
9176 for (
unsigned i = Elts.
size(); i < NumElts; ++i)
9186 EVT SrcVT = Src.getValueType();
9207 bool Unpacked,
bool IsD16,
int DMaskPop,
9208 int NumVDataDwords,
bool IsAtomicPacked16Bit,
9212 EVT ReqRetVT = ResultTypes[0];
9214 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9215 ? (ReqRetNumElts + 1) / 2
9218 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9229 if (DMaskPop > 0 &&
Data.getValueType() != MaskPopVT) {
9240 if (DataDwordVT.
isVector() && !IsAtomicPacked16Bit)
9242 NumDataDwords - MaskPopDwords);
9247 EVT LegalReqRetVT = ReqRetVT;
9249 if (!
Data.getValueType().isInteger())
9251 Data.getValueType().changeTypeToInteger(),
Data);
9272 if (Result->getNumValues() == 1)
9279 SDValue *LWE,
bool &IsTexFail) {
9299 unsigned DimIdx,
unsigned EndIdx,
9300 unsigned NumGradients) {
9302 for (
unsigned I = DimIdx;
I < EndIdx;
I++) {
9310 if (((
I + 1) >= EndIdx) ||
9311 ((NumGradients / 2) % 2 == 1 && (
I == DimIdx + (NumGradients / 2) - 1 ||
9312 I == DimIdx + NumGradients - 1))) {
9334 !
Op.getNode()->hasAnyUseOfValue(0))
9336 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9346 ResultTypes.erase(&ResultTypes[0]);
9352 int NumVDataDwords = 0;
9353 bool AdjustRetType =
false;
9354 bool IsAtomicPacked16Bit =
false;
9357 const unsigned ArgOffset = WithChain ? 2 : 1;
9360 unsigned DMaskLanes = 0;
9362 if (BaseOpcode->
Atomic) {
9363 VData =
Op.getOperand(2);
9365 IsAtomicPacked16Bit =
9366 (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9367 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
9368 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
9369 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
9380 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9382 DMask = Is64Bit ? 0xf : 0x3;
9383 NumVDataDwords = Is64Bit ? 4 : 2;
9385 DMask = Is64Bit ? 0x3 : 0x1;
9386 NumVDataDwords = Is64Bit ? 2 : 1;
9389 DMask =
Op->getConstantOperandVal(ArgOffset + Intr->
DMaskIndex);
9392 if (BaseOpcode->
Store) {
9393 VData =
Op.getOperand(2);
9397 if (!Subtarget->hasD16Images() || !BaseOpcode->
HasD16)
9401 VData = handleD16VData(VData, DAG,
true);
9404 NumVDataDwords = (VData.
getValueType().getSizeInBits() + 31) / 32;
9405 }
else if (!BaseOpcode->
NoReturn) {
9410 if (!Subtarget->hasD16Images() || !BaseOpcode->
HasD16)
9418 (!LoadVT.
isVector() && DMaskLanes > 1))
9424 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9425 !(BaseOpcode->
Gather4 && Subtarget->hasImageGather4D16Bug()))
9426 NumVDataDwords = (DMaskLanes + 1) / 2;
9428 NumVDataDwords = DMaskLanes;
9430 AdjustRetType =
true;
9434 unsigned VAddrEnd = ArgOffset + Intr->
VAddrEnd;
9441 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9442 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9444 VAddrVT =
Op.getOperand(ArgOffset + Intr->
CoordStart).getSimpleValueType();
9446 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9447 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9451 if (IsA16 && (
Op.getOperand(ArgOffset +
I).getValueType() == MVT::f16)) {
9457 {
Op.getOperand(ArgOffset +
I), DAG.
getPOISON(MVT::f16)});
9461 "Bias needs to be converted to 16 bit in A16 mode");
9466 if (BaseOpcode->
Gradients && !
ST->hasG16() && (IsA16 != IsG16)) {
9470 dbgs() <<
"Failed to lower image intrinsic: 16 bit addresses "
9471 "require 16 bit args for both gradients and addresses");
9476 if (!
ST->hasA16()) {
9477 LLVM_DEBUG(
dbgs() <<
"Failed to lower image intrinsic: Target does not "
9478 "support 16 bit addresses\n");
9488 if (BaseOpcode->
Gradients && IsG16 &&
ST->hasG16()) {
9490 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9492 IntrOpcode = G16MappingInfo->
G16;
9515 for (
unsigned I = ArgOffset + Intr->
CoordStart;
I < VAddrEnd;
I++)
9533 const unsigned NSAMaxSize =
ST->getNSAMaxSize(BaseOpcode->
Sampler);
9534 const bool HasPartialNSAEncoding =
ST->hasPartialNSAEncoding();
9535 const bool UseNSA =
ST->hasNSAEncoding() &&
9536 VAddrs.
size() >=
ST->getNSAThreshold(MF) &&
9537 (VAddrs.
size() <= NSAMaxSize || HasPartialNSAEncoding);
9538 const bool UsePartialNSA =
9539 UseNSA && HasPartialNSAEncoding && VAddrs.
size() > NSAMaxSize;
9542 if (UsePartialNSA) {
9544 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9545 }
else if (!UseNSA) {
9555 uint64_t UnormConst =
9556 Op.getConstantOperandVal(ArgOffset + Intr->
UnormIndex);
9558 Unorm = UnormConst ? True : False;
9564 bool IsTexFail =
false;
9565 if (!
parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9576 NumVDataDwords += 1;
9577 AdjustRetType =
true;
9582 if (AdjustRetType) {
9585 if (DMaskLanes == 0 && !BaseOpcode->
Store) {
9594 MVT::i32, NumVDataDwords)
9597 ResultTypes[0] = NewVT;
9598 if (ResultTypes.size() == 3) {
9602 ResultTypes.erase(&ResultTypes[1]);
9616 Ops.push_back(VData);
9617 if (UsePartialNSA) {
9619 Ops.push_back(VAddr);
9623 Ops.push_back(VAddr);
9626 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9628 Ops.push_back(Rsrc);
9633 Ops.push_back(Samp);
9638 if (!IsGFX12Plus || BaseOpcode->
Sampler || BaseOpcode->
MSAA)
9639 Ops.push_back(Unorm);
9641 Ops.push_back(IsA16 &&
9642 ST->hasFeature(AMDGPU::FeatureR128A16)
9646 Ops.push_back(IsA16 ? True : False);
9648 if (!Subtarget->hasGFX90AInsts())
9653 "TFE is not supported on this GPU",
DL.getDebugLoc()));
9656 if (!IsGFX12Plus || BaseOpcode->
Sampler || BaseOpcode->
MSAA)
9659 Ops.push_back(DimInfo->
DA ? True : False);
9661 Ops.push_back(IsD16 ? True : False);
9663 Ops.push_back(
Op.getOperand(0));
9665 int NumVAddrDwords =
9671 NumVDataDwords, NumVAddrDwords);
9672 }
else if (IsGFX11Plus) {
9674 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9675 : AMDGPU::MIMGEncGfx11Default,
9676 NumVDataDwords, NumVAddrDwords);
9677 }
else if (IsGFX10Plus) {
9679 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9680 : AMDGPU::MIMGEncGfx10Default,
9681 NumVDataDwords, NumVAddrDwords);
9683 if (Subtarget->hasGFX90AInsts()) {
9685 NumVDataDwords, NumVAddrDwords);
9689 "requested image instruction is not supported on this GPU",
9694 for (EVT VT : OrigResultTypes) {
9695 if (VT == MVT::Other)
9696 RetValues[Idx++] =
Op.getOperand(0);
9707 NumVDataDwords, NumVAddrDwords);
9710 NumVDataDwords, NumVAddrDwords);
9717 MachineMemOperand *MemRef = MemOp->getMemOperand();
9736 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9737 NumVDataDwords, IsAtomicPacked16Bit,
DL);
9750 MachinePointerInfo(),
9755 if (!
Offset->isDivergent()) {
9762 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9771 !Subtarget->hasScalarDwordx3Loads()) {
9775 AMDGPUISD::SBUFFER_LOAD,
DL, DAG.
getVTList(WidenedVT),
Ops, WidenedVT,
9798 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9800 return handleByteShortBufferLoads(DAG, VT,
DL,
Ops, MMO);
9804 unsigned NumLoads = 1;
9810 if (NumElts == 8 || NumElts == 16) {
9811 NumLoads = NumElts / 4;
9815 SDVTList VTList = DAG.
getVTList({LoadVT, MVT::Other});
9820 NumLoads > 1 ?
Align(16 * NumLoads) :
Align(4));
9822 uint64_t InstOffset =
Ops[5]->getAsZExtVal();
9823 for (
unsigned i = 0; i < NumLoads; ++i) {
9825 Loads.
push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD,
DL, VTList,
Ops,
9829 if (NumElts == 8 || NumElts == 16)
9837 if (!Subtarget->hasArchitectedSGPRs())
9842 return DAG.
getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
9849 unsigned Width)
const {
9851 using namespace AMDGPU::Hwreg;
9853 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
9892 auto *MFI = MF.
getInfo<SIMachineFunctionInfo>();
9894 EVT VT =
Op.getValueType();
9896 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
9900 switch (IntrinsicID) {
9901 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9904 return getPreloadedValue(DAG, *MFI, VT,
9907 case Intrinsic::amdgcn_dispatch_ptr:
9908 case Intrinsic::amdgcn_queue_ptr: {
9909 if (!Subtarget->isAmdHsaOrMesa(MF.
getFunction())) {
9911 MF.
getFunction(),
"unsupported hsa intrinsic without hsa target",
9916 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9919 return getPreloadedValue(DAG, *MFI, VT, RegID);
9921 case Intrinsic::amdgcn_implicitarg_ptr: {
9923 return getImplicitArgPtr(DAG,
DL);
9924 return getPreloadedValue(DAG, *MFI, VT,
9927 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9933 return getPreloadedValue(DAG, *MFI, VT,
9936 case Intrinsic::amdgcn_dispatch_id: {
9939 case Intrinsic::amdgcn_rcp:
9940 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
9941 case Intrinsic::amdgcn_rsq:
9942 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9943 case Intrinsic::amdgcn_rsq_legacy:
9947 case Intrinsic::amdgcn_rcp_legacy:
9950 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
9951 case Intrinsic::amdgcn_rsq_clamp: {
9953 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
9965 case Intrinsic::r600_read_ngroups_x:
9966 if (Subtarget->isAmdHsaOS())
9969 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9972 case Intrinsic::r600_read_ngroups_y:
9973 if (Subtarget->isAmdHsaOS())
9976 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9979 case Intrinsic::r600_read_ngroups_z:
9980 if (Subtarget->isAmdHsaOS())
9983 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9986 case Intrinsic::r600_read_local_size_x:
9987 if (Subtarget->isAmdHsaOS())
9990 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9992 case Intrinsic::r600_read_local_size_y:
9993 if (Subtarget->isAmdHsaOS())
9996 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9998 case Intrinsic::r600_read_local_size_z:
9999 if (Subtarget->isAmdHsaOS())
10002 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
10004 case Intrinsic::amdgcn_workgroup_id_x:
10005 return lowerWorkGroupId(DAG, *MFI, VT,
10009 case Intrinsic::amdgcn_workgroup_id_y:
10010 return lowerWorkGroupId(DAG, *MFI, VT,
10014 case Intrinsic::amdgcn_workgroup_id_z:
10015 return lowerWorkGroupId(DAG, *MFI, VT,
10019 case Intrinsic::amdgcn_cluster_id_x:
10020 return Subtarget->hasClusters()
10021 ? getPreloadedValue(DAG, *MFI, VT,
10023 : DAG.getPOISON(VT);
10024 case Intrinsic::amdgcn_cluster_id_y:
10025 return Subtarget->hasClusters()
10026 ? getPreloadedValue(DAG, *MFI, VT,
10029 case Intrinsic::amdgcn_cluster_id_z:
10030 return Subtarget->hasClusters()
10031 ? getPreloadedValue(DAG, *MFI, VT,
10034 case Intrinsic::amdgcn_cluster_workgroup_id_x:
10035 return Subtarget->hasClusters()
10036 ? getPreloadedValue(
10040 case Intrinsic::amdgcn_cluster_workgroup_id_y:
10041 return Subtarget->hasClusters()
10042 ? getPreloadedValue(
10046 case Intrinsic::amdgcn_cluster_workgroup_id_z:
10047 return Subtarget->hasClusters()
10048 ? getPreloadedValue(
10052 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
10053 return Subtarget->hasClusters()
10056 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
10057 return Subtarget->hasClusters()
10058 ? getPreloadedValue(
10062 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
10063 return Subtarget->hasClusters()
10064 ? getPreloadedValue(
10068 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
10069 return Subtarget->hasClusters()
10070 ? getPreloadedValue(
10074 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
10075 return Subtarget->hasClusters()
10076 ? getPreloadedValue(
10080 case Intrinsic::amdgcn_wave_id:
10081 return lowerWaveID(DAG,
Op);
10082 case Intrinsic::amdgcn_lds_kernel_id: {
10084 return getLDSKernelId(DAG,
DL);
10085 return getPreloadedValue(DAG, *MFI, VT,
10088 case Intrinsic::amdgcn_workitem_id_x:
10089 return lowerWorkitemID(DAG,
Op, 0, MFI->getArgInfo().WorkItemIDX);
10090 case Intrinsic::amdgcn_workitem_id_y:
10091 return lowerWorkitemID(DAG,
Op, 1, MFI->getArgInfo().WorkItemIDY);
10092 case Intrinsic::amdgcn_workitem_id_z:
10093 return lowerWorkitemID(DAG,
Op, 2, MFI->getArgInfo().WorkItemIDZ);
10094 case Intrinsic::amdgcn_wavefrontsize:
10096 SDLoc(
Op), MVT::i32);
10097 case Intrinsic::amdgcn_s_buffer_load: {
10098 unsigned CPol =
Op.getConstantOperandVal(3);
10105 return lowerSBuffer(VT,
DL,
Op.getOperand(1),
Op.getOperand(2),
10106 Op.getOperand(3), DAG);
10108 case Intrinsic::amdgcn_fdiv_fast:
10109 return lowerFDIV_FAST(
Op, DAG);
10110 case Intrinsic::amdgcn_sin:
10111 return DAG.
getNode(AMDGPUISD::SIN_HW,
DL, VT,
Op.getOperand(1));
10113 case Intrinsic::amdgcn_cos:
10114 return DAG.
getNode(AMDGPUISD::COS_HW,
DL, VT,
Op.getOperand(1));
10116 case Intrinsic::amdgcn_mul_u24:
10117 return DAG.
getNode(AMDGPUISD::MUL_U24,
DL, VT,
Op.getOperand(1),
10119 case Intrinsic::amdgcn_mul_i24:
10120 return DAG.
getNode(AMDGPUISD::MUL_I24,
DL, VT,
Op.getOperand(1),
10123 case Intrinsic::amdgcn_log_clamp: {
10129 case Intrinsic::amdgcn_fract:
10130 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
10132 case Intrinsic::amdgcn_class:
10133 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
10135 case Intrinsic::amdgcn_div_fmas:
10136 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
10137 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
10139 case Intrinsic::amdgcn_div_fixup:
10140 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
10141 Op.getOperand(2), Op.getOperand(3));
10143 case Intrinsic::amdgcn_div_scale: {
10149 SDValue Denominator =
Op.getOperand(2);
10156 SDValue Src0 =
Param->isAllOnes() ? Numerator : Denominator;
10158 return DAG.
getNode(AMDGPUISD::DIV_SCALE,
DL,
Op->getVTList(), Src0,
10159 Denominator, Numerator);
10161 case Intrinsic::amdgcn_icmp: {
10163 if (
Op.getOperand(1).getValueType() == MVT::i1 &&
10164 Op.getConstantOperandVal(2) == 0 &&
10169 case Intrinsic::amdgcn_fcmp: {
10172 case Intrinsic::amdgcn_ballot:
10174 case Intrinsic::amdgcn_fmed3:
10175 return DAG.
getNode(AMDGPUISD::FMED3,
DL, VT,
Op.getOperand(1),
10176 Op.getOperand(2),
Op.getOperand(3));
10177 case Intrinsic::amdgcn_fdot2:
10178 return DAG.
getNode(AMDGPUISD::FDOT2,
DL, VT,
Op.getOperand(1),
10179 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
10180 case Intrinsic::amdgcn_fmul_legacy:
10181 return DAG.
getNode(AMDGPUISD::FMUL_LEGACY,
DL, VT,
Op.getOperand(1),
10183 case Intrinsic::amdgcn_sffbh:
10184 return DAG.
getNode(AMDGPUISD::FFBH_I32,
DL, VT,
Op.getOperand(1));
10185 case Intrinsic::amdgcn_sbfe:
10186 return DAG.
getNode(AMDGPUISD::BFE_I32,
DL, VT,
Op.getOperand(1),
10187 Op.getOperand(2),
Op.getOperand(3));
10188 case Intrinsic::amdgcn_ubfe:
10189 return DAG.
getNode(AMDGPUISD::BFE_U32,
DL, VT,
Op.getOperand(1),
10190 Op.getOperand(2),
Op.getOperand(3));
10191 case Intrinsic::amdgcn_cvt_pkrtz:
10192 case Intrinsic::amdgcn_cvt_pknorm_i16:
10193 case Intrinsic::amdgcn_cvt_pknorm_u16:
10194 case Intrinsic::amdgcn_cvt_pk_i16:
10195 case Intrinsic::amdgcn_cvt_pk_u16: {
10197 EVT VT =
Op.getValueType();
10200 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10201 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
10202 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
10203 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
10204 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10205 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
10206 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10207 Opcode = AMDGPUISD::CVT_PK_I16_I32;
10209 Opcode = AMDGPUISD::CVT_PK_U16_U32;
10212 return DAG.
getNode(Opcode,
DL, VT,
Op.getOperand(1),
Op.getOperand(2));
10215 DAG.
getNode(Opcode,
DL, MVT::i32,
Op.getOperand(1),
Op.getOperand(2));
10218 case Intrinsic::amdgcn_fmad_ftz:
10219 return DAG.
getNode(AMDGPUISD::FMAD_FTZ,
DL, VT,
Op.getOperand(1),
10220 Op.getOperand(2),
Op.getOperand(3));
10222 case Intrinsic::amdgcn_if_break:
10224 Op->getOperand(1),
Op->getOperand(2)),
10227 case Intrinsic::amdgcn_groupstaticsize: {
10233 const GlobalValue *GV =
10239 case Intrinsic::amdgcn_is_shared:
10240 case Intrinsic::amdgcn_is_private: {
10247 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10251 Subtarget->hasGloballyAddressableScratch()) {
10254 AMDGPU::S_MOV_B32,
DL, MVT::i32,
10255 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10264 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10267 case Intrinsic::amdgcn_perm:
10268 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
Op.getOperand(1),
10269 Op.getOperand(2),
Op.getOperand(3));
10270 case Intrinsic::amdgcn_reloc_constant: {
10280 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10281 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10282 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10283 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10284 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10285 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10286 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10287 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10288 if (
Op.getOperand(4).getValueType() == MVT::i32)
10294 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
10295 Op.getOperand(3), IndexKeyi32);
10297 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10298 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10299 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10300 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10301 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10302 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10303 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10304 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10305 if (
Op.getOperand(4).getValueType() == MVT::i64)
10311 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10312 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10313 Op.getOperand(6)});
10315 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10316 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10317 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10318 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10319 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10320 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10321 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10324 if (
Op.getOperand(6).getValueType() == IndexKeyTy)
10330 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
10331 Op.getOperand(3),
Op.getOperand(4),
Op.getOperand(5),
10332 IndexKey,
Op.getOperand(7),
Op.getOperand(8)};
10333 if (IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8)
10334 Args.push_back(
Op.getOperand(9));
10337 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10338 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10339 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10340 if (
Op.getOperand(6).getValueType() == MVT::i32)
10346 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10347 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10348 IndexKeyi32, Op.getOperand(7)});
10350 case Intrinsic::amdgcn_addrspacecast_nonnull:
10351 return lowerADDRSPACECAST(
Op, DAG);
10352 case Intrinsic::amdgcn_readlane:
10353 case Intrinsic::amdgcn_readfirstlane:
10354 case Intrinsic::amdgcn_writelane:
10355 case Intrinsic::amdgcn_permlane16:
10356 case Intrinsic::amdgcn_permlanex16:
10357 case Intrinsic::amdgcn_permlane64:
10358 case Intrinsic::amdgcn_set_inactive:
10359 case Intrinsic::amdgcn_set_inactive_chain_arg:
10360 case Intrinsic::amdgcn_mov_dpp8:
10361 case Intrinsic::amdgcn_update_dpp:
10363 case Intrinsic::amdgcn_dead: {
10365 for (
const EVT ValTy :
Op.getNode()->values())
10369 case Intrinsic::amdgcn_wave_shuffle:
10372 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10374 return lowerImage(
Op, ImageDimIntr, DAG,
false);
10385 return DAG.
getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10391 unsigned NewOpcode)
const {
10395 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10396 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10414 M->getMemOperand());
10419 unsigned NewOpcode)
const {
10423 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10424 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
10442 M->getMemOperand());
10447 unsigned IntrID =
Op.getConstantOperandVal(1);
10451 case Intrinsic::amdgcn_ds_ordered_add:
10452 case Intrinsic::amdgcn_ds_ordered_swap: {
10457 unsigned IndexOperand =
M->getConstantOperandVal(7);
10458 unsigned WaveRelease =
M->getConstantOperandVal(8);
10459 unsigned WaveDone =
M->getConstantOperandVal(9);
10461 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10462 IndexOperand &= ~0x3f;
10463 unsigned CountDw = 0;
10466 CountDw = (IndexOperand >> 24) & 0xf;
10467 IndexOperand &= ~(0xf << 24);
10469 if (CountDw < 1 || CountDw > 4) {
10472 Fn,
"ds_ordered_count: dword count must be between 1 and 4",
10473 DL.getDebugLoc()));
10478 if (IndexOperand) {
10481 Fn,
"ds_ordered_count: bad index operand",
DL.getDebugLoc()));
10484 if (WaveDone && !WaveRelease) {
10488 Fn,
"ds_ordered_count: wave_done requires wave_release",
10489 DL.getDebugLoc()));
10492 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10493 unsigned ShaderType =
10495 unsigned Offset0 = OrderedCountIndex << 2;
10496 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10499 Offset1 |= (CountDw - 1) << 6;
10502 Offset1 |= ShaderType << 2;
10504 unsigned Offset = Offset0 | (Offset1 << 8);
10511 M->getVTList(),
Ops,
M->getMemoryVT(),
10512 M->getMemOperand());
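// Illustrative sketch (not part of this file): the immediate packing done
// above for ds_ordered_add/swap. The intrinsic's index, wave_release and
// wave_done operands are folded into the DS instruction's offset0/offset1
// fields and combined into one offset immediate; which extra fields are
// encoded (dword count vs. shader type) depends on the subtarget, so both
// are plain boolean parameters here. Plain C++ following the layout above.
#include <cstdint>

static unsigned dsOrderedOffsetSketch(unsigned orderedCountIndex, bool waveRelease,
                                      bool waveDone, unsigned instruction,
                                      unsigned shaderType, unsigned countDw,
                                      bool encodeCountDw, bool encodeShaderType) {
  unsigned offset0 = orderedCountIndex << 2;
  unsigned offset1 = (waveRelease ? 1u : 0u) | ((waveDone ? 1u : 0u) << 1) |
                     (instruction << 4);
  if (encodeCountDw)                   // newer targets encode the dword count
    offset1 |= (countDw - 1) << 6;
  if (encodeShaderType)                // older targets encode the shader type
    offset1 |= shaderType << 2;
  return offset0 | (offset1 << 8);     // combined DS offset immediate
}
// End of sketch.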
10514 case Intrinsic::amdgcn_raw_buffer_load:
10515 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10516 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10517 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10518 case Intrinsic::amdgcn_raw_buffer_load_format:
10519 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10520 const bool IsFormat =
10521 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10522 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10524 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10525 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
10539 return lowerIntrinsicLoad(M, IsFormat, DAG,
Ops);
10541 case Intrinsic::amdgcn_struct_buffer_load:
10542 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10543 case Intrinsic::amdgcn_struct_buffer_load_format:
10544 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10545 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10546 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10547 const bool IsFormat =
10548 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10549 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10551 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10552 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10567 case Intrinsic::amdgcn_raw_tbuffer_load:
10568 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10570 EVT LoadVT =
Op.getValueType();
10571 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10572 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
10588 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10590 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT,
DL,
10591 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
10594 case Intrinsic::amdgcn_struct_tbuffer_load:
10595 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10597 EVT LoadVT =
Op.getValueType();
10598 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10599 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10615 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10617 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT,
DL,
10618 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
10621 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10622 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10623 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
10624 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10625 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10626 return lowerStructBufferAtomicIntrin(
Op, DAG,
10627 AMDGPUISD::BUFFER_ATOMIC_FADD);
10628 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10629 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10630 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
10631 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10632 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10633 return lowerStructBufferAtomicIntrin(
Op, DAG,
10634 AMDGPUISD::BUFFER_ATOMIC_FMIN);
10635 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10636 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10637 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
10638 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10639 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10640 return lowerStructBufferAtomicIntrin(
Op, DAG,
10641 AMDGPUISD::BUFFER_ATOMIC_FMAX);
10642 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10643 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10644 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
10645 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10646 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10647 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10648 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10649 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10650 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10651 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10652 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10653 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
10654 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10655 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10656 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
10657 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10658 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10659 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
10660 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10661 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10662 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
10663 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10664 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10665 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10666 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10667 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10668 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10669 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10670 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10671 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10672 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10673 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10674 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10675 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10676 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10677 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10678 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10679 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10680 return lowerStructBufferAtomicIntrin(
Op, DAG,
10681 AMDGPUISD::BUFFER_ATOMIC_SWAP);
10682 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10683 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10684 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10685 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10686 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10687 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10688 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10689 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10690 return lowerStructBufferAtomicIntrin(
Op, DAG,
10691 AMDGPUISD::BUFFER_ATOMIC_SMIN);
10692 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10693 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10694 return lowerStructBufferAtomicIntrin(
Op, DAG,
10695 AMDGPUISD::BUFFER_ATOMIC_UMIN);
10696 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10697 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10698 return lowerStructBufferAtomicIntrin(
Op, DAG,
10699 AMDGPUISD::BUFFER_ATOMIC_SMAX);
10700 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10701 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10702 return lowerStructBufferAtomicIntrin(
Op, DAG,
10703 AMDGPUISD::BUFFER_ATOMIC_UMAX);
10704 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10705 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10706 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10707 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10708 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10709 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10710 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10711 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10712 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10713 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10714 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10715 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10716 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10717 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10718 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10719 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
10720 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
10721 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB);
10722 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
10723 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
10724 return lowerStructBufferAtomicIntrin(
Op, DAG,
10725 AMDGPUISD::BUFFER_ATOMIC_CSUB);
10726 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10727 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
10728 return lowerRawBufferAtomicIntrin(
Op, DAG,
10729 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10730 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10731 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
10732 return lowerStructBufferAtomicIntrin(
Op, DAG,
10733 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
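// The cmpswap variants below carry an extra compare operand, so they are
// expanded inline (resource vector, split offsets, memory operand) into a
// buffer atomic cmpswap memory node rather than going through the shared
// lowerRaw/StructBufferAtomicIntrin helpers used above.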
10734 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10735 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10736   SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
10737   auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10751   EVT VT = Op.getValueType();
10755                                  Op->getVTList(), Ops, VT,
10756                                  M->getMemOperand());
10758 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10759 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10760   SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
10761   auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
10775   EVT VT = Op.getValueType();
10779                                  Op->getVTList(), Ops, VT,
10780                                  M->getMemOperand());
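// The BVH dual / BVH8 intersect-ray intrinsics that follow are selected to
// MIMG IMAGE_BVH*_INTERSECT_RAY instructions: the ray operands are pulled off
// the memory intrinsic node and packed into the VAddr operand list built in
// Ops.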
10782 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10783 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10785   SDValue NodePtr = M->getOperand(2);
10786   SDValue RayExtent = M->getOperand(3);
10787   SDValue InstanceMask = M->getOperand(4);
10788   SDValue RayOrigin = M->getOperand(5);
10789   SDValue RayDir = M->getOperand(6);
10791   SDValue TDescr = M->getOperand(8);
10796 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10801 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10802 const unsigned NumVDataDwords = 10;
10803 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10805 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10806 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10807 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10811 Ops.push_back(NodePtr);
10814 {DAG.getBitcast(MVT::i32, RayExtent),
10815 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10816 Ops.push_back(RayOrigin);
10817 Ops.push_back(RayDir);
10818 Ops.push_back(Offsets);
10819 Ops.push_back(TDescr);
10820   Ops.push_back(M->getChain());
10823   MachineMemOperand *MemRef = M->getMemOperand();
10827 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10829   SDValue NodePtr = M->getOperand(2);
10830   SDValue RayExtent = M->getOperand(3);
10831   SDValue RayOrigin = M->getOperand(4);
10832   SDValue RayDir = M->getOperand(5);
10833   SDValue RayInvDir = M->getOperand(6);
10834   SDValue TDescr = M->getOperand(7);
10841 if (!Subtarget->hasGFX10_AEncoding()) {
10851 const unsigned NumVDataDwords = 4;
10852 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10853 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10854 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10857 const unsigned BaseOpcodes[2][2] = {
10858 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10859 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10860 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10864 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10865 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10866 : AMDGPU::MIMGEncGfx10NSA,
10867 NumVDataDwords, NumVAddrDwords);
10871 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10872 : AMDGPU::MIMGEncGfx10Default,
10873 NumVDataDwords, NumVAddrDwords);
10879   auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
10882     if (Lanes[0].getValueSizeInBits() == 32) {
10883       for (unsigned I = 0; I < 3; ++I)
10890 Ops.push_back(Lanes[2]);
10902 if (UseNSA && IsGFX11Plus) {
10903 Ops.push_back(NodePtr);
10905 Ops.push_back(RayOrigin);
10910     for (unsigned I = 0; I < 3; ++I) {
10913 {DirLanes[I], InvDirLanes[I]})));
10917 Ops.push_back(RayDir);
10918 Ops.push_back(RayInvDir);
10925 Ops.push_back(NodePtr);
10928     packLanes(RayOrigin, true);
10929     packLanes(RayDir, true);
10930     packLanes(RayInvDir, false);
10935 if (NumVAddrDwords > 12) {
10937     Ops.append(16 - Ops.size(), Undef);
10943 Ops.push_back(MergedOps);
10946 Ops.push_back(TDescr);
10948   Ops.push_back(M->getChain());
10951   MachineMemOperand *MemRef = M->getMemOperand();
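// The *_atomic_fmin_num/_fmax_num intrinsics below pick a generic atomic
// opcode (presumably ISD::ATOMIC_LOAD_FMIN / ISD::ATOMIC_LOAD_FMAX) and are
// emitted through DAG.getAtomic() with the original memory operand.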
10955 case Intrinsic::amdgcn_global_atomic_fmin_num:
10956 case Intrinsic::amdgcn_global_atomic_fmax_num:
10957 case Intrinsic::amdgcn_flat_atomic_fmin_num:
10958 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10965 unsigned Opcode = 0;
10967 case Intrinsic::amdgcn_global_atomic_fmin_num:
10968 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
10972 case Intrinsic::amdgcn_global_atomic_fmax_num:
10973 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10980   return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
10981                        Ops, M->getMemOperand());
10983 case Intrinsic::amdgcn_s_get_barrier_state:
10984 case Intrinsic::amdgcn_s_get_named_barrier_state: {
10991 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
10992 BarID = (BarID >> 4) & 0x3F;
10993 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
10996 Ops.push_back(Chain);
10998 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
10999 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
11007     Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11015 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
11016 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
11017 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
11021   EVT VT = Op->getValueType(0);
11027   if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11029     return lowerImage(Op, ImageDimIntr, DAG, true);
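// Helper below: getMemIntrinsicNode re-issues the intrinsic with a widened,
// dword-based VT list (one extra dword when a TFE status result is present,
// v4 instead of v3 when dwordx3 loads are unavailable) and then trims the
// result back to the requested type.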
11037 SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
11044   EVT VT = VTList.VTs[0];
11047   bool IsTFE = VTList.NumVTs == 3;
11050   unsigned NumOpDWords = NumValueDWords + 1;
11052   SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
11053   MachineMemOperand *OpDWordsMMO =
11055   SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
11056                                    OpDWordsVT, OpDWordsMMO, DAG);
11061   NumValueDWords == 1
11070   if (!Subtarget->hasDwordx3LoadStores() &&
11071       (VT == MVT::v3i32 || VT == MVT::v3f32)) {
11075     SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
11077                                WidenedMemVT, WidenedMMO);
11087                                          bool ImageStore) const {
11097   if (Subtarget->hasUnpackedD16VMem()) {
11111   if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
11122     for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
11128     if ((NumElements % 2) == 1) {
11130       unsigned I = Elts.size() / 2;
11146   if (NumElements == 3) {
11167   unsigned IntrinsicID = Op.getConstantOperandVal(1);
11170   switch (IntrinsicID) {
11171   case Intrinsic::amdgcn_exp_compr: {
11172     if (!Subtarget->hasCompressedExport()) {
11175           "intrinsic not supported on subtarget", DL.getDebugLoc()));
11197     unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
11201   case Intrinsic::amdgcn_struct_tbuffer_store:
11202   case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
11204     bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11206       VData = handleD16VData(VData, DAG);
11207     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11208     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11222     unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11223                          : AMDGPUISD::TBUFFER_STORE_FORMAT;
11226                                    M->getMemoryVT(), M->getMemOperand());
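// The raw tbuffer store below mirrors the struct form above; the differences
// are the absence of a vindex operand and the operand index handed to
// splitBufferOffsets.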
11229   case Intrinsic::amdgcn_raw_tbuffer_store:
11230   case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11232     bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11234       VData = handleD16VData(VData, DAG);
11235     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11236     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11250     unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11251                          : AMDGPUISD::TBUFFER_STORE_FORMAT;
11254                                    M->getMemoryVT(), M->getMemOperand());
11257 case Intrinsic::amdgcn_raw_buffer_store:
11258 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11259 case Intrinsic::amdgcn_raw_buffer_store_format:
11260 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11261 const bool IsFormat =
11262 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11263 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11270 VData = handleD16VData(VData, DAG);
11280     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11281     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11295         IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
11296     Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11301       return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
11304                                    M->getMemoryVT(), M->getMemOperand());
11307 case Intrinsic::amdgcn_struct_buffer_store:
11308 case Intrinsic::amdgcn_struct_ptr_buffer_store:
11309 case Intrinsic::amdgcn_struct_buffer_store_format:
11310 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11311 const bool IsFormat =
11312 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
11313 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11321 VData = handleD16VData(VData, DAG);
11331     auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11332     auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11346         !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
11347     Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11351     EVT VDataType = VData.getValueType().getScalarType();
11353       return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
11356                                    M->getMemoryVT(), M->getMemOperand());
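// Both buffer-store paths above fall back to handleByteShortBufferStores for
// i8 and i16 payloads; everything else becomes a BUFFER_STORE or
// BUFFER_STORE_FORMAT node (D16 variant when the data was packed by
// handleD16VData).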
11358 case Intrinsic::amdgcn_raw_buffer_load_lds:
11359 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11360 case Intrinsic::amdgcn_struct_buffer_load_lds:
11361 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
11362 if (!Subtarget->hasVMemToLDSLoad())
11366 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
11367 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
11368     unsigned OpOffset = HasVIndex ? 1 : 0;
11369     SDValue VOffset = Op.getOperand(5 + OpOffset);
11371     unsigned Size = Op->getConstantOperandVal(4);
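// The *_buffer_load_lds lowering picks a MUBUF ...LDS pseudo from the LDS
// transfer size (1, 2, 4 and, where supported, 12 or 16 bytes) and from the
// BOTHEN/IDXEN/OFFEN/OFFSET addressing variant implied by HasVIndex and
// HasVOffset.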
11377 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11378 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11379 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11380 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11383 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11384 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11385 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11386 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11389 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11390 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11391 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11392 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11395 if (!Subtarget->hasLDSLoadB96_B128())
11397 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11398 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11399 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11400 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11403 if (!Subtarget->hasLDSLoadB96_B128())
11405 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11406 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11407 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11408 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
11416     if (HasVIndex && HasVOffset)
11420     else if (HasVIndex)
11421       Ops.push_back(Op.getOperand(5));
11422     else if (HasVOffset)
11423       Ops.push_back(VOffset);
11425     SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11426     Ops.push_back(Rsrc);
11427     Ops.push_back(Op.getOperand(6 + OpOffset));
11428     Ops.push_back(Op.getOperand(7 + OpOffset));
11430     unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
11443     MachineMemOperand *LoadMMO = M->getMemOperand();
11448 MachinePointerInfo StorePtrI = LoadPtrI;
11472 case Intrinsic::amdgcn_load_to_lds:
11473 case Intrinsic::amdgcn_global_load_lds: {
11474 if (!Subtarget->hasVMemToLDSLoad())
11478     unsigned Size = Op->getConstantOperandVal(4);
11483 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11486 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11489 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11492 if (!Subtarget->hasLDSLoadB96_B128())
11494 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11497 if (!Subtarget->hasLDSLoadB96_B128())
11499 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
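// For global/flat LDS loads the address is, where possible, split into a
// uniform base plus a divergent 32-bit VOffset taken from the add's divergent
// operand; otherwise the full VGPR address is pushed as a single operand.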
11515     if (LHS->isDivergent())
11519           RHS.getOperand(0).getValueType() == MVT::i32) {
11522         VOffset = RHS.getOperand(0);
11526       Ops.push_back(Addr);
11534       Ops.push_back(VOffset);
11537     Ops.push_back(Op.getOperand(5));
11539     unsigned Aux = Op.getConstantOperandVal(6);
11547     MachineMemOperand *LoadMMO = M->getMemOperand();
11549     LoadPtrI.Offset = Op->getConstantOperandVal(5);
11550 MachinePointerInfo StorePtrI = LoadPtrI;
11569 case Intrinsic::amdgcn_end_cf:
11571 Op->getOperand(2), Chain),
11573 case Intrinsic::amdgcn_s_barrier_init:
11574 case Intrinsic::amdgcn_s_barrier_signal_var: {
11581 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11582 ? AMDGPU::S_BARRIER_INIT_M0
11583 : AMDGPU::S_BARRIER_SIGNAL_M0;
11598 constexpr unsigned ShAmt = 16;
11605     Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11610 case Intrinsic::amdgcn_s_wakeup_barrier: {
11611 if (!Subtarget->hasSWakeupBarrier())
11615 case Intrinsic::amdgcn_s_barrier_join: {
11624 switch (IntrinsicID) {
11627 case Intrinsic::amdgcn_s_barrier_join:
11628 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11630 case Intrinsic::amdgcn_s_wakeup_barrier:
11631 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
11635 unsigned BarID = (BarVal >> 4) & 0x3F;
11638 Ops.push_back(Chain);
11640 switch (IntrinsicID) {
11643 case Intrinsic::amdgcn_s_barrier_join:
11644 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11646 case Intrinsic::amdgcn_s_wakeup_barrier:
11647 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
11658     Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11664   case Intrinsic::amdgcn_s_prefetch_data: {
11667       return Op.getOperand(0);
11670   case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11672         Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
11679                                    Op->getVTList(), Ops, M->getMemoryVT(),
11680                                    M->getMemOperand());
11682 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11683 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11684 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11693     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11695       return lowerImage(Op, ImageDimIntr, DAG, true);
11711 return PtrVT == MVT::i64;
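// splitBufferOffsets (below) divides a combined byte offset into the part
// that can live in the MUBUF immediate field and the remainder; any overflow
// of the immediate is folded back into the returned register offset.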
11725std::pair<SDValue, SDValue>
11755 unsigned Overflow = ImmOffset & ~MaxImm;
11756 ImmOffset -= Overflow;
11757 if ((int32_t)Overflow < 0) {
11758 Overflow += ImmOffset;
11763   auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
11782 void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
11784                                         Align Alignment) const {
11786   SDLoc DL(CombinedOffset);
11788     uint32_t Imm = C->getZExtValue();
11789     uint32_t SOffset, ImmOffset;
11790     if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
11800     uint32_t SOffset, ImmOffset;
11803         TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
11811 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
11820 SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
11823     return MaybePointer;
11837   SDValue NumRecords = Op->getOperand(3);
11843   if (Subtarget->has45BitNumRecordsBufferResource()) {
11862     SDValue ExtShiftedStrideVec =
11874         DAG.getNode(ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
11876         DAG.getNode(ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
11881     auto [LowHalf, HighHalf] =
11882         DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11892 NumRecords, Flags);
11904                                          bool IsTFE) const {
11909                       ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
11910                       : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
11913     SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
11925                       ? AMDGPUISD::BUFFER_LOAD_UBYTE
11926                       : AMDGPUISD::BUFFER_LOAD_USHORT;
11928   SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
11942 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11946 Ops[1] = BufferStoreExt;
11947 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
11948 : AMDGPUISD::BUFFER_STORE_SHORT;
11951 M->getMemOperand());
11976                                           DAGCombinerInfo &DCI) const {
11977   SelectionDAG &DAG = DCI.DAG;
11992   if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
11999          "unexpected vector extload");
12012          "unexpected fp extload");
12030     DCI.AddToWorklist(Cvt.getNode());
12035       DCI.AddToWorklist(Cvt.getNode());
12046   if (Info.isEntryFunction())
12047     return Info.getUserSGPRInfo().hasFlatScratchInit();
12055   EVT MemVT = Load->getMemoryVT();
12056   MachineMemOperand *MMO = Load->getMemOperand();
12068     EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
12096   assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
12097          "Custom lowering for non-i32 vectors hasn't been implemented.");
12100   unsigned AS = Load->getAddressSpace();
12107   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12111       !Subtarget->hasMultiDwordFlatScratchAddressing())
12121       Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
12124       Alignment >= Align(4) && NumElements < 32) {
12126 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
12138 if (NumElements > 4)
12141 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12151 switch (Subtarget->getMaxPrivateElementSize()) {
12157 if (NumElements > 2)
12162 if (NumElements > 4)
12165 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12174   auto Flags = Load->getMemOperand()->getFlags();
12176                                      Load->getAlign(), Flags, &Fast) &&
12185                                       MemVT, *Load->getMemOperand())) {
12194   EVT VT = Op.getValueType();
12231   EVT VT = Op.getValueType();
12232   const SDNodeFlags Flags = Op->getFlags();
12234   bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
12240   if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
12243     if (CLHS->isExactlyValue(1.0)) {
12256       return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12260     if (CLHS->isExactlyValue(-1.0)) {
12263       return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
12269   if (!AllowInaccurateRcp &&
12270       ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
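// Fast path above: with the approximate-function/reciprocal flags set, 1.0/x
// and -1.0/x fold directly to rcp(x) and rcp(-x); otherwise the generic
// division expansion continues below.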
12284 EVT VT =
Op.getValueType();
12285 const SDNodeFlags
Flags =
Op->getFlags();
12287 bool AllowInaccurateDiv =
Flags.hasApproximateFuncs();
12288 if (!AllowInaccurateDiv)
12309 return DAG.
getNode(Opcode, SL, VT,
A,
B, Flags);
12319 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12323 return DAG.
getNode(Opcode, SL, VTList,
12332 return DAG.
getNode(Opcode, SL, VT, {
A,
B,
C}, Flags);
12342 Opcode = AMDGPUISD::FMA_W_CHAIN;
12346 return DAG.
getNode(Opcode, SL, VTList,
12352 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
12353 return FastLowered;
12356 EVT VT =
Op.getValueType();
12363 if (VT == MVT::bf16) {
12386 unsigned FMADOpCode =
12390 DAG.
getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt,
Op->getFlags());
12393 SDValue Err = DAG.
getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12395 Quot = DAG.
getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot,
Op->getFlags());
12396 Err = DAG.
getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12406 return DAG.
getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst,
RHS,
LHS,
12412 SDNodeFlags
Flags =
Op->getFlags();
12422 const APFloat K0Val(0x1p+96f);
12425 const APFloat K1Val(0x1p-32f);
12452 assert(ST->hasDenormModeInst() &&
"Requires S_DENORM_MODE");
12453 uint32_t DPDenormModeDefault =
Info->getMode().fpDenormModeDPValue();
12454 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
12459 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
12460 return FastLowered;
12466 SDNodeFlags
Flags =
Op->getFlags();
12467 Flags.setNoFPExcept(
true);
12475 SDVTList ScaleVT = DAG.
getVTList(MVT::f32, MVT::i1);
12484 DAG.
getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
12488 using namespace AMDGPU::Hwreg;
12489 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
12493 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
12494 const DenormalMode DenormMode =
Info->getMode().FP32Denormals;
12497 const bool HasDynamicDenormals =
12503 if (!PreservesDenormals) {
12508 SDVTList BindParamVTs = DAG.
getVTList(MVT::Other, MVT::Glue);
12511 if (HasDynamicDenormals) {
12515 SavedDenormMode =
SDValue(GetReg, 0);
12521 SDNode *EnableDenorm;
12522 if (Subtarget->hasDenormModeInst()) {
12523 const SDValue EnableDenormValue =
12526 EnableDenorm = DAG.
getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
12530 const SDValue EnableDenormValue =
12532 EnableDenorm = DAG.
getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12533 {EnableDenormValue,
BitField, Glue});
12543 ApproxRcp, One, NegDivScale0, Flags);
12546 ApproxRcp, Fma0, Flags);
12552 NumeratorScaled,
Mul, Flags);
12558 NumeratorScaled, Fma3, Flags);
12560 if (!PreservesDenormals) {
12561 SDNode *DisableDenorm;
12562 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12566 SDVTList BindParamVTs = DAG.
getVTList(MVT::Other, MVT::Glue);
12568 DAG.
getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
12572 assert(HasDynamicDenormals == (
bool)SavedDenormMode);
12573 const SDValue DisableDenormValue =
12574 HasDynamicDenormals
12579 AMDGPU::S_SETREG_B32, SL, MVT::Other,
12590 {Fma4, Fma1, Fma3, Scale},
Flags);
12592   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
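// The f32 division above follows the usual AMDGPU sequence: DIV_SCALE the
// operands, approximate with RCP, refine with fused multiply-adds, combine
// via DIV_FMAS, and finish with DIV_FIXUP, toggling the FP32 denormal mode
// around the refinement when the current mode would flush denormals.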
12596 if (
SDValue FastLowered = lowerFastUnsafeFDIV64(
Op, DAG))
12597 return FastLowered;
12605 SDVTList ScaleVT = DAG.
getVTList(MVT::f64, MVT::i1);
12611 SDValue Rcp = DAG.
getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
12629 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12659 DAG.
getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3,
Mul, Scale);
12661 return DAG.
getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas,
Y,
X);
12665 EVT VT =
Op.getValueType();
12667 if (VT == MVT::f32)
12668 return LowerFDIV32(
Op, DAG);
12670 if (VT == MVT::f64)
12671 return LowerFDIV64(
Op, DAG);
12673 if (VT == MVT::f16 || VT == MVT::bf16)
12674 return LowerFDIV16(
Op, DAG);
12683 EVT ResultExpVT =
Op->getValueType(1);
12684 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12694 if (Subtarget->hasFractBug()) {
12712 EVT VT =
Store->getMemoryVT();
12714 if (VT == MVT::i1) {
12718 Store->getBasePtr(), MVT::i1,
Store->getMemOperand());
12722 Store->getValue().getValueType().getScalarType() == MVT::i32);
12724 unsigned AS =
Store->getAddressSpace();
12732 SIMachineFunctionInfo *MFI = MF.
getInfo<SIMachineFunctionInfo>();
12736 !Subtarget->hasMultiDwordFlatScratchAddressing())
12743 if (NumElements > 4)
12746 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12750 VT, *
Store->getMemOperand()))
12756 switch (Subtarget->getMaxPrivateElementSize()) {
12760 if (NumElements > 2)
12764 if (NumElements > 4 ||
12765 (NumElements == 3 && !Subtarget->enableFlatScratch()))
12773 auto Flags =
Store->getMemOperand()->getFlags();
12792 assert(!Subtarget->has16BitInsts());
12793 SDNodeFlags
Flags =
Op->getFlags();
12807 SDNodeFlags
Flags =
Op->getFlags();
12808 MVT VT =
Op.getValueType().getSimpleVT();
12916 SDNodeFlags
Flags =
Op->getFlags();
12979 EVT VT =
Op.getValueType();
12989 if (Subtarget->hasTrigReducedRange()) {
12991 TrigVal = DAG.
getNode(AMDGPUISD::FRACT,
DL, VT, MulVal, Flags);
12996 switch (
Op.getOpcode()) {
12998 return DAG.
getNode(AMDGPUISD::COS_HW, SDLoc(
Op), VT, TrigVal, Flags);
13000 return DAG.
getNode(AMDGPUISD::SIN_HW, SDLoc(
Op), VT, TrigVal, Flags);
13023 EVT VT =
Op.getValueType();
13031 Op->getVTList(),
Ops, VT,
13040SITargetLowering::performUCharToFloatCombine(
SDNode *
N,
13041 DAGCombinerInfo &DCI)
const {
13042 EVT VT =
N->getValueType(0);
13044 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
13047 SelectionDAG &DAG = DCI.DAG;
13051 EVT SrcVT = Src.getValueType();
13057 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
13060 DCI.AddToWorklist(Cvt.
getNode());
13063 if (ScalarVT != MVT::f32) {
13075 DAGCombinerInfo &DCI)
const {
13086 SelectionDAG &DAG = DCI.DAG;
13105 for (
unsigned I = 0;
I != NumElts; ++
I) {
13129 if (NewElts.
size() == 1)
13151 for (
unsigned I = 0;
I != NumElts; ++
I) {
13186SDValue SITargetLowering::performSHLPtrCombine(
SDNode *
N,
unsigned AddrSpace,
13188 DAGCombinerInfo &DCI)
const {
13205 SelectionDAG &DAG = DCI.DAG;
13218 AM.BaseOffs =
Offset.getSExtValue();
13223 EVT VT =
N->getValueType(0);
13229 Flags.setNoUnsignedWrap(
13230 N->getFlags().hasNoUnsignedWrap() &&
13242 switch (
N->getOpcode()) {
13253 DAGCombinerInfo &DCI)
const {
13254 SelectionDAG &DAG = DCI.DAG;
13261 SDValue NewPtr = performSHLPtrCombine(Ptr.
getNode(),
N->getAddressSpace(),
13262 N->getMemoryVT(), DCI);
13266 NewOps[PtrIdx] = NewPtr;
13275 return (
Opc ==
ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13276 (
Opc ==
ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13285SDValue SITargetLowering::splitBinaryBitConstantOp(
13289 uint32_t ValLo =
Lo_32(Val);
13290 uint32_t ValHi =
Hi_32(Val);
13297 if (Subtarget->has64BitLiterals() && CRHS->
hasOneUse() &&
13311 if (V.getValueType() != MVT::i1)
13313 switch (V.getOpcode()) {
13318 case AMDGPUISD::FP_CLASS:
13330 return V.getResNo() == 1;
13332 unsigned IntrinsicID = V.getConstantOperandVal(0);
13333 switch (IntrinsicID) {
13334 case Intrinsic::amdgcn_is_shared:
13335 case Intrinsic::amdgcn_is_private:
13352 if (!(
C & 0x000000ff))
13353 ZeroByteMask |= 0x000000ff;
13354 if (!(
C & 0x0000ff00))
13355 ZeroByteMask |= 0x0000ff00;
13356 if (!(
C & 0x00ff0000))
13357 ZeroByteMask |= 0x00ff0000;
13358 if (!(
C & 0xff000000))
13359 ZeroByteMask |= 0xff000000;
13360 uint32_t NonZeroByteMask = ~ZeroByteMask;
13361 if ((NonZeroByteMask &
C) != NonZeroByteMask)
13374 assert(V.getValueSizeInBits() == 32);
13376 if (V.getNumOperands() != 2)
13385 switch (V.getOpcode()) {
13390 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13395 return (0x03020100 & ~ConstMask) | ConstMask;
13402 return uint32_t((0x030201000c0c0c0cull <<
C) >> 32);
13408     return uint32_t(0x0c0c0c0c03020100ull >> C);
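// In a V_PERM_B32 selector each byte value 0-7 picks a source byte and 0x0c
// yields zero, which is why the mask helpers above encode "unused" bytes as
// 0x0c and the identity permutation as 0x03020100.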
13415 DAGCombinerInfo &DCI)
const {
13416 if (DCI.isBeforeLegalize())
13419 SelectionDAG &DAG = DCI.DAG;
13420 EVT VT =
N->getValueType(0);
13425 if (VT == MVT::i64 && CRHS) {
13427 splitBinaryBitConstantOp(DCI, SDLoc(
N),
ISD::AND,
LHS, CRHS))
13431 if (CRHS && VT == MVT::i32) {
13441 unsigned Shift = CShift->getZExtValue();
13443 unsigned Offset = NB + Shift;
13444 if ((
Offset & (Bits - 1)) == 0) {
13447 DAG.
getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
LHS->getOperand(0),
13468 Sel = (
LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
13470 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
13483 if (
Y.getOpcode() !=
ISD::FABS ||
Y.getOperand(0) !=
X ||
13488 if (
X !=
LHS.getOperand(1))
13492 const ConstantFPSDNode *C1 =
13509 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, MVT::i1,
X,
13515 if (
RHS.getOpcode() ==
ISD::SETCC &&
LHS.getOpcode() == AMDGPUISD::FP_CLASS)
13518 if (
LHS.getOpcode() ==
ISD::SETCC &&
RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13526 (
RHS.getOperand(0) ==
LHS.getOperand(0) &&
13527 LHS.getOperand(0) ==
LHS.getOperand(1))) {
13529 unsigned NewMask = LCC ==
ISD::SETO ?
Mask->getZExtValue() & ~OrdMask
13530 :
Mask->getZExtValue() & OrdMask;
13533 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, MVT::i1,
RHS.getOperand(0),
13551 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13554 if (LHSMask != ~0u && RHSMask != ~0u) {
13557 if (LHSMask > RHSMask) {
13564 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13565 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13568 if (!(LHSUsedLanes & RHSUsedLanes) &&
13571 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13577 uint32_t
Mask = LHSMask & RHSMask;
13578 for (
unsigned I = 0;
I < 32;
I += 8) {
13579 uint32_t ByteSel = 0xff <<
I;
13580 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13581 Mask &= (0x0c <<
I) & 0xffffffff;
13586 uint32_t Sel =
Mask | (LHSUsedLanes & 0x04040404);
13589 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
13639static const std::optional<ByteProvider<SDValue>>
13641 unsigned Depth = 0) {
13644 return std::nullopt;
13646 if (
Op.getValueSizeInBits() < 8)
13647 return std::nullopt;
13649 if (
Op.getValueType().isVector())
13652 switch (
Op->getOpcode()) {
13664 NarrowVT = VTSign->getVT();
13667 return std::nullopt;
13670 if (SrcIndex >= NarrowByteWidth)
13671 return std::nullopt;
13679 return std::nullopt;
13681 uint64_t BitShift = ShiftOp->getZExtValue();
13683 if (BitShift % 8 != 0)
13684 return std::nullopt;
13686 SrcIndex += BitShift / 8;
13704static const std::optional<ByteProvider<SDValue>>
13706 unsigned StartingIndex = 0) {
13710 return std::nullopt;
13712 unsigned BitWidth =
Op.getScalarValueSizeInBits();
13714 return std::nullopt;
13716 return std::nullopt;
13718 bool IsVec =
Op.getValueType().isVector();
13719 switch (
Op.getOpcode()) {
13722 return std::nullopt;
13727 return std::nullopt;
13731 return std::nullopt;
13734 if (!
LHS->isConstantZero() && !
RHS->isConstantZero())
13735 return std::nullopt;
13736 if (!
LHS ||
LHS->isConstantZero())
13738 if (!
RHS ||
RHS->isConstantZero())
13740 return std::nullopt;
13745 return std::nullopt;
13749 return std::nullopt;
13751 uint32_t BitMask = BitMaskOp->getZExtValue();
13753 uint32_t IndexMask = 0xFF << (Index * 8);
13755 if ((IndexMask & BitMask) != IndexMask) {
13758 if (IndexMask & BitMask)
13759 return std::nullopt;
13768 return std::nullopt;
13772 if (!ShiftOp ||
Op.getValueType().isVector())
13773 return std::nullopt;
13775 uint64_t BitsProvided =
Op.getValueSizeInBits();
13776 if (BitsProvided % 8 != 0)
13777 return std::nullopt;
13779 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13781 return std::nullopt;
13783 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13784 uint64_t ByteShift = BitShift / 8;
13786 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13787 uint64_t BytesProvided = BitsProvided / 8;
13788 SDValue NextOp =
Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13789 NewIndex %= BytesProvided;
13796 return std::nullopt;
13800 return std::nullopt;
13802 uint64_t BitShift = ShiftOp->getZExtValue();
13804 return std::nullopt;
13806 auto BitsProvided =
Op.getScalarValueSizeInBits();
13807 if (BitsProvided % 8 != 0)
13808 return std::nullopt;
13810 uint64_t BytesProvided = BitsProvided / 8;
13811 uint64_t ByteShift = BitShift / 8;
13816 return BytesProvided - ByteShift > Index
13824 return std::nullopt;
13828 return std::nullopt;
13830 uint64_t BitShift = ShiftOp->getZExtValue();
13831 if (BitShift % 8 != 0)
13832 return std::nullopt;
13833 uint64_t ByteShift = BitShift / 8;
13839 return Index < ByteShift
13842 Depth + 1, StartingIndex);
13851 return std::nullopt;
13859 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13861 if (NarrowBitWidth % 8 != 0)
13862 return std::nullopt;
13863 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13865 if (Index >= NarrowByteWidth)
13867 ? std::optional<ByteProvider<SDValue>>(
13875 return std::nullopt;
13879 if (NarrowByteWidth >= Index) {
13884 return std::nullopt;
13891 return std::nullopt;
13897 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13898 if (NarrowBitWidth % 8 != 0)
13899 return std::nullopt;
13900 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13905 if (Index >= NarrowByteWidth) {
13907 ? std::optional<ByteProvider<SDValue>>(
13912 if (NarrowByteWidth > Index) {
13916 return std::nullopt;
13921 return std::nullopt;
13924 Depth + 1, StartingIndex);
13930 return std::nullopt;
13931 auto VecIdx = IdxOp->getZExtValue();
13932 auto ScalarSize =
Op.getScalarValueSizeInBits();
13933 if (ScalarSize < 32)
13934 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13936 StartingIndex, Index);
13939 case AMDGPUISD::PERM: {
13941 return std::nullopt;
13945 return std::nullopt;
13948 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13949 if (IdxMask > 0x07 && IdxMask != 0x0c)
13950 return std::nullopt;
13952 auto NextOp =
Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13953 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13955 return IdxMask != 0x0c ?
calculateSrcByte(NextOp, StartingIndex, NextIndex)
13961 return std::nullopt;
13976 return !OpVT.
isVector() && OpVT.getSizeInBits() == 16;
13983 auto MemVT = L->getMemoryVT();
13986 return L->getMemoryVT().getSizeInBits() == 16;
13996 int Low8 = Mask & 0xff;
13997 int Hi8 = (Mask & 0xff00) >> 8;
13999 assert(Low8 < 8 && Hi8 < 8);
14001 bool IsConsecutive = (Hi8 - Low8 == 1);
14006 bool Is16Aligned = !(Low8 % 2);
14008 return IsConsecutive && Is16Aligned;
14016 int Low16 = PermMask & 0xffff;
14017 int Hi16 = (PermMask & 0xffff0000) >> 16;
14027 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
14029 if (!OtherOpIs16Bit)
14037 unsigned DWordOffset) {
14042 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
14047 if (Src.getValueType().isVector()) {
14048 auto ScalarTySize = Src.getScalarValueSizeInBits();
14049 auto ScalarTy = Src.getValueType().getScalarType();
14050 if (ScalarTySize == 32) {
14054 if (ScalarTySize > 32) {
14057 DAG.
getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
14058 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
14065 assert(ScalarTySize < 32);
14066 auto NumElements =
TypeSize / ScalarTySize;
14067 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
14068 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
14069 auto NumElementsIn32 = 32 / ScalarTySize;
14070 auto NumAvailElements = DWordOffset < Trunc32Elements
14072 : NumElements - NormalizedTrunc;
14085 auto ShiftVal = 32 * DWordOffset;
14093 [[maybe_unused]]
EVT VT =
N->getValueType(0);
14098 for (
int i = 0; i < 4; i++) {
14100 std::optional<ByteProvider<SDValue>>
P =
14103 if (!
P ||
P->isConstantZero())
14108 if (PermNodes.
size() != 4)
14111 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
14112 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
14114 for (
size_t i = 0; i < PermNodes.
size(); i++) {
14115 auto PermOp = PermNodes[i];
14118 int SrcByteAdjust = 4;
14122 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
14123 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
14125 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
14126 ((PermOp.SrcOffset / 4) != SecondSrc->second))
14130 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
14131 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
14134 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
14136 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
14139 SDValue Op = *PermNodes[FirstSrc.first].Src;
14141 assert(
Op.getValueSizeInBits() == 32);
14145 int Low16 = PermMask & 0xffff;
14146 int Hi16 = (PermMask & 0xffff0000) >> 16;
14148 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
14149 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
14152 if (WellFormedLow && WellFormedHi)
14156 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src :
Op;
14165 (
N->getOperand(0) ==
Op ||
N->getOperand(0) == OtherOp) &&
14166 (
N->getOperand(1) ==
Op ||
N->getOperand(1) == OtherOp))
14171 assert(
Op.getValueType().isByteSized() &&
14182 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
Op, OtherOp,
14189 DAGCombinerInfo &DCI)
const {
14190 SelectionDAG &DAG = DCI.DAG;
14194 EVT VT =
N->getValueType(0);
14195 if (VT == MVT::i1) {
14197 if (
LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
14198 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
14200 if (Src !=
RHS.getOperand(0))
14205 if (!CLHS || !CRHS)
14209 static const uint32_t MaxMask = 0x3ff;
14214 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, MVT::i1, Src,
14223 LHS.getOpcode() == AMDGPUISD::PERM &&
14229 Sel |=
LHS.getConstantOperandVal(2);
14231 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
14238 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14242 auto usesCombinedOperand = [](SDNode *OrUse) {
14245 !OrUse->getValueType(0).isVector())
14249 for (
auto *VUser : OrUse->users()) {
14250 if (!VUser->getValueType(0).isVector())
14257 if (VUser->getOpcode() == VectorwiseOp)
14263 if (!
any_of(
N->users(), usesCombinedOperand))
14269 if (LHSMask != ~0u && RHSMask != ~0u) {
14272 if (LHSMask > RHSMask) {
14279 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14280 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14283 if (!(LHSUsedLanes & RHSUsedLanes) &&
14286 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14288 LHSMask &= ~RHSUsedLanes;
14289 RHSMask &= ~LHSUsedLanes;
14291 LHSMask |= LHSUsedLanes & 0x04040404;
14293 uint32_t Sel = LHSMask | RHSMask;
14296 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
14301 if (LHSMask == ~0u || RHSMask == ~0u) {
14342 return IdentitySrc;
14348 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14363 if (SrcVT == MVT::i32) {
14368 DCI.AddToWorklist(LowOr.
getNode());
14369 DCI.AddToWorklist(HiBits.getNode());
14380 N->getOperand(0), CRHS))
14388 DAGCombinerInfo &DCI)
const {
14389 if (
SDValue RV = reassociateScalarOps(
N, DCI.DAG))
14396 SelectionDAG &DAG = DCI.DAG;
14398 EVT VT =
N->getValueType(0);
14399 if (CRHS && VT == MVT::i64) {
14401 splitBinaryBitConstantOp(DCI, SDLoc(
N),
ISD::XOR,
LHS, CRHS))
14408 unsigned Opc =
LHS.getOpcode();
14438 LHS->getOperand(0), FNegLHS, FNegRHS);
14447 DAGCombinerInfo &DCI)
const {
14448 if (!Subtarget->has16BitInsts() ||
14452 EVT VT =
N->getValueType(0);
14453 if (VT != MVT::i32)
14457 if (Src.getValueType() != MVT::i16)
14464SITargetLowering::performSignExtendInRegCombine(
SDNode *
N,
14465 DAGCombinerInfo &DCI)
const {
14471 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
14472 VTSign->getVT() == MVT::i8) ||
14473 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
14474 VTSign->getVT() == MVT::i16))) {
14475 assert(Subtarget->hasScalarSubwordLoads() &&
14476 "s_buffer_load_{u8, i8} are supported "
14477 "in GFX12 (or newer) architectures.");
14478 EVT VT = Src.getValueType();
14479 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
14480 ? AMDGPUISD::SBUFFER_LOAD_BYTE
14481 : AMDGPUISD::SBUFFER_LOAD_SHORT;
14483 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
14490 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
14491 Opc,
DL, ResList,
Ops,
M->getMemoryVT(),
M->getMemOperand());
14495 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
14496 VTSign->getVT() == MVT::i8) ||
14497 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
14498 VTSign->getVT() == MVT::i16)) &&
14507 Src.getOperand(6), Src.getOperand(7)};
14510 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
14511 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
14512 ? AMDGPUISD::BUFFER_LOAD_BYTE
14513 : AMDGPUISD::BUFFER_LOAD_SHORT;
14514 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
14515 Opc, SDLoc(
N), ResList,
Ops,
M->getMemoryVT(),
M->getMemOperand());
14516 return DCI.DAG.getMergeValues(
14517 {BufferLoadSignExt, BufferLoadSignExt.
getValue(1)}, SDLoc(
N));
14523 DAGCombinerInfo &DCI)
const {
14524 SelectionDAG &DAG = DCI.DAG;
14531 if (
N->getOperand(0).isUndef())
14538 DAGCombinerInfo &DCI)
const {
14539 EVT VT =
N->getValueType(0);
14549 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(
N), VT, N0,
14556 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(
N), VT, N0.
getOperand(0),
14564 unsigned MaxDepth)
const {
14565 unsigned Opcode =
Op.getOpcode();
14570 const auto &
F = CFP->getValueAPF();
14571 if (
F.isNaN() &&
F.isSignaling())
14573 if (!
F.isDenormal())
14605 case AMDGPUISD::FMUL_LEGACY:
14606 case AMDGPUISD::FMAD_FTZ:
14607 case AMDGPUISD::RCP:
14608 case AMDGPUISD::RSQ:
14609 case AMDGPUISD::RSQ_CLAMP:
14610 case AMDGPUISD::RCP_LEGACY:
14611 case AMDGPUISD::RCP_IFLAG:
14612 case AMDGPUISD::LOG:
14613 case AMDGPUISD::EXP:
14614 case AMDGPUISD::DIV_SCALE:
14615 case AMDGPUISD::DIV_FMAS:
14616 case AMDGPUISD::DIV_FIXUP:
14617 case AMDGPUISD::FRACT:
14618 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14619 case AMDGPUISD::CVT_F32_UBYTE0:
14620 case AMDGPUISD::CVT_F32_UBYTE1:
14621 case AMDGPUISD::CVT_F32_UBYTE2:
14622 case AMDGPUISD::CVT_F32_UBYTE3:
14623 case AMDGPUISD::FP_TO_FP16:
14624 case AMDGPUISD::SIN_HW:
14625 case AMDGPUISD::COS_HW:
14636 if (
Op.getValueType() == MVT::i32) {
14642 if (RHS->getZExtValue() == 0xffff0000) {
14652 return Op.getValueType().getScalarType() != MVT::f16;
14662 case AMDGPUISD::CLAMP:
14663 case AMDGPUISD::FMED3:
14664 case AMDGPUISD::FMAX3:
14665 case AMDGPUISD::FMIN3:
14666 case AMDGPUISD::FMAXIMUM3:
14667 case AMDGPUISD::FMINIMUM3: {
14673 if (Subtarget->supportsMinMaxDenormModes() ||
14683 for (
unsigned I = 0, E =
Op.getNumOperands();
I != E; ++
I) {
14695 for (
unsigned i = 0, e =
Op.getNumOperands(); i != e; ++i) {
14722 if (
Op.getValueType() == MVT::i16) {
14733 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
14735 switch (IntrinsicID) {
14736 case Intrinsic::amdgcn_cvt_pkrtz:
14737 case Intrinsic::amdgcn_cubeid:
14738 case Intrinsic::amdgcn_frexp_mant:
14739 case Intrinsic::amdgcn_fdot2:
14740 case Intrinsic::amdgcn_rcp:
14741 case Intrinsic::amdgcn_rsq:
14742 case Intrinsic::amdgcn_rsq_clamp:
14743 case Intrinsic::amdgcn_rcp_legacy:
14744 case Intrinsic::amdgcn_rsq_legacy:
14745 case Intrinsic::amdgcn_trig_preop:
14746 case Intrinsic::amdgcn_tanh:
14747 case Intrinsic::amdgcn_log:
14748 case Intrinsic::amdgcn_exp2:
14749 case Intrinsic::amdgcn_sqrt:
14767 unsigned MaxDepth)
const {
14770 unsigned Opcode =
MI->getOpcode();
14772 if (Opcode == AMDGPU::G_FCANONICALIZE)
14775 std::optional<FPValueAndVReg> FCR;
14778 if (FCR->Value.isSignaling())
14780 if (!FCR->Value.isDenormal())
14791 case AMDGPU::G_FADD:
14792 case AMDGPU::G_FSUB:
14793 case AMDGPU::G_FMUL:
14794 case AMDGPU::G_FCEIL:
14795 case AMDGPU::G_FFLOOR:
14796 case AMDGPU::G_FRINT:
14797 case AMDGPU::G_FNEARBYINT:
14798 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
14799 case AMDGPU::G_INTRINSIC_TRUNC:
14800 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
14801 case AMDGPU::G_FMA:
14802 case AMDGPU::G_FMAD:
14803 case AMDGPU::G_FSQRT:
14804 case AMDGPU::G_FDIV:
14805 case AMDGPU::G_FREM:
14806 case AMDGPU::G_FPOW:
14807 case AMDGPU::G_FPEXT:
14808 case AMDGPU::G_FLOG:
14809 case AMDGPU::G_FLOG2:
14810 case AMDGPU::G_FLOG10:
14811 case AMDGPU::G_FPTRUNC:
14812 case AMDGPU::G_AMDGPU_RCP_IFLAG:
14813 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
14814 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
14815 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
14816 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14818 case AMDGPU::G_FNEG:
14819 case AMDGPU::G_FABS:
14820 case AMDGPU::G_FCOPYSIGN:
14822 case AMDGPU::G_FMINNUM:
14823 case AMDGPU::G_FMAXNUM:
14824 case AMDGPU::G_FMINNUM_IEEE:
14825 case AMDGPU::G_FMAXNUM_IEEE:
14826 case AMDGPU::G_FMINIMUM:
14827 case AMDGPU::G_FMAXIMUM:
14828 case AMDGPU::G_FMINIMUMNUM:
14829 case AMDGPU::G_FMAXIMUMNUM: {
14830 if (Subtarget->supportsMinMaxDenormModes() ||
14837 case AMDGPU::G_BUILD_VECTOR:
14842 case AMDGPU::G_INTRINSIC:
14843 case AMDGPU::G_INTRINSIC_CONVERGENT:
14845 case Intrinsic::amdgcn_fmul_legacy:
14846 case Intrinsic::amdgcn_fmad_ftz:
14847 case Intrinsic::amdgcn_sqrt:
14848 case Intrinsic::amdgcn_fmed3:
14849 case Intrinsic::amdgcn_sin:
14850 case Intrinsic::amdgcn_cos:
14851 case Intrinsic::amdgcn_log:
14852 case Intrinsic::amdgcn_exp2:
14853 case Intrinsic::amdgcn_log_clamp:
14854 case Intrinsic::amdgcn_rcp:
14855 case Intrinsic::amdgcn_rcp_legacy:
14856 case Intrinsic::amdgcn_rsq:
14857 case Intrinsic::amdgcn_rsq_clamp:
14858 case Intrinsic::amdgcn_rsq_legacy:
14859 case Intrinsic::amdgcn_div_scale:
14860 case Intrinsic::amdgcn_div_fmas:
14861 case Intrinsic::amdgcn_div_fixup:
14862 case Intrinsic::amdgcn_fract:
14863 case Intrinsic::amdgcn_cvt_pkrtz:
14864 case Intrinsic::amdgcn_cubeid:
14865 case Intrinsic::amdgcn_cubema:
14866 case Intrinsic::amdgcn_cubesc:
14867 case Intrinsic::amdgcn_cubetc:
14868 case Intrinsic::amdgcn_frexp_mant:
14869 case Intrinsic::amdgcn_fdot2:
14870 case Intrinsic::amdgcn_trig_preop:
14871 case Intrinsic::amdgcn_tanh:
14890 if (
C.isDenormal()) {
14904 if (
C.isSignaling()) {
14927SITargetLowering::performFCanonicalizeCombine(
SDNode *
N,
14928 DAGCombinerInfo &DCI)
const {
14929 SelectionDAG &DAG = DCI.DAG;
14931 EVT VT =
N->getValueType(0);
14940 EVT VT =
N->getValueType(0);
14941 return getCanonicalConstantFP(DAG, SDLoc(
N), VT, CFP->getValueAPF());
14957 EVT EltVT =
Lo.getValueType();
14960 for (
unsigned I = 0;
I != 2; ++
I) {
14964 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14965 }
else if (
Op.isUndef()) {
15001 return AMDGPUISD::FMAX3;
15003 return AMDGPUISD::FMAXIMUM3;
15005 return AMDGPUISD::SMAX3;
15007 return AMDGPUISD::UMAX3;
15011 return AMDGPUISD::FMIN3;
15013 return AMDGPUISD::FMINIMUM3;
15015 return AMDGPUISD::SMIN3;
15017 return AMDGPUISD::UMIN3;
15038 if (!MinK || !MaxK)
15050 unsigned Med3Opc =
Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
15051 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
15052 return DAG.
getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
15111 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
15117 if (
Info->getMode().DX10Clamp) {
15126 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
15158 case AMDGPUISD::FMIN_LEGACY:
15159 case AMDGPUISD::FMAX_LEGACY:
15160 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.
hasMin3Max3_16()) ||
15171 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.
hasMin3Max3_16());
15180 DAGCombinerInfo &DCI)
const {
15181 SelectionDAG &DAG = DCI.DAG;
15213 if (
SDValue Med3 = performIntMed3ImmCombine(
15218 if (
SDValue Med3 = performIntMed3ImmCombine(
15224 if (
SDValue Med3 = performIntMed3ImmCombine(
15229 if (
SDValue Med3 = performIntMed3ImmCombine(
15242 (
Opc == AMDGPUISD::FMIN_LEGACY &&
15243 Op0.
getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
15244 (VT == MVT::f32 || VT == MVT::f64 ||
15245 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
15246 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
15247 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
15248 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
15250 if (
SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(
N), Op0, Op1))
15257 const SDNodeFlags
Flags =
N->getFlags();
15259 !Subtarget->hasIEEEMinimumMaximumInsts() &&
Flags.hasNoNaNs()) {
15262 return DAG.
getNode(NewOpc, SDLoc(
N), VT, Op0, Op1, Flags);
15272 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
15273 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
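// A min/max pair clamping to the constants 0.0 and 1.0 (in either order) is
// the pattern recognized above; it lets the caller form AMDGPUISD::CLAMP,
// i.e. the hardware clamp modifier, instead of a general fmed3.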
15282 DAGCombinerInfo &DCI)
const {
15283 EVT VT =
N->getValueType(0);
15287 SelectionDAG &DAG = DCI.DAG;
15298 return DAG.
getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
15302 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
15306 if (
Info->getMode().DX10Clamp) {
15319 return DAG.
getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
15326 DAGCombinerInfo &DCI)
const {
15330 return DCI.DAG.getUNDEF(
N->getValueType(0));
15338 bool IsDivergentIdx,
15343 unsigned VecSize = EltSize * NumElem;
15346 if (VecSize <= 64 && EltSize < 32)
15355 if (IsDivergentIdx)
15359 unsigned NumInsts = NumElem +
15360 ((EltSize + 31) / 32) * NumElem ;
15364 if (Subtarget->useVGPRIndexMode())
15365 return NumInsts <= 16;
15369 if (Subtarget->hasMovrel())
15370 return NumInsts <= 15;
15376 SDValue Idx =
N->getOperand(
N->getNumOperands() - 1);
15391SITargetLowering::performExtractVectorEltCombine(
SDNode *
N,
15392 DAGCombinerInfo &DCI)
const {
15398 EVT ResVT =
N->getValueType(0);
15422 if (!
C ||
C->getZExtValue() != 0x1f)
15438 if (Vec.
hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15466 DCI.AddToWorklist(Elt0.
getNode());
15467 DCI.AddToWorklist(Elt1.
getNode());
15489 if (!DCI.isBeforeLegalize())
15497 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15500 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15501 unsigned EltIdx = BitIndex / 32;
15502 unsigned LeftoverBitIdx = BitIndex % 32;
15506 DCI.AddToWorklist(Cast.
getNode());
15510 DCI.AddToWorklist(Elt.
getNode());
15513 DCI.AddToWorklist(Srl.
getNode());
15517 DCI.AddToWorklist(Trunc.
getNode());
15519 if (VecEltVT == ResVT) {
15531SITargetLowering::performInsertVectorEltCombine(
SDNode *
N,
15532 DAGCombinerInfo &DCI)
const {
15543 SelectionDAG &DAG = DCI.DAG;
15563 Src.getOperand(0).getValueType() == MVT::f16) {
15564 return Src.getOperand(0);
15568 APFloat Val = CFP->getValueAPF();
15569 bool LosesInfo =
true;
15579 DAGCombinerInfo &DCI)
const {
15580 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15581 "combine only useful on gfx8");
15583 SDValue TruncSrc =
N->getOperand(0);
15584 EVT VT =
N->getValueType(0);
15585 if (VT != MVT::f16)
15588 if (TruncSrc.
getOpcode() != AMDGPUISD::FMED3 ||
15592 SelectionDAG &DAG = DCI.DAG;
15623unsigned SITargetLowering::getFusedOpcode(
const SelectionDAG &DAG,
15625 const SDNode *N1)
const {
15630 if (((VT == MVT::f32 &&
15632 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15652 EVT VT =
N->getValueType(0);
15653 if (VT != MVT::i32 && VT != MVT::i64)
15659 unsigned Opc =
N->getOpcode();
15714 if (!Const ||
Hi_32(Const->getZExtValue()) !=
uint32_t(-1))
15733 DAGCombinerInfo &DCI)
const {
15736 SelectionDAG &DAG = DCI.DAG;
15737 EVT VT =
N->getValueType(0);
15747 if (!
N->isDivergent() && Subtarget->hasSMulHi())
15751 if (NumBits <= 32 || NumBits > 64)
15762 if (!Subtarget->hasFullRate64Ops()) {
15763 unsigned NumUsers = 0;
15764 for (SDNode *User :
LHS->
users()) {
15767 if (!
User->isAnyAdd())
15791 bool MulSignedLo =
false;
15792 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
15801 if (VT != MVT::i64) {
15824 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
15826 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
15827 auto [AccumLo, AccumHi] = DAG.
SplitScalar(Accum, SL, MVT::i32, MVT::i32);
15829 if (!MulLHSUnsigned32) {
15836 if (!MulRHSUnsigned32) {
15847 if (VT != MVT::i64)
15853SITargetLowering::foldAddSub64WithZeroLowBitsTo32(
SDNode *
N,
15854 DAGCombinerInfo &DCI)
const {
15864 SelectionDAG &DAG = DCI.DAG;
15879 unsigned Opcode =
N->getOpcode();
15883 DAG.
getNode(Opcode, SL, MVT::i32,
Hi, ConstHi32,
N->getFlags());
15894static std::optional<ByteProvider<SDValue>>
15897 if (!Byte0 || Byte0->isConstantZero()) {
15898 return std::nullopt;
15901 if (Byte1 && !Byte1->isConstantZero()) {
15902 return std::nullopt;
15908 unsigned FirstCs =
First & 0x0c0c0c0c;
15909 unsigned SecondCs = Second & 0x0c0c0c0c;
15910 unsigned FirstNoCs =
First & ~0x0c0c0c0c;
15911 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
15913 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
15914 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
15915 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
15916 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
15918 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
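// addPermMasks above merges two per-byte V_PERM selector masks: bytes set to
// the 0x0c "zero" selector in one mask defer to the other mask, and the
// asserts check that every byte is zeroed in at least one of the inputs.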
15942 for (
int BPI = 0; BPI < 2; BPI++) {
15945 BPP = {Src1, Src0};
15947 unsigned ZeroMask = 0x0c0c0c0c;
15948 unsigned FMask = 0xFF << (8 * (3 - Step));
15950 unsigned FirstMask =
15951 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15952 unsigned SecondMask =
15953 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15957 int FirstGroup = -1;
15958 for (
int I = 0;
I < 2;
I++) {
15960 auto MatchesFirst = [&BPP](
DotSrc &IterElt) {
15961 return IterElt.SrcOp == *BPP.first.Src &&
15962 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15966 if (Match != Srcs.
end()) {
15967 Match->PermMask =
addPermMasks(FirstMask, Match->PermMask);
15972 if (FirstGroup != -1) {
15974 auto MatchesSecond = [&BPP](
DotSrc &IterElt) {
15975 return IterElt.SrcOp == *BPP.second.Src &&
15976 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15979 if (Match != Srcs.
end()) {
15980 Match->PermMask =
addPermMasks(SecondMask, Match->PermMask);
15982 Srcs.
push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15990 unsigned ZeroMask = 0x0c0c0c0c;
15991 unsigned FMask = 0xFF << (8 * (3 - Step));
15995 ((Src0.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15999 ((Src1.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
16008 if (Srcs.
size() == 1) {
16009 auto *Elt = Srcs.
begin();
16013 if (Elt->PermMask == 0x3020100)
16016 return DAG.
getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
16020 auto *FirstElt = Srcs.
begin();
16021 auto *SecondElt = std::next(FirstElt);
16028 auto FirstMask = FirstElt->PermMask;
16029 auto SecondMask = SecondElt->PermMask;
16031 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
16032 unsigned FirstPlusFour = FirstMask | 0x04040404;
16035 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
16047 FirstElt = std::next(SecondElt);
16048 if (FirstElt == Srcs.
end())
16051 SecondElt = std::next(FirstElt);
16054 if (SecondElt == Srcs.
end()) {
16059 DAG.
getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
16060 DAG.
getConstant(FirstElt->PermMask, SL, MVT::i32)));
16066 return Perms.
size() == 2
16072 for (
auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
16073 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
16074 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
16075 EntryMask += ZeroMask;
16080 auto Opcode =
Op.getOpcode();
16082 return (Opcode ==
ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
16083 Opcode == AMDGPUISD::MUL_I24);
16086static std::optional<bool>
16097 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
16100 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
16102 assert(!(S0IsUnsigned && S0IsSigned));
16103 assert(!(S1IsUnsigned && S1IsSigned));
16111 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
16117 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
16118 return std::nullopt;
16130 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
16131 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
16136 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
16142 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
16143 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
16144 return std::nullopt;
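// The helper above infers whether a mul feeding a dot-product accumulation is
// signed or unsigned from the known sign/leading-zero bits of its operands;
// std::nullopt means the operands disagree or nothing can be proven, which
// blocks the dot4 formation below.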
16150                                            DAGCombinerInfo &DCI) const {
16151   SelectionDAG &DAG = DCI.DAG;
16152   EVT VT = N->getValueType(0);
16158   if (Subtarget->hasMad64_32()) {
16159     if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16164   if (SDValue V = reassociateScalarOps(N, DAG)) {
16168   if (VT == MVT::i64) {
16169     if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16174       (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
16176     std::optional<bool> IsSigned;
16182     int ChainLength = 0;
16183     for (int I = 0; I < 4; I++) {
16187       auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
16190       auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
16195           TempNode->getOperand(MulIdx), *Src0, *Src1,
16196           TempNode->getOperand(MulIdx)->getOperand(0),
16197           TempNode->getOperand(MulIdx)->getOperand(1), DAG);
16201         IsSigned = *IterIsSigned;
16202       if (*IterIsSigned != *IsSigned)
16205       auto AddIdx = 1 - MulIdx;
16208       if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
16209         Src2s.push_back(TempNode->getOperand(AddIdx));
16219             TempNode->getOperand(AddIdx), *Src0, *Src1,
16220             TempNode->getOperand(AddIdx)->getOperand(0),
16221             TempNode->getOperand(AddIdx)->getOperand(1), DAG);
16225         if (*IterIsSigned != *IsSigned)
16229         ChainLength = I + 2;
16233       TempNode = TempNode->getOperand(AddIdx);
16235       ChainLength = I + 1;
16236       if (TempNode->getNumOperands() < 2)
16238       LHS = TempNode->getOperand(0);
16239       RHS = TempNode->getOperand(1);
16242     if (ChainLength < 2)
16248     if (ChainLength < 4) {
16258     bool UseOriginalSrc = false;
16259     if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
16260         Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
16261         Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
16262         Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
16263       SmallVector<unsigned, 4> SrcBytes;
16264       auto Src0Mask = Src0s.begin()->PermMask;
16265       SrcBytes.push_back(Src0Mask & 0xFF000000);
16266       bool UniqueEntries = true;
16267       for (auto I = 1; I < 4; I++) {
16268         auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
16271           UniqueEntries = false;
16277       if (UniqueEntries) {
16278         UseOriginalSrc = true;
16280         auto *FirstElt = Src0s.begin();
16284         auto *SecondElt = Src1s.begin();
16286                                          SecondElt->DWordOffset);
16295     if (!UseOriginalSrc) {
16302         DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
16305                                    : Intrinsic::amdgcn_udot4,
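The UseOriginalSrc test above skips the extra V_PERM_B32 when the shared selector already names four distinct bytes, so the original 32-bit sources can feed the dot intrinsic directly. A compact version of that uniqueness check (hypothetical helper):

// Returns true when the four selector bytes of Mask are pairwise distinct,
// i.e. no byte of the source dword is read twice and no shuffle is needed
// before handing the 32-bit sources to the dot4 intrinsic.
static bool selectorBytesAreUnique(unsigned Mask) {
  bool Seen[256] = {};
  for (int Byte = 0; Byte < 4; ++Byte) {
    unsigned Sel = (Mask >> (8 * Byte)) & 0xFF;
    if (Seen[Sel])
      return false;
    Seen[Sel] = true;
  }
  return true;
}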
16315   if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16320   unsigned Opc = LHS.getOpcode();
16332     auto Cond = RHS.getOperand(0);
16337     SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16354                                               DAGCombinerInfo &DCI) const {
16355   SelectionDAG &DAG = DCI.DAG;
16357   EVT VT = N->getValueType(0);
16370       SDNodeFlags ShlFlags = N1->getFlags();
16374       SDNodeFlags NewShlFlags =
16379       DCI.AddToWorklist(Inner.getNode());
16386   if (Subtarget->hasMad64_32()) {
16387     if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16396   if (VT == MVT::i64) {
16397     if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16410   if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
16411       Y->isDivergent() != Z->isDivergent()) {
16420     if (Y->isDivergent())
16423     SDNodeFlags ReassocFlags =
16426     DCI.AddToWorklist(UniformInner.getNode());
16434                                           DAGCombinerInfo &DCI) const {
16435   SelectionDAG &DAG = DCI.DAG;
16436   EVT VT = N->getValueType(0);
16438   if (VT == MVT::i64) {
16439     if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16443   if (VT != MVT::i32)
16452   unsigned Opc = RHS.getOpcode();
16459     auto Cond = RHS.getOperand(0);
16464     SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16482 SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
16483                                                  DAGCombinerInfo &DCI) const {
16485   if (N->getValueType(0) != MVT::i32)
16491   SelectionDAG &DAG = DCI.DAG;
16496   unsigned LHSOpc = LHS.getOpcode();
16497   unsigned Opc = N->getOpcode();
16501     return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
16507                                            DAGCombinerInfo &DCI) const {
16511   SelectionDAG &DAG = DCI.DAG;
16512   EVT VT = N->getValueType(0);
16524     if (A == LHS.getOperand(1)) {
16525       unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16526       if (FusedOp != 0) {
16528         return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
16536     if (A == RHS.getOperand(1)) {
16537       unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16538       if (FusedOp != 0) {
16540         return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
16549                                            DAGCombinerInfo &DCI) const {
16553   SelectionDAG &DAG = DCI.DAG;
16555   EVT VT = N->getValueType(0);
16568     if (A == LHS.getOperand(1)) {
16569       unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16570       if (FusedOp != 0) {
16574         return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
16583     if (A == RHS.getOperand(1)) {
16584       unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16585       if (FusedOp != 0) {
16587         return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
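Both the fadd and fsub paths above fold a doubled operand into one fused multiply-add once getFusedOpcode reports that FMA or FMAD is profitable; spelled out, the four shapes handled are:

//   (a + a) + x  ==>  fma(a,  2.0, x)
//   x + (a + a)  ==>  fma(a,  2.0, x)
//   (a + a) - x  ==>  fma(a,  2.0, -x)
//   x - (a + a)  ==>  fma(a, -2.0, x)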
16596                                            DAGCombinerInfo &DCI) const {
16597   SelectionDAG &DAG = DCI.DAG;
16599   EVT VT = N->getValueType(0);
16608   SDNodeFlags Flags = N->getFlags();
16609   SDNodeFlags RHSFlags = RHS->getFlags();
16615     bool IsNegative = false;
16616     if (CLHS->isExactlyValue(1.0) ||
16617         (IsNegative = CLHS->isExactlyValue(-1.0))) {
16623           DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
16633                                            DAGCombinerInfo &DCI) const {
16634   SelectionDAG &DAG = DCI.DAG;
16635   EVT VT = N->getValueType(0);
16639   if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
16640       (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16655   if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16660   const ConstantFPSDNode *FalseNode =
16670   if (ScalarVT == MVT::f32 &&
16676   if (TrueNodeExpVal == INT_MIN)
16679   if (FalseNodeExpVal == INT_MIN)
16699                                           DAGCombinerInfo &DCI) const {
16700   SelectionDAG &DAG = DCI.DAG;
16701   EVT VT = N->getValueType(0);
16704   if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16722       (N->getFlags().hasAllowContract() &&
16723        FMA->getFlags().hasAllowContract())) {
16757     if (Vec1 == Vec2 || Vec3 == Vec4)
16763     if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16764       return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
16772                                             DAGCombinerInfo &DCI) const {
16773   SelectionDAG &DAG = DCI.DAG;
16778   EVT VT = LHS.getValueType();
16807       return LHS.getOperand(0);
16815       LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
16822     const APInt &CT = LHS.getConstantOperandAPInt(1);
16823     const APInt &CF = LHS.getConstantOperandAPInt(2);
16831       return LHS.getOperand(0);
16863         DAG.getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
16868                                  {Op0Hi, Op1Hi, CarryInHi});
16878     DCI.CombineTo(LHS.getNode(), Result);
16882   if (VT != MVT::f32 && VT != MVT::f64 &&
16883       (!Subtarget->has16BitInsts() || VT != MVT::f16))
16898   const unsigned IsInfMask =
16900   const unsigned IsFiniteMask =
16905   return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
16914 SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
16915                                               DAGCombinerInfo &DCI) const {
16916   SelectionDAG &DAG = DCI.DAG;
16918   unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
16937     unsigned ShiftOffset = 8 * Offset;
16939       ShiftOffset -= C->getZExtValue();
16941       ShiftOffset += C->getZExtValue();
16943     if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16944       return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
16945                          MVT::f32, Shifted);
16956     DCI.AddToWorklist(N);
16963     return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
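The ShiftOffset arithmetic above folds a constant shift into the byte index of the conversion, provided the adjusted index still names a whole byte of the dword. A plain-C++ sketch of that index adjustment (hypothetical helper):

// cvt_f32_ubyte<N>(x >> c) reads byte N + c/8 of x; a left shift reads a
// lower byte instead. Returns the folded byte index, or -1 when the shift
// amount is not a multiple of 8 or the index falls outside the dword.
static int foldCvtUByteIndex(int ByteIdx, unsigned ShiftAmt, bool IsSrl) {
  int BitOffset = 8 * ByteIdx + (IsSrl ? (int)ShiftAmt : -(int)ShiftAmt);
  if (BitOffset < 0 || BitOffset >= 32 || (BitOffset % 8) != 0)
    return -1;
  return BitOffset / 8;
}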
16969                                            DAGCombinerInfo &DCI) const {
16974   const MachineFunction &MF = DCI.DAG.getMachineFunction();
16978       (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16979     return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
16982     APFloat One(F.getSemantics(), "1.0");
16984     return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
16990                                             DAGCombinerInfo &DCI) const {
17011   bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
17012   bool isInteger = LHS.getValueType().isInteger();
17015   if (!isFloatingPoint && !isInteger)
17020   if (!isEquality && !isNonEquality)
17037   if (isFloatingPoint) {
17039     if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
17050   if (!(isEquality && TrueVal == ConstVal) &&
17051       !(isNonEquality && FalseVal == ConstVal))
17058                                   SelectLHS, SelectRHS);
17063   switch (N->getOpcode()) {
17079     if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
17089   switch (N->getOpcode()) {
17091     return performAddCombine(N, DCI);
17093     return performPtrAddCombine(N, DCI);
17095     return performSubCombine(N, DCI);
17098     return performAddCarrySubCarryCombine(N, DCI);
17100     return performFAddCombine(N, DCI);
17102     return performFSubCombine(N, DCI);
17104     return performFDivCombine(N, DCI);
17106     return performFMulCombine(N, DCI);
17108     return performSetCCCombine(N, DCI);
17110     if (auto Res = performSelectCombine(N, DCI))
17125   case AMDGPUISD::FMIN_LEGACY:
17126   case AMDGPUISD::FMAX_LEGACY:
17127     return performMinMaxCombine(N, DCI);
17129     return performFMACombine(N, DCI);
17131     return performAndCombine(N, DCI);
17133     return performOrCombine(N, DCI);
17136     if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
17137         TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
17143     return performXorCombine(N, DCI);
17145     return performZeroExtendCombine(N, DCI);
17147     return performSignExtendInRegCombine(N, DCI);
17148   case AMDGPUISD::FP_CLASS:
17149     return performClassCombine(N, DCI);
17151     return performFCanonicalizeCombine(N, DCI);
17152   case AMDGPUISD::RCP:
17153     return performRcpCombine(N, DCI);
17155   case AMDGPUISD::FRACT:
17156   case AMDGPUISD::RSQ:
17157   case AMDGPUISD::RCP_LEGACY:
17158   case AMDGPUISD::RCP_IFLAG:
17159   case AMDGPUISD::RSQ_CLAMP: {
17168     return performUCharToFloatCombine(N, DCI);
17170     return performFCopySignCombine(N, DCI);
17171   case AMDGPUISD::CVT_F32_UBYTE0:
17172   case AMDGPUISD::CVT_F32_UBYTE1:
17173   case AMDGPUISD::CVT_F32_UBYTE2:
17174   case AMDGPUISD::CVT_F32_UBYTE3:
17175     return performCvtF32UByteNCombine(N, DCI);
17176   case AMDGPUISD::FMED3:
17177     return performFMed3Combine(N, DCI);
17178   case AMDGPUISD::CVT_PKRTZ_F16_F32:
17179     return performCvtPkRTZCombine(N, DCI);
17180   case AMDGPUISD::CLAMP:
17181     return performClampCombine(N, DCI);
17184     EVT VT = N->getValueType(0);
17187     if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
17190       EVT EltVT = Src.getValueType();
17191       if (EltVT != MVT::i16)
17201     return performExtractVectorEltCombine(N, DCI);
17203     return performInsertVectorEltCombine(N, DCI);
17205     return performFPRoundCombine(N, DCI);
17214     return performMemSDNodeCombine(MemNode, DCI);
17245   unsigned Opcode = Node->getMachineOpcode();
17248   int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
17249   if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
17252   SDNode *Users[5] = {nullptr};
17254   unsigned DmaskIdx =
17255       AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
17256   unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
17257   unsigned NewDmask = 0;
17258   unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
17259   unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
17260   bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
17261                  (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
17262   unsigned TFCLane = 0;
17263   bool HasChain = Node->getNumValues() > 1;
17265   if (OldDmask == 0) {
17273     TFCLane = OldBitsSet;
17277   for (SDUse &Use : Node->uses()) {
17280     if (Use.getResNo() != 0)
17283     SDNode *User = Use.getUser();
17286     if (!User->isMachineOpcode() ||
17287         User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
17299     if (UsesTFC && Lane == TFCLane) {
17304       for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17306         Dmask &= ~(1 << Comp);
17314       NewDmask |= 1 << Comp;
17319   bool NoChannels = !NewDmask;
17326     if (OldBitsSet == 1)
17332   if (NewDmask == OldDmask)
17341   unsigned NewChannels = BitsSet + UsesTFC;
17345   assert(NewOpcode != -1 &&
17346          NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17347          "failed to find equivalent MIMG op");
17355   MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
17357   MVT ResultVT = NewChannels == 1
17360                      : NewChannels == 5 ? 8
17362   SDVTList NewVTList =
17365   MachineSDNode *NewNode =
17374   if (NewChannels == 1) {
17384   for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17389     if (i || !NoChannels)
17394     if (NewUser != User) {
17404       Idx = AMDGPU::sub1;
17407       Idx = AMDGPU::sub2;
17410       Idx = AMDGPU::sub3;
17413       Idx = AMDGPU::sub4;
17424     Op = Op.getOperand(0);
17445       MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17449       Node->getOperand(0), SL, VReg, SrcVal,
17455   return ToResultReg.getNode();
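adjustWritemask above clears dmask bits whose result lanes are never extracted and renumbers the survivors (keeping the TFE/LWE status lane when present). A self-contained sketch of the renumbering, ignoring the TFE special case (hypothetical helper):

// OldDmask marks which image components are produced; UsedResultLanes marks
// which lanes of the result vector are actually read. Returns the shrunk
// dmask and, for each kept component, its slot in the compressed result.
static unsigned shrinkDmask(unsigned OldDmask, unsigned UsedResultLanes,
                            int NewSlotForBit[4]) {
  unsigned NewDmask = 0, NextSlot = 0, ResLane = 0;
  for (unsigned Bit = 0; Bit < 4; ++Bit) {
    NewSlotForBit[Bit] = -1;
    if (!(OldDmask & (1u << Bit)))
      continue;                              // this component was never enabled
    if (UsedResultLanes & (1u << ResLane)) { // its result lane is actually read
      NewDmask |= 1u << Bit;
      NewSlotForBit[Bit] = NextSlot++;
    }
    ++ResLane;
  }
  return NewDmask;
}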
17460   for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
17462       Ops.push_back(Node->getOperand(i));
17468                                 Node->getOperand(i).getValueType(),
17469                                 Node->getOperand(i)),
17481   unsigned Opcode = Node->getMachineOpcode();
17483   if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17484       !TII->isGather4(Opcode) &&
17486     return adjustWritemask(Node, DAG);
17489   if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17495   case AMDGPU::V_DIV_SCALE_F32_e64:
17496   case AMDGPU::V_DIV_SCALE_F64_e64: {
17506         (Src0 == Src1 || Src0 == Src2))
17562       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
17563   unsigned InitIdx = 0;
17565   if (TII->isImage(MI)) {
17573     unsigned TFEVal = TFE ? TFE->getImm() : 0;
17574     unsigned LWEVal = LWE ? LWE->getImm() : 0;
17575     unsigned D16Val = D16 ? D16->getImm() : 0;
17577     if (!TFEVal && !LWEVal)
17588     assert(MO_Dmask && "Expected dmask operand in instruction");
17590     unsigned dmask = MO_Dmask->getImm();
17595     bool Packed = !Subtarget->hasUnpackedD16VMem();
17597     InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
17604     uint32_t DstSize = TRI.getRegSizeInBits(*DstRC) / 32;
17605     if (DstSize < InitIdx)
17609     InitIdx = TRI.getRegSizeInBits(*DstRC) / 32;
17617   Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
17618   unsigned NewDst = 0;
17623   unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17624   unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
17627   for (; SizeLeft; SizeLeft--, CurrIdx++) {
17628     NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
17648   MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
17660   if (TII->isVOP3(MI.getOpcode())) {
17662     TII->legalizeOperandsVOP3(MRI, MI);
17664   if (TII->isMAI(MI)) {
17669     int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17670                                              AMDGPU::OpName::scale_src0);
17671     if (Src0Idx != -1) {
17672       int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17673                                                AMDGPU::OpName::scale_src1);
17674       if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
17675           TII->usesConstantBus(MRI, MI, Src1Idx))
17676         TII->legalizeOpWithMove(MI, Src1Idx);
17683   if (TII->isImage(MI))
17684     TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
17758 std::pair<unsigned, const TargetRegisterClass *>
17765   if (Constraint.size() == 1) {
17769     if (VT == MVT::Other)
17772     switch (Constraint[0]) {
17779       RC = &AMDGPU::SReg_32RegClass;
17782       RC = &AMDGPU::SGPR_64RegClass;
17787         return std::pair(0U, nullptr);
17794         return std::pair(0U, nullptr);
17796       RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17797                                            : &AMDGPU::VGPR_32_Lo256RegClass;
17800       RC = Subtarget->has1024AddressableVGPRs()
17801                ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
17804         return std::pair(0U, nullptr);
17809       if (!Subtarget->hasMAIInsts())
17813         return std::pair(0U, nullptr);
17815       RC = &AMDGPU::AGPR_32RegClass;
17820         return std::pair(0U, nullptr);
17825   } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
17829       RC = &AMDGPU::AV_32RegClass;
17832       RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
17834         return std::pair(0U, nullptr);
17843       return std::pair(0U, RC);
17846   if (Kind != '\0') {
17848       RC = &AMDGPU::VGPR_32_Lo256RegClass;
17849     } else if (Kind == 's') {
17850       RC = &AMDGPU::SGPR_32RegClass;
17851     } else if (Kind == 'a') {
17852       RC = &AMDGPU::AGPR_32RegClass;
17858         return std::pair(0U, nullptr);
17864         return std::pair(0U, nullptr);
17868         RC = TRI->getVGPRClassForBitWidth(Width);
17870         RC = TRI->getSGPRClassForBitWidth(Width);
17872         RC = TRI->getAGPRClassForBitWidth(Width);
17874         Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17879           return std::pair(0U, nullptr);
17881       return std::pair(Reg, RC);
17887       return std::pair(0U, nullptr);
17888     if (Idx < RC->getNumRegs())
17890     return std::pair(0U, nullptr);
17896   Ret.second = TRI->getPhysRegBaseClass(Ret.first);
17902   if (Constraint.size() == 1) {
17903     switch (Constraint[0]) {
17913   } else if (Constraint == "DA" || Constraint == "DB") {
17921   if (Constraint.size() == 1) {
17922     switch (Constraint[0]) {
17930   } else if (Constraint.size() == 2) {
17931     if (Constraint == "VA")
17949                                                      std::vector<SDValue> &Ops,
17964   unsigned Size = Op.getScalarValueSizeInBits();
17968   if (Size == 16 && !Subtarget->has16BitInsts())
17972     Val = C->getSExtValue();
17976     Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17980     if (Size != 16 || Op.getNumOperands() != 2)
17982     if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17985       Val = C->getSExtValue();
17989       Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17999   if (Constraint.size() == 1) {
18000     switch (Constraint[0]) {
18015   } else if (Constraint.size() == 2) {
18016     if (Constraint == "DA") {
18017       int64_t HiBits = static_cast<int32_t>(Val >> 32);
18018       int64_t LoBits = static_cast<int32_t>(Val);
18022     if (Constraint == "DB") {
18030                                               unsigned MaxSize) const {
18031   unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
18032   bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
18034   MVT VT = Op.getSimpleValueType();
18059 switch (UnalignedClassID) {
18060 case AMDGPU::VReg_64RegClassID:
18061 return AMDGPU::VReg_64_Align2RegClassID;
18062 case AMDGPU::VReg_96RegClassID:
18063 return AMDGPU::VReg_96_Align2RegClassID;
18064 case AMDGPU::VReg_128RegClassID:
18065 return AMDGPU::VReg_128_Align2RegClassID;
18066 case AMDGPU::VReg_160RegClassID:
18067 return AMDGPU::VReg_160_Align2RegClassID;
18068 case AMDGPU::VReg_192RegClassID:
18069 return AMDGPU::VReg_192_Align2RegClassID;
18070 case AMDGPU::VReg_224RegClassID:
18071 return AMDGPU::VReg_224_Align2RegClassID;
18072 case AMDGPU::VReg_256RegClassID:
18073 return AMDGPU::VReg_256_Align2RegClassID;
18074 case AMDGPU::VReg_288RegClassID:
18075 return AMDGPU::VReg_288_Align2RegClassID;
18076 case AMDGPU::VReg_320RegClassID:
18077 return AMDGPU::VReg_320_Align2RegClassID;
18078 case AMDGPU::VReg_352RegClassID:
18079 return AMDGPU::VReg_352_Align2RegClassID;
18080 case AMDGPU::VReg_384RegClassID:
18081 return AMDGPU::VReg_384_Align2RegClassID;
18082 case AMDGPU::VReg_512RegClassID:
18083 return AMDGPU::VReg_512_Align2RegClassID;
18084 case AMDGPU::VReg_1024RegClassID:
18085 return AMDGPU::VReg_1024_Align2RegClassID;
18086 case AMDGPU::AReg_64RegClassID:
18087 return AMDGPU::AReg_64_Align2RegClassID;
18088 case AMDGPU::AReg_96RegClassID:
18089 return AMDGPU::AReg_96_Align2RegClassID;
18090 case AMDGPU::AReg_128RegClassID:
18091 return AMDGPU::AReg_128_Align2RegClassID;
18092 case AMDGPU::AReg_160RegClassID:
18093 return AMDGPU::AReg_160_Align2RegClassID;
18094 case AMDGPU::AReg_192RegClassID:
18095 return AMDGPU::AReg_192_Align2RegClassID;
18096 case AMDGPU::AReg_256RegClassID:
18097 return AMDGPU::AReg_256_Align2RegClassID;
18098 case AMDGPU::AReg_512RegClassID:
18099 return AMDGPU::AReg_512_Align2RegClassID;
18100 case AMDGPU::AReg_1024RegClassID:
18101 return AMDGPU::AReg_1024_Align2RegClassID;
18117   if (Info->isEntryFunction()) {
18124     unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
18126         ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
18127         : TRI->getAlignedHighSGPRForRC(MF, 2,
18128                                        &AMDGPU::SGPR_64RegClass);
18129     Info->setSGPRForEXECCopy(SReg);
18131   assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
18132                              Info->getStackPtrOffsetReg()));
18133   if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
18134     MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
18138   if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
18139     MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
18141   if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
18142     MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
18144   Info->limitOccupancy(MF);
18146   if (ST.isWave32() && !MF.empty()) {
18147     for (auto &MBB : MF) {
18148       for (auto &MI : MBB) {
18149         TII->fixImplicitOperands(MI);
18159   if (ST.needsAlignedVGPRs()) {
18160     for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
18166       if (NewClassID != -1)
18167         MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
18176                                                      const APInt &DemandedElts,
18178                                                      unsigned Depth) const {
18180   unsigned Opc = Op.getOpcode();
18183     unsigned IID = Op.getConstantOperandVal(0);
18185     case Intrinsic::amdgcn_mbcnt_lo:
18186     case Intrinsic::amdgcn_mbcnt_hi: {
18192           IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
18202       Op, Known, DemandedElts, DAG, Depth);
18218   unsigned MaxValue =
18225                              unsigned BFEWidth, bool SExt, unsigned Depth) {
18229   unsigned Src1Cst = 0;
18230   if (Src1.isImm()) {
18231     Src1Cst = Src1.getImm();
18232   } else if (Src1.isReg()) {
18236     Src1Cst = Cst->Value.getZExtValue();
18247   if (Width >= BFEWidth)
18256     Known = Known.sext(BFEWidth);
18258     Known = Known.zext(BFEWidth);
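knownBitsForSBFE models the scalar bitfield-extract encoding in which the low bits of the second source give the field offset and bits [22:16] give the field width. A plain-value sketch of the operation those known bits approximate (hypothetical helper; encoding as assumed here):

#include <cstdint>

// Extract Width bits of Src0 starting at Offset, then sign- or zero-extend
// the field back to 64 bits.
static uint64_t scalarBFE(uint64_t Src0, uint32_t Src1Cst, bool Signed) {
  unsigned Offset = Src1Cst & 0x3f;
  unsigned Width = (Src1Cst >> 16) & 0x7f;
  if (Width == 0)
    return 0;
  uint64_t Field = Src0 >> Offset;
  if (Width < 64)
    Field &= (1ull << Width) - 1;
  if (Signed && Width < 64 && (Field & (1ull << (Width - 1))))
    Field |= ~0ull << Width; // sign-extend the extracted field
  return Field;
}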
18264                                                       unsigned Depth) const {
18267   switch (MI->getOpcode()) {
18268 case AMDGPU::S_BFE_I32:
18271 case AMDGPU::S_BFE_U32:
18274 case AMDGPU::S_BFE_I64:
18277 case AMDGPU::S_BFE_U64:
18280 case AMDGPU::G_INTRINSIC:
18281 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18284 case Intrinsic::amdgcn_workitem_id_x:
18287 case Intrinsic::amdgcn_workitem_id_y:
18290 case Intrinsic::amdgcn_workitem_id_z:
18293 case Intrinsic::amdgcn_mbcnt_lo:
18294 case Intrinsic::amdgcn_mbcnt_hi: {
18306 case Intrinsic::amdgcn_groupstaticsize: {
18317 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18320 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18323 case AMDGPU::G_AMDGPU_SMED3:
18324 case AMDGPU::G_AMDGPU_UMED3: {
18325     auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18352                                         unsigned Depth) const {
18359   AttributeList Attrs =
18361   if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18388   if (Header->getAlignment() != PrefAlign)
18389     return Header->getAlignment();
18391   unsigned LoopSize = 0;
18396       LoopSize += MBB->getAlignment().value() / 2;
18399       LoopSize += TII->getInstSizeInBytes(MI);
18400       if (LoopSize > 192)
18405   if (LoopSize <= 64)
18408   if (LoopSize <= 128)
18409     return CacheLineAlign;
18415     auto I = Exit->getFirstNonDebugInstr();
18416     if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18417       return CacheLineAlign;
18426     if (PreTerm == Pre->begin() ||
18427         std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18431     auto ExitHead = Exit->getFirstNonDebugInstr();
18432     if (ExitHead == Exit->end() ||
18433         ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18438   return CacheLineAlign;
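The loop-size bookkeeping above boils down to a few byte thresholds. A trivial sketch of the decision (hypothetical helper; 0 means keep the function's default preferred alignment):

// Loops over 192 bytes or up to 64 bytes are left alone; anything in between
// is padded to a 64-byte cache line (the 129..192 byte range additionally
// gets s_inst_prefetch bracketing in the code above).
static unsigned preferredLoopAlignBytes(unsigned LoopSizeBytes) {
  if (LoopSizeBytes > 192 || LoopSizeBytes <= 64)
    return 0;
  return 64;
}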
18446     N = N->getOperand(0).getNode();
18456   switch (N->getOpcode()) {
18464     if (Reg.isPhysical() || MRI.isLiveIn(Reg))
18465       return !TRI->isSGPRReg(MRI, Reg);
18471     return !TRI->isSGPRReg(MRI, Reg);
18475     unsigned AS = L->getAddressSpace();
18485 case AMDGPUISD::ATOMIC_CMP_SWAP:
18486 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
18487 case AMDGPUISD::BUFFER_ATOMIC_ADD:
18488 case AMDGPUISD::BUFFER_ATOMIC_SUB:
18489 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
18490 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
18491 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
18492 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
18493 case AMDGPUISD::BUFFER_ATOMIC_AND:
18494 case AMDGPUISD::BUFFER_ATOMIC_OR:
18495 case AMDGPUISD::BUFFER_ATOMIC_XOR:
18496 case AMDGPUISD::BUFFER_ATOMIC_INC:
18497 case AMDGPUISD::BUFFER_ATOMIC_DEC:
18498 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
18499 case AMDGPUISD::BUFFER_ATOMIC_FADD:
18500 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
18501 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
18507     return A->readMem() && A->writeMem();
18528   switch (Ty.getScalarSizeInBits()) {
18540                                                        const APInt &DemandedElts,
18543                                                        unsigned Depth) const {
18544   if (Op.getOpcode() == AMDGPUISD::CLAMP) {
18548     if (Info->getMode().DX10Clamp)
18560   if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
18580          << "Hardware instruction generated for atomic "
18582          << " operation at memory scope " << MemScope;
18587     Type *EltTy = VT->getElementType();
18588     return VT->getNumElements() == 2 &&
18608     unsigned BW = IT->getBitWidth();
18609     return BW == 32 || BW == 64;
18623     unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18624     return BW == 32 || BW == 64;
18627   if (Ty->isFloatTy() || Ty->isDoubleTy())
18631     return VT->getNumElements() == 2 &&
18632            VT->getElementType()->getPrimitiveSizeInBits() == 16;
18642                                          bool HasSystemScope) {
18649   if (HasSystemScope) {
18658     return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18671   const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18697       DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18710   bool HasSystemScope =
18742     if (!IT || IT->getBitWidth() != 32)
18748     if (Subtarget->hasEmulatedSystemScopeAtomics())
18764     if (!HasSystemScope &&
18765         Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18777     if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
18785         ConstVal && ConstVal->isNullValue())
18823     if (Ty->isFloatTy()) {
18828     if (Ty->isDoubleTy()) {
18849     if (Ty->isFloatTy() &&
18850         !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18863       if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18867       if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18871       if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18876       if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18881     if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18885     if (Ty->isFloatTy()) {
18888       if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18891       if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18896           Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18904     if (Subtarget->hasFlatAtomicFaddF32Inst())
18913     if (Subtarget->hasLDSFPAtomicAddF32()) {
18914       if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18916       if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18944   if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18946   if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18950   if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18952   if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
19005   if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
19006     return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
19007                                  : &AMDGPU::SReg_32RegClass;
19008   if (!TRI->isSGPRClass(RC) && !isDivergent)
19009     return TRI->getEquivalentSGPRClass(RC);
19010   if (TRI->isSGPRClass(RC) && isDivergent) {
19011     if (Subtarget->hasGFX90AInsts())
19012       return TRI->getEquivalentAVClass(RC);
19013     return TRI->getEquivalentVGPRClass(RC);
19026                       unsigned WaveSize) {
19031   if (!IT || IT->getBitWidth() != WaveSize)
19036   if (!Visited.insert(V).second)
19038   bool Result = false;
19039   for (const auto *U : V->users()) {
19041     if (V == U->getOperand(1)) {
19046       case Intrinsic::amdgcn_if_break:
19047       case Intrinsic::amdgcn_if:
19048       case Intrinsic::amdgcn_else:
19053     if (V == U->getOperand(0)) {
19058       case Intrinsic::amdgcn_end_cf:
19059       case Intrinsic::amdgcn_loop:
19065     Result = hasCFUser(U, Visited, WaveSize);
19074                                                  const Value *V) const {
19076     if (CI->isInlineAsm()) {
19085       for (auto &TC : TargetConstraints) {
19099   return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
19127   return MRI.hasOneNonDBGUse(N0);
19134   if (I.getMetadata("amdgpu.noclobber"))
19136   if (I.getMetadata("amdgpu.last.use"))
19200     Alignment = RMW->getAlign();
19213   bool FullFlatEmulation =
19215       ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
19216        (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
19217         RMW->getType()->isDoubleTy()));
19220   bool ReturnValueIsUsed = !AI->use_empty();
19229   if (FullFlatEmulation) {
19240   std::prev(BB->end())->eraseFromParent();
19241   Builder.SetInsertPoint(BB);
19243   Value *LoadedShared = nullptr;
19244   if (FullFlatEmulation) {
19245     CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
19246                                                  {Addr}, nullptr, "is.shared");
19247     Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
19248     Builder.SetInsertPoint(SharedBB);
19249     Value *CastToLocal = Builder.CreateAddrSpaceCast(
19255     LoadedShared = Clone;
19257     Builder.CreateBr(PhiBB);
19258     Builder.SetInsertPoint(CheckPrivateBB);
19261   CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19262                                                 {Addr}, nullptr, "is.private");
19263   Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19265   Builder.SetInsertPoint(PrivateBB);
19267   Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19270   Value *LoadedPrivate;
19272     LoadedPrivate = Builder.CreateAlignedLoad(
19273         RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
19276                                      LoadedPrivate, RMW->getValOperand());
19278     Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19280     auto [ResultLoad, Equal] =
19286     LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19289   Builder.CreateBr(PhiBB);
19291   Builder.SetInsertPoint(GlobalBB);
19295   if (FullFlatEmulation) {
19296     Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19305   if (!FullFlatEmulation) {
19310     MDNode *RangeNotPrivate =
19313     LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
19317   Builder.CreateBr(PhiBB);
19319   Builder.SetInsertPoint(PhiBB);
19321   if (ReturnValueIsUsed) {
19324     if (FullFlatEmulation)
19325       Loaded->addIncoming(LoadedShared, SharedBB);
19326     Loaded->addIncoming(LoadedPrivate, PrivateBB);
19327     Loaded->addIncoming(LoadedGlobal, GlobalBB);
19328     Loaded->takeName(AI);
19331   Builder.CreateBr(ExitBB);
19335                                               unsigned PtrOpIdx) {
19336   Value *PtrOp = I->getOperand(PtrOpIdx);
19343   I->setOperand(PtrOpIdx, ASCast);
19355       ConstVal && ConstVal->isNullValue()) {
19385          "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19393          "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19408   LoadInst *LI = Builder.CreateAlignedLoad(
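The flat-atomic expansion above ends up building an address-space dispatch around the original atomicrmw. Roughly, the IR it produces has the following shape (illustrative only; it uses the real llvm.amdgcn.is.shared / llvm.amdgcn.is.private intrinsics, and block names are arbitrary):

//   entry:
//     %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
//     br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
//   atomicrmw.shared:          ; atomicrmw on the addrspace(3) cast
//     br label %atomicrmw.phi
//   atomicrmw.check.private:
//     %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
//     br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
//   atomicrmw.private:         ; plain load, op, store on the addrspace(5) cast
//     br label %atomicrmw.phi
//   atomicrmw.global:          ; atomicrmw on the addrspace(1) cast
//     br label %atomicrmw.phi
//   atomicrmw.phi:             ; phi of the three results if the value is used
//     br label %atomicrmw.end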
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
Contains matchers for matching SSA Machine Instructions.
static bool isUndef(const MachineInstr &MI)
Register const TargetRegisterInfo * TRI
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
Contains matchers for matching SelectionDAG nodes and values.
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static void getCoopAtomicOperandsInfo(const CallBase &CI, bool IsLoad, TargetLoweringBase::IntrinsicInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool isCopyFromRegOfInlineAsm(const SDNode *N)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isFloatingPointWaveReduceOperation(unsigned Opc)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static constexpr int Concat[]
AMDGPUArgumentUsageInfo & getArgUsageInfo()
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
const unsigned XorTermOpc
const unsigned AndSaveExecOpc
static constexpr roundingMode rmNearestTiesToEven
static const fltSemantics & IEEEhalf()
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
LLVM_READONLY int getExactLog2Abs() const
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
This class represents an incoming formal argument to a Function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
const Function * getParent() const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
Value * getNewValOperand()
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getCompareOperand()
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
const Function * getParent() const
Return the enclosing method, or null if none.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
static bool isFPPredicate(Predicate P)
static bool isIntPredicate(Predicate P)
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
A parsed version of the target data layout string and methods for querying it.
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
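A short illustrative sketch of how these two DataLayout queries are commonly paired when reserving stack space for a value of type Ty (MF and Ty are assumed context):
  const DataLayout &DL = MF.getDataLayout();
  Align ABIAlign = DL.getABITypeAlign(Ty);                  // minimum ABI alignment
  uint64_t Size = DL.getTypeAllocSize(Ty).getFixedValue();  // size including tail padding
  int FI = MF.getFrameInfo().CreateStackObject(Size, ABIAlign, /*isSpillSlot=*/false);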
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLoweringInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Argument * getArg(unsigned i) const
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
bool supportsWaveWideBPermute() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
int64_t getOffset() const
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
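A brief sketch of constructing low-level types with these helpers (the address space and bit widths are only examples):
  LLT S16 = LLT::scalar(16);                          // 16-bit scalar
  LLT S32 = S16.changeElementSize(32);                // scalar case: simply widens to s32
  LLT FlatPtr = LLT::pointer(/*AddrSpace=*/0, /*SizeInBits=*/64);  // 64-bit pointer in AS 0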
This is an important class for using LLVM in a threaded context.
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
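Range metadata built this way can then be attached to an instruction with setMetadata; a sketch, assuming LI is an existing LoadInst* and the [0, 1024) bounds are arbitrary:
  #include "llvm/IR/MDBuilder.h"

  // Annotate the 32-bit load result as lying in [0, 1024).
  MDBuilder MDB(LI->getContext());
  MDNode *Range = MDB.createRange(APInt(32, 0), APInt(32, 1024));
  LI->setMetadata(LLVMContext::MD_range, Range);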
const MDOperand & getOperand(unsigned I) const
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the vector's number of elements is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
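A small illustrative sketch of how these MVT helpers compose, e.g. to form an integer type covering a whole vector:
  MVT VecVT = MVT::getVectorVT(MVT::i16, 4);                     // v4i16
  MVT EltVT = VecVT.getScalarType();                             // i16
  MVT WideIntVT =
      MVT::getIntegerVT(VecVT.getSizeInBits().getFixedValue());  // i64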
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
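A short sketch of splitting a block after an instruction inside a custom inserter (BB and MI are assumed context):
  MachineBasicBlock *TailBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
  // BB now ends just after MI; TailBB holds the remaining instructions,
  // and the control-flow edges are updated accordingly.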
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
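A hedged sketch of allocating a memory operand for a 32-bit dereferenceable load (MF is assumed context; the pointer info and flags are illustrative):
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),                                   // unknown location
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable,
      LLT::scalar(32), Align(4));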
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
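These builder methods are typically chained off a BuildMI call; a minimal sketch of emitting an immediate-move style instruction inside a custom inserter, where MY_MOV_IMM is a hypothetical opcode and MBB, MI, DL, TII and DstReg are assumed context:
  BuildMI(MBB, MI, DL, TII->get(MY_MOV_IMM), DstReg)
      .addImm(42);   // chain operands onto the new instruction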
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
A Module instance is used to store all the information related to an LLVM module.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool isWholeWaveFunction() const
bool hasWorkGroupIDZ() const
AMDGPU::ClusterDimsAttr getClusterDims() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if N can be combined with a neighboring node to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store using a target-specific method.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load using a target-specific method.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallBase &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion using a target-specific method.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
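For instance, an i32 compare-and-select can be built as follows (DL, LHS, RHS, TrueVal, FalseVal and DAG are assumed context):
  SDValue Cmp = DAG.getSetCC(DL, MVT::i1, LHS, RHS, ISD::SETEQ);
  SDValue Res = DAG.getSelect(DL, MVT::i32, Cmp, TrueVal, FalseVal);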
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI bool SignBitIsZeroFP(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero, for a floating-point value.
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
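A sketch of the usual split-a-binary-vector-op pattern built from GetSplitDestVTs and SplitVectorOperand, mirroring what helpers like splitBinaryVectorOp do but simplified; Op and DAG are assumed context:
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VT);
  auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
  auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
  SDValue Lo = DAG.getNode(Op.getOpcode(), DL, LoVT, Lo0, Lo1);
  SDValue Hi = DAG.getNode(Op.getOpcode(), DL, HiVT, Hi0, Hi1);
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);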
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
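A minimal illustrative example of combining getConstant with getNode inside a custom lowering hook (Op and DAG are assumed context):
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue One = DAG.getConstant(1, DL, VT);
  return DAG.getNode(ISD::ADD, DL, VT, Op, One);   // Op + 1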
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns the sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
MachineFunctionAnalysisManager * getMFAM()
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
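As an example, known-bits information is often used to prove alignment of a pointer-like value; a sketch, with Ptr and DAG assumed context:
  KnownBits Known = DAG.computeKnownBits(Ptr);
  bool AtLeastAlign4 = Known.countMinTrailingZeros() >= 2;   // low two bits known zero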
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
constexpr bool empty() const
empty - Check if the string is empty.
constexpr size_t size() const
size - Get the string size.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
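StringSwitch is what getRegisterByName-style lookups typically use; a small self-contained sketch in which the register names and return values are placeholders:
  #include "llvm/ADT/StringSwitch.h"

  static int lookupSpecialReg(llvm::StringRef Name) {
    return llvm::StringSwitch<int>(Name)
        .Case("m0", 0)
        .Case("exec", 1)
        .Default(-1);   // unknown name
  }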
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
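The typical pairing in a target's constructor looks like this sketch; the opcode and types are only illustrative:
  // e.g. treat a 16-bit AND as a 32-bit AND.
  setOperationAction(ISD::AND, MVT::i16, Promote);
  AddPromotedToType(ISD::AND, MVT::i16, MVT::i32);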
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
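In a custom LowerLOAD this expansion is usually returned as a merged value/chain pair, roughly as follows (LD and DAG are assumed context):
  std::pair<SDValue, SDValue> Expanded = expandUnalignedLoad(LD, DAG);
  // First element is the loaded value, second is the output chain.
  return DAG.getMergeValues({Expanded.first, Expanded.second}, SDLoc(LD));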
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op; at this point, we know that only the DemandedBits bits of the result of Op are ever used downstream.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
bool isIntegerTy() const
True if this is an instance of IntegerType.
LLVM_ABI const fltSemantics & getFltSemantics() const
bool isVoidTy() const
Return true if this is 'void'.
A Use represents the edge between a Value definition and its users.
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
LLVM_ABI void set(Value *Val)
User * getUser() const
Returns the User that contains this Use.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< use_iterator > uses()
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
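A short sketch of the Value/Use API listed above, assuming OldV and NewV are existing llvm::Value pointers of the same type (errs() comes from llvm/Support/raw_ostream.h):
NewV->takeName(OldV);            // transfer OldV's name to NewV
OldV->replaceAllUsesWith(NewV);  // every use of OldV now refers to NewV
for (Use &U : NewV->uses())      // each Use records the user and operand slot
  errs() << "operand #" << U.getOperandNo() << " of " << *U.getUser() << "\n";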
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
constexpr bool isZero() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
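A hedged example of the inline-literal query, assuming ST is the current GCNSubtarget:
int32_t Imm = 0x3f800000;  // bit pattern of 1.0f, which has an inline encoding
if (AMDGPU::isInlinableLiteral32(Imm, ST.hasInv2PiInlineImm())) {
  // The value can be encoded directly in the instruction; no literal slot needed.
}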
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
@ TowardZeroF32_TowardNegativeF64
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
@ C
The default llvm calling convention, compatible with C.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ PTRADD
PTRADD represents pointer arithmetic semantics, for targets that opt in using shouldPreservePtrArith(...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SET_ROUNDING
Set rounding mode.
@ CONVERGENCECTRL_GLUE
This does not correspond to any convergence control intrinsic.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readfixedcounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same for subtraction.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values, following the IEEE-754 definition...
@ SMULO
Same for multiplication.
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum, which are the same as FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
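The m_* entries above mix the IR-level PatternMatch and GlobalISel MIPatternMatch helpers; an IR-level sketch, assuming V and W are llvm::Value pointers:
using namespace llvm::PatternMatch;
Value *X = nullptr;
if (match(V, m_Add(m_Value(X), m_One()))) {
  // V is "add X, 1"; X is bound, so a later query can require the same value.
  if (match(W, m_Shl(m_Specific(X), m_One()))) {
    // W is "shl X, 1".
  }
}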
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
@ System
Synchronized with respect to all concurrently executing threads.
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
NodeAddr< NodeBase * > Node
friend class Instruction
Iterator for Instructions in a BasicBlock.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
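A quick sketch of the range-based predicates, assuming Ops is a SmallVector<SDValue, 8> of operands:
bool AllZero = llvm::all_of(Ops, [](SDValue V) { return llvm::isNullConstant(V); });
bool AnyFP = llvm::any_of(Ops, [](SDValue V) {
  return V.getValueType().isFloatingPoint();
});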
OuterAnalysisManagerProxy< ModuleAnalysisManager, MachineFunction > ModuleAnalysisManagerMachineFunctionProxy
Provide the ModuleAnalysisManager to Function proxy.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
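The casting utilities applied to a DAG node, as a sketch assuming N is an SDNode* and the llvm namespace is in scope:
if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
  uint64_t Imm = C->getZExtValue();  // safe: dyn_cast returned a non-null node
  (void)Imm;
} else if (isa<LoadSDNode>(N->getOperand(1))) {
  // Operand 1 is produced by a load instead.
}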
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
int countr_zero(T Val)
Count the number of 0s from the least significant bit to the most significant bit, stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count the number of 0s from the most significant bit to the least significant bit, stopping at the first 1.
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
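A small illustration of the bit-math helpers referenced above:
unsigned Size = 48;
bool Pow2 = llvm::isPowerOf2_32(Size);          // false
unsigned FloorLog2 = llvm::Log2_32(Size);       // 5
uint64_t RoundedUp = llvm::PowerOf2Ceil(Size);  // 64
int TrailingZeros = llvm::countr_zero(Size);    // 4
int SetBits = llvm::popcount(Size);             // 2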
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
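The immediate-range checks in one place, as a sketch:
bool FitsS16 = llvm::isInt<16>(-32768);  // true: -32768 is the smallest i16
bool FitsU12 = llvm::isUInt<12>(4096);   // false: the largest u12 is 4095
int64_t MinI8 = llvm::minIntN(8);        // -128
int64_t MaxI8 = llvm::maxIntN(8);        // 127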
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
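Splitting a 64-bit immediate into 32-bit halves, a pattern that recurs throughout the lowering code (Imm is an assumed constant):
uint64_t Imm = 0x123456789ABCDEF0ULL;
uint32_t Lo = llvm::Lo_32(Imm);  // 0x9ABCDEF0
uint32_t Hi = llvm::Hi_32(Imm);  // 0x12345678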
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
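A brief sketch of the alignment and mask helpers listed above:
uint64_t Padded = llvm::alignTo(100, llvm::Align(16));           // 112
llvm::Align Common = llvm::commonAlignment(llvm::Align(16), 8);  // Align(8)
uint64_t LowMask = llvm::maskTrailingOnes<uint64_t>(12);         // 0xFFF
bool Contiguous = llvm::isShiftedMask_64(0x0FF0);                // true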
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
@ CLUSTER_WORKGROUP_MAX_ID_X
@ CLUSTER_WORKGROUP_MAX_ID_Z
@ CLUSTER_WORKGROUP_MAX_FLAT_ID
@ CLUSTER_WORKGROUP_MAX_ID_Y
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
unsigned AtomicNoRetBaseOpcode
This struct is a compact representation of a valid (non-zero power of two) alignment.
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
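A sketch of the EVT helpers, assuming Ctx is an LLVMContext&:
EVT EltVT = EVT::getIntegerVT(Ctx, 32);                 // i32
EVT VecVT = EVT::getVectorVT(Ctx, EltVT, 4);            // v4i32
uint64_t Bits = VecVT.getSizeInBits().getFixedValue();  // 128
bool IsFP = VecVT.isFloatingPoint();                    // false
EVT Scalar = VecVT.getVectorElementType();              // i32 again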
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
EVT changeElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
void resetAll()
Resets the known state of all bits.
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
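A minimal KnownBits sketch: if the top halves of two 32-bit values are known zero, their sum fits in 17 bits.
KnownBits LHS(32), RHS(32);
LHS.Zero.setHighBits(16);                  // bits 31..16 of LHS are known zero
RHS.Zero.setHighBits(16);
KnownBits Sum = KnownBits::add(LHS, RHS);
unsigned LZ = Sum.countMinLeadingZeros();  // at least 15
KnownBits LowByte = Sum.extractBits(8, 0); // known bits of the low byte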
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
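A sketch of building pointer info for a stack slot, assuming MF is a MachineFunction& and FI an existing frame index:
MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
MachinePointerInfo HiHalf = PtrInfo.getWithOffset(8);  // same slot, 8 bytes in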
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const