44#include "llvm/IR/IntrinsicsAMDGPU.h"
45#include "llvm/IR/IntrinsicsR600.h"
56#define DEBUG_TYPE "si-lower"
62 cl::desc(
"Do not align and prefetch loops"),
66 "amdgpu-use-divergent-register-indexing",
cl::Hidden,
67 cl::desc(
"Use indirect register addressing for divergent indexes"),
// NOTE(review): extraction fragment — the embedded numbering jumps 82 -> 84, so the
// loop body's guard (original line 83) is not visible here. What the visible lines
// establish: iterate over every SGPR index in SGPR_32RegClass and, for some index,
// return the corresponding physical register (AMDGPU::SGPR0 + Reg). Presumably the
// missing line tests CCInfo allocation state so this returns the first *free* SGPR —
// confirm against the full file before relying on that.
81 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
82 for (
unsigned Reg = 0;
Reg < NumSGPRs; ++
Reg) {
84 return AMDGPU::SGPR0 +
Reg;
100 TRI->getDefaultVectorSuperClassForBitWidth(32);
106 TRI->getDefaultVectorSuperClassForBitWidth(64);
144 TRI->getDefaultVectorSuperClassForBitWidth(320));
148 TRI->getDefaultVectorSuperClassForBitWidth(352));
152 TRI->getDefaultVectorSuperClassForBitWidth(384));
156 TRI->getDefaultVectorSuperClassForBitWidth(512));
163 TRI->getDefaultVectorSuperClassForBitWidth(1024));
165 if (Subtarget->has16BitInsts()) {
166 if (Subtarget->useRealTrue16Insts()) {
196 TRI->getDefaultVectorSuperClassForBitWidth(1024));
209 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
210 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
211 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
212 MVT::i1, MVT::v32i32},
216 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
217 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
218 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
219 MVT::i1, MVT::v32i32},
287 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1},
Expand);
294 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
295 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
296 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
299 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
300 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
301 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
305 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
306 MVT::v3i16, MVT::v4i16, MVT::Other},
311 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64},
Expand);
327 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
328 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
329 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
330 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
331 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
332 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
333 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
334 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
366 for (
MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
380 for (
MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
394 for (
MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
408 for (
MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
422 for (
MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
437 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
438 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
441 if (Subtarget->hasPkMovB32()) {
462 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
463 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
468 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32},
Custom);
472 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
473 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
474 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
475 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
499 if (Subtarget->hasSMemRealTime() ||
504 if (Subtarget->has16BitInsts()) {
511 if (Subtarget->hasMadMacF32Insts())
528 if (Subtarget->hasIntClamp())
531 if (Subtarget->hasAddNoCarry())
537 {MVT::f32, MVT::f64},
Custom);
543 {MVT::f32, MVT::f64},
Legal);
545 if (Subtarget->haveRoundOpsF64())
568 if (Subtarget->has16BitInsts()) {
621 if (Subtarget->hasBF16TransInsts())
640 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
641 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
642 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
775 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
776 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
777 MVT::v32f16, MVT::v32bf16},
787 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
791 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
795 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
796 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
804 if (Subtarget->hasVOP3PInsts()) {
815 {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
Custom);
818 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
819 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
820 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
823 for (
MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
831 for (
MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
838 {MVT::v2f16, MVT::v4f16},
Custom);
844 if (Subtarget->hasBF16PackedInsts()) {
845 for (
MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
851 if (Subtarget->hasPackedFP32Ops()) {
855 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
862 if (Subtarget->has16BitInsts()) {
875 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
876 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
877 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
878 MVT::v32f16, MVT::v32bf16},
883 if (Subtarget->hasVectorMulU64())
885 else if (Subtarget->hasScalarSMulU64())
888 if (Subtarget->hasMad64_32())
891 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
894 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
896 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16},
Legal);
899 if (Subtarget->hasMinimum3Maximum3F32())
902 if (Subtarget->hasMinimum3Maximum3PKF16()) {
906 if (!Subtarget->hasMinimum3Maximum3F16())
911 if (Subtarget->hasVOP3PInsts()) {
914 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
918 if (Subtarget->hasIntMinMax64())
923 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
924 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
929 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
930 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
931 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
932 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
936 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
937 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
938 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
939 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
954 if (Subtarget->hasBF16ConversionInsts()) {
959 if (Subtarget->hasBF16PackedInsts()) {
965 if (Subtarget->hasBF16TransInsts()) {
969 if (Subtarget->hasCvtPkF16F32Inst()) {
971 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
1021 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1062 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1075 EVT DestVT,
EVT SrcVT)
const {
1077 ((((Opcode ==
ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1078 (Opcode ==
ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1080 (Opcode ==
ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
// NOTE(review): GlobalISel overload of the mixed-precision fpext-fold query.
// Grounded in the visible lines: folding an fpext into the operation is allowed only
// for G_FMAD when the subtarget has the v_mad_mix instructions, or G_FMA when it has
// the v_fma_mix instructions, and only when the source scalar type is 16 bits wide.
// The trailing `&&` shows at least one more conjunct (original line 1090 is missing
// from this extraction — presumably a 32-bit DestTy check; confirm in the full file).
1087 LLT DestTy,
LLT SrcTy)
const {
1088 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1089 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1091 SrcTy.getScalarSizeInBits() == 16 &&
1112 if (Subtarget->has16BitInsts())
1114 return VT.
isInteger() ? MVT::i32 : MVT::f32;
1118 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1140 if (
Size == 16 && Subtarget->has16BitInsts())
1141 return (NumElts + 1) / 2;
1147 return NumElts * ((
Size + 31) / 32);
1156 unsigned &NumIntermediates,
MVT &RegisterVT)
const {
// NOTE(review): interior of the calling-convention vector type breakdown. Several
// branch conditions (original lines 1165, 1169-1172, 1176-1179, 1184-1186, 1191-1193)
// are missing from this extraction, so which scalar sizes select each cluster below
// is partly inferred — confirm against the full file.
// Visible: 16-bit elements with 16-bit instructions are packed two-per-register
// (NumIntermediates rounds element count up to a multiple of two).
1164 if (
Size == 16 && Subtarget->has16BitInsts()) {
1166 IntermediateVT = RegisterVT;
1167 NumIntermediates = (NumElts + 1) / 2;
1168 return NumIntermediates;
// One intermediate per element, same register type (condition line not visible).
1173 IntermediateVT = RegisterVT;
1174 NumIntermediates = NumElts;
1175 return NumIntermediates;
// Sub-16-bit path (presumably): each element is widened into an i16 register.
1180 RegisterVT = MVT::i16;
1181 IntermediateVT = ScalarVT;
1182 NumIntermediates = NumElts;
1183 return NumIntermediates;
// Small-scalar path (presumably <= 32 bit): each element gets one i32 register.
1187 RegisterVT = MVT::i32;
1188 IntermediateVT = ScalarVT;
1189 NumIntermediates = NumElts;
1190 return NumIntermediates;
// Wide-scalar fallback: split each element into ceil(Size/32) i32 registers.
1194 RegisterVT = MVT::i32;
1195 IntermediateVT = RegisterVT;
1196 NumIntermediates = NumElts * ((
Size + 31) / 32);
1197 return NumIntermediates;
1202 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1207 unsigned MaxNumLanes) {
1208 assert(MaxNumLanes != 0);
1212 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1223 unsigned MaxNumLanes) {
1229 assert(ST->getNumContainedTypes() == 2 &&
1230 ST->getContainedType(1)->isIntegerTy(32));
1244 return MVT::amdgpuBufferFatPointer;
1246 DL.getPointerSizeInBits(AS) == 192)
1247 return MVT::amdgpuBufferStridedPointer;
1256 DL.getPointerSizeInBits(AS) == 160) ||
1258 DL.getPointerSizeInBits(AS) == 192))
1265 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1266 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1267 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1269 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1270 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1271 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1272 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1273 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1275 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1276 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1277 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1278 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1279 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1281 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1282 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1283 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1284 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1285 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1324 unsigned IntrID)
const {
1326 if (CI.
hasMetadata(LLVMContext::MD_invariant_load))
1344 if (RsrcIntr->IsImage) {
1359 Info.ptrVal = RsrcArg;
1362 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1371 if (RsrcIntr->IsImage) {
1372 unsigned MaxNumLanes = 4;
1387 std::numeric_limits<unsigned>::max());
1397 if (RsrcIntr->IsImage) {
1418 if ((RsrcIntr->IsImage && BaseOpcode->
NoReturn) || IsSPrefetch) {
1420 Info.memVT = MVT::i32;
1427 case Intrinsic::amdgcn_raw_buffer_load_lds:
1428 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1429 case Intrinsic::amdgcn_struct_buffer_load_lds:
1430 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1436 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1437 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1438 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1439 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1442 std::numeric_limits<unsigned>::max());
1452 case Intrinsic::amdgcn_ds_ordered_add:
1453 case Intrinsic::amdgcn_ds_ordered_swap: {
1466 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1467 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1470 Info.ptrVal =
nullptr;
1475 case Intrinsic::amdgcn_ds_append:
1476 case Intrinsic::amdgcn_ds_consume: {
1489 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1490 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1491 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1496 Info.memVT = MVT::i64;
1502 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1503 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1504 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1507 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1510 ->getElementType(0));
1518 case Intrinsic::amdgcn_global_atomic_fmin_num:
1519 case Intrinsic::amdgcn_global_atomic_fmax_num:
1520 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1521 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1522 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
1532 case Intrinsic::amdgcn_flat_load_monitor_b32:
1533 case Intrinsic::amdgcn_flat_load_monitor_b64:
1534 case Intrinsic::amdgcn_flat_load_monitor_b128:
1535 case Intrinsic::amdgcn_global_load_monitor_b32:
1536 case Intrinsic::amdgcn_global_load_monitor_b64:
1537 case Intrinsic::amdgcn_global_load_monitor_b128:
1538 case Intrinsic::amdgcn_cluster_load_b32:
1539 case Intrinsic::amdgcn_cluster_load_b64:
1540 case Intrinsic::amdgcn_cluster_load_b128:
1541 case Intrinsic::amdgcn_ds_load_tr6_b96:
1542 case Intrinsic::amdgcn_ds_load_tr4_b64:
1543 case Intrinsic::amdgcn_ds_load_tr8_b64:
1544 case Intrinsic::amdgcn_ds_load_tr16_b128:
1545 case Intrinsic::amdgcn_global_load_tr6_b96:
1546 case Intrinsic::amdgcn_global_load_tr4_b64:
1547 case Intrinsic::amdgcn_global_load_tr_b64:
1548 case Intrinsic::amdgcn_global_load_tr_b128:
1549 case Intrinsic::amdgcn_ds_read_tr4_b64:
1550 case Intrinsic::amdgcn_ds_read_tr6_b96:
1551 case Intrinsic::amdgcn_ds_read_tr8_b64:
1552 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1560 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1561 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1562 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1570 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1571 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1572 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
// NOTE(review): GWS (global wave sync) intrinsic cluster inside getTgtMemIntrinsic.
// Visible lines show the whole family is modeled as a 4-byte (MVT::i32), 4-aligned
// memory access. Lines 1586-1594, 1596, 1598 and the body of the gws_barrier
// special case (line 1600+) are missing from this extraction — the flags set for
// these intrinsics, and what extra the barrier variant gets, cannot be stated from
// here; confirm against the full file.
1580 case Intrinsic::amdgcn_ds_gws_init:
1581 case Intrinsic::amdgcn_ds_gws_barrier:
1582 case Intrinsic::amdgcn_ds_gws_sema_v:
1583 case Intrinsic::amdgcn_ds_gws_sema_br:
1584 case Intrinsic::amdgcn_ds_gws_sema_p:
1585 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1595 Info.memVT = MVT::i32;
1597 Info.align =
Align(4);
1599 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1605 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1606 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1607 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1608 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1609 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1610 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1611 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1612 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1619 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1620 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1621 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1622 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1629 case Intrinsic::amdgcn_load_to_lds:
1630 case Intrinsic::amdgcn_global_load_lds: {
1641 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1642 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1643 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1644 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1654 Info.memVT = MVT::i32;
1656 Info.align =
Align(4);
1661 case Intrinsic::amdgcn_s_prefetch_data:
1662 case Intrinsic::amdgcn_flat_prefetch:
1663 case Intrinsic::amdgcn_global_prefetch: {
1678 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1681 unsigned SrcAS =
I.getOperand(0)->getType()->getPointerAddressSpace();
1682 unsigned DstAS =
I.getType()->getPointerAddressSpace();
1694 Type *&AccessTy)
const {
// NOTE(review): addressing-mode argument extraction for AMDGPU memory intrinsics:
// pick out the pointer operand (and report the access type) so generic addressing
// mode matching can reason about the intrinsic's memory operand. This extraction
// drops lines 1734 and 1746-1749 (presumably break/default handling) — confirm
// fall-through structure against the full file.
1695 Value *Ptr =
nullptr;
1696 switch (
II->getIntrinsicID()) {
// For this load/atomic-style group the memory pointer is operand 0
// (established by the shared `Ptr = II->getArgOperand(0)` below).
1697 case Intrinsic::amdgcn_cluster_load_b128:
1698 case Intrinsic::amdgcn_cluster_load_b64:
1699 case Intrinsic::amdgcn_cluster_load_b32:
1700 case Intrinsic::amdgcn_ds_append:
1701 case Intrinsic::amdgcn_ds_consume:
1702 case Intrinsic::amdgcn_ds_load_tr8_b64:
1703 case Intrinsic::amdgcn_ds_load_tr16_b128:
1704 case Intrinsic::amdgcn_ds_load_tr4_b64:
1705 case Intrinsic::amdgcn_ds_load_tr6_b96:
1706 case Intrinsic::amdgcn_ds_read_tr4_b64:
1707 case Intrinsic::amdgcn_ds_read_tr6_b96:
1708 case Intrinsic::amdgcn_ds_read_tr8_b64:
1709 case Intrinsic::amdgcn_ds_read_tr16_b64:
1710 case Intrinsic::amdgcn_ds_ordered_add:
1711 case Intrinsic::amdgcn_ds_ordered_swap:
1712 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1713 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1714 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1715 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1716 case Intrinsic::amdgcn_flat_load_monitor_b128:
1717 case Intrinsic::amdgcn_flat_load_monitor_b32:
1718 case Intrinsic::amdgcn_flat_load_monitor_b64:
1719 case Intrinsic::amdgcn_global_atomic_fmax_num:
1720 case Intrinsic::amdgcn_global_atomic_fmin_num:
1721 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1722 case Intrinsic::amdgcn_global_load_monitor_b128:
1723 case Intrinsic::amdgcn_global_load_monitor_b32:
1724 case Intrinsic::amdgcn_global_load_monitor_b64:
1725 case Intrinsic::amdgcn_global_load_tr_b64:
1726 case Intrinsic::amdgcn_global_load_tr_b128:
1727 case Intrinsic::amdgcn_global_load_tr4_b64:
1728 case Intrinsic::amdgcn_global_load_tr6_b96:
1729 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1730 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1731 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1732 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1733 Ptr =
II->getArgOperand(0);
// For the *load_lds / *load_async_to_lds variants the pointer of interest is
// operand 1 — presumably because operand 0 is the LDS destination pointer;
// confirm against the intrinsic definitions in IntrinsicsAMDGPU.td.
1735 case Intrinsic::amdgcn_load_to_lds:
1736 case Intrinsic::amdgcn_global_load_lds:
1737 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1738 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1739 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1740 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1741 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1742 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1743 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1744 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1745 Ptr =
II->getArgOperand(1);
// The access type reported to the caller is the intrinsic call's result type.
1750 AccessTy =
II->getType();
unsigned AddrSpace)
const {
// NOTE(review): the body of this `if` (handling subtargets without FLAT instruction
// offsets, original lines 1758-1767) is missing from this extraction.
1757 if (!Subtarget->hasFlatInstOffsets()) {
// Grounded in the visible return: FLAT addressing never supports a scaled index
// register (Scale must be 0), and an immediate offset is legal only when it is
// zero or the instruction info accepts it as a FLAT offset for this address
// space / flat variant.
1768 return AM.
Scale == 0 &&
1769 (AM.
BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1770 AM.
BaseOffs, AddrSpace, FlatVariant));
1774 if (Subtarget->hasFlatGlobalInsts())
1777 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1790 return isLegalMUBUFAddressingMode(AM);
1793bool SITargetLowering::isLegalMUBUFAddressingMode(
const AddrMode &AM)
const {
1804 if (!
TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1816 if (AM.HasBaseReg) {
1848 return isLegalMUBUFAddressingMode(AM);
1850 if (!Subtarget->hasScalarSubwordLoads()) {
1855 if (Ty->isSized() &&
DL.getTypeStoreSize(Ty) < 4)
1903 return Subtarget->enableFlatScratch()
1905 : isLegalMUBUFAddressingMode(AM);
1952 unsigned Size,
unsigned AddrSpace,
Align Alignment,
1961 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment <
Align(4))
1964 Align RequiredAlignment(
1966 if (Subtarget->hasLDSMisalignedBug() &&
Size > 32 &&
1967 Alignment < RequiredAlignment)
1982 if (!Subtarget->hasUsableDSOffset() && Alignment <
Align(8))
1988 RequiredAlignment =
Align(4);
1990 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2006 *IsFast = (Alignment >= RequiredAlignment) ? 64
2007 : (Alignment <
Align(4)) ? 32
2014 if (!Subtarget->hasDS96AndDS128())
2020 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2029 *IsFast = (Alignment >= RequiredAlignment) ? 96
2030 : (Alignment <
Align(4)) ? 32
2037 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2043 RequiredAlignment =
Align(8);
2045 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2054 *IsFast = (Alignment >= RequiredAlignment) ? 128
2055 : (Alignment <
Align(4)) ? 32
2072 *IsFast = (Alignment >= RequiredAlignment) ?
Size : 0;
2074 return Alignment >= RequiredAlignment ||
2075 Subtarget->hasUnalignedDSAccessEnabled();
2083 bool AlignedBy4 = Alignment >=
Align(4);
2084 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2086 *IsFast = AlignedBy4 ?
Size : 1;
2091 *IsFast = AlignedBy4;
2102 return Alignment >=
Align(4) ||
2103 Subtarget->hasUnalignedBufferAccessEnabled();
2115 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2130 return Size >= 32 && Alignment >=
Align(4);
2135 unsigned *IsFast)
const {
2137 Alignment, Flags, IsFast);
2142 const AttributeList &FuncAttributes)
const {
2148 if (
Op.size() >= 16 &&
2152 if (
Op.size() >= 8 &&
Op.isDstAligned(
Align(4)))
2170 unsigned DestAS)
const {
2173 Subtarget->hasGloballyAddressableScratch()) {
2203 unsigned Index)
const {
2219 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2254 auto [InputPtrReg, RC, ArgTy] =
2264 Chain, SL,
MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2270 const SDLoc &SL)
const {
2277 const SDLoc &SL)
const {
2280 std::optional<uint32_t> KnownSize =
2282 if (KnownSize.has_value())
2308 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2317SDValue SITargetLowering::lowerKernargMemParameter(
2322 MachinePointerInfo PtrInfo =
2331 int64_t OffsetDiff =
Offset - AlignDownOffset;
2337 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2348 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal,
Signed, Arg);
2353 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain,
Offset);
2358 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load,
Signed, Arg);
2367 const SDLoc &SL)
const {
2436 ExtType, SL, VA.
getLocVT(), Chain, FIN,
2439 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2440 if (ConvertedVal == ArgValue)
2441 return ConvertedVal;
2446SDValue SITargetLowering::lowerWorkGroupId(
2451 if (!Subtarget->hasClusters())
2452 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2460 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2461 SDLoc SL(ClusterIdXYZ);
2462 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2465 SDValue ClusterWorkGroupIdXYZ =
2466 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2476 return ClusterIdXYZ;
2478 using namespace AMDGPU::Hwreg;
2482 DAG.
getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2493SDValue SITargetLowering::getPreloadedValue(
2496 const ArgDescriptor *
Reg =
nullptr;
2497 const TargetRegisterClass *RC;
2501 const ArgDescriptor WorkGroupIDX =
2509 const ArgDescriptor WorkGroupIDZ =
2511 const ArgDescriptor ClusterWorkGroupIDX =
2513 const ArgDescriptor ClusterWorkGroupIDY =
2515 const ArgDescriptor ClusterWorkGroupIDZ =
2517 const ArgDescriptor ClusterWorkGroupMaxIDX =
2519 const ArgDescriptor ClusterWorkGroupMaxIDY =
2521 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2523 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2526 auto LoadConstant = [&](
unsigned N) {
2530 if (Subtarget->hasArchitectedSGPRs() &&
2537 Reg = &WorkGroupIDX;
2538 RC = &AMDGPU::SReg_32RegClass;
2542 Reg = &WorkGroupIDY;
2543 RC = &AMDGPU::SReg_32RegClass;
2547 Reg = &WorkGroupIDZ;
2548 RC = &AMDGPU::SReg_32RegClass;
2552 if (HasFixedDims && ClusterDims.
getDims()[0] == 1)
2553 return LoadConstant(0);
2554 Reg = &ClusterWorkGroupIDX;
2555 RC = &AMDGPU::SReg_32RegClass;
2559 if (HasFixedDims && ClusterDims.
getDims()[1] == 1)
2560 return LoadConstant(0);
2561 Reg = &ClusterWorkGroupIDY;
2562 RC = &AMDGPU::SReg_32RegClass;
2566 if (HasFixedDims && ClusterDims.
getDims()[2] == 1)
2567 return LoadConstant(0);
2568 Reg = &ClusterWorkGroupIDZ;
2569 RC = &AMDGPU::SReg_32RegClass;
2574 return LoadConstant(ClusterDims.
getDims()[0] - 1);
2575 Reg = &ClusterWorkGroupMaxIDX;
2576 RC = &AMDGPU::SReg_32RegClass;
2581 return LoadConstant(ClusterDims.
getDims()[1] - 1);
2582 Reg = &ClusterWorkGroupMaxIDY;
2583 RC = &AMDGPU::SReg_32RegClass;
2588 return LoadConstant(ClusterDims.
getDims()[2] - 1);
2589 Reg = &ClusterWorkGroupMaxIDZ;
2590 RC = &AMDGPU::SReg_32RegClass;
2594 Reg = &ClusterWorkGroupMaxFlatID;
2595 RC = &AMDGPU::SReg_32RegClass;
2626 for (
unsigned I = 0,
E = Ins.
size(), PSInputNum = 0;
I !=
E; ++
I) {
2630 "vector type argument should have been split");
2635 bool SkipArg = !Arg->
Used && !
Info->isPSInputAllocated(PSInputNum);
2643 "unexpected vector split in ps argument type");
2657 Info->markPSInputAllocated(PSInputNum);
2659 Info->markPSInputEnabled(PSInputNum);
2675 if (Info.hasWorkItemIDX()) {
2681 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2685 if (Info.hasWorkItemIDY()) {
2686 assert(Info.hasWorkItemIDX());
2687 if (Subtarget->hasPackedTID()) {
2688 Info.setWorkItemIDY(
2691 unsigned Reg = AMDGPU::VGPR1;
2699 if (Info.hasWorkItemIDZ()) {
2700 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2701 if (Subtarget->hasPackedTID()) {
2702 Info.setWorkItemIDZ(
2705 unsigned Reg = AMDGPU::VGPR2;
2725 if (RegIdx == ArgVGPRs.
size()) {
2732 unsigned Reg = ArgVGPRs[RegIdx];
2744 unsigned NumArgRegs) {
2747 if (RegIdx == ArgSGPRs.
size())
2750 unsigned Reg = ArgSGPRs[RegIdx];
2792 const unsigned Mask = 0x3ff;
2795 if (Info.hasWorkItemIDX()) {
2797 Info.setWorkItemIDX(Arg);
2800 if (Info.hasWorkItemIDY()) {
2802 Info.setWorkItemIDY(Arg);
2805 if (Info.hasWorkItemIDZ())
2817 const unsigned Mask = 0x3ff;
2826 auto &
ArgInfo = Info.getArgInfo();
2838 if (Info.hasImplicitArgPtr())
2846 if (Info.hasWorkGroupIDX())
2849 if (Info.hasWorkGroupIDY())
2852 if (Info.hasWorkGroupIDZ())
2855 if (Info.hasLDSKernelId())
2866 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(
TRI);
2867 MF.
addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2873 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(
TRI);
2874 MF.
addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2879 Register DispatchPtrReg = Info.addDispatchPtr(
TRI);
2880 MF.
addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2886 MF.
addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2892 Register InputPtrReg = Info.addKernargSegmentPtr(
TRI);
2901 MF.
addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2906 Register FlatScratchInitReg = Info.addFlatScratchInit(
TRI);
2907 MF.
addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2912 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(
TRI);
2913 MF.
addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2928 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2930 bool InPreloadSequence =
true;
2932 bool AlignedForImplictArgs =
false;
2933 unsigned ImplicitArgOffset = 0;
2934 for (
auto &Arg :
F.args()) {
2935 if (!InPreloadSequence || !Arg.hasInRegAttr())
2938 unsigned ArgIdx = Arg.getArgNo();
2941 if (InIdx < Ins.
size() &&
2942 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2945 for (; InIdx < Ins.
size() && Ins[InIdx].isOrigArg() &&
2946 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2948 assert(ArgLocs[ArgIdx].isMemLoc());
2949 auto &ArgLoc = ArgLocs[InIdx];
2951 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2953 unsigned NumAllocSGPRs =
2954 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2957 if (Arg.hasAttribute(
"amdgpu-hidden-argument")) {
2958 if (!AlignedForImplictArgs) {
2960 alignTo(LastExplicitArgOffset,
2961 Subtarget->getAlignmentForImplicitArgPtr()) -
2962 LastExplicitArgOffset;
2963 AlignedForImplictArgs =
true;
2965 ArgOffset += ImplicitArgOffset;
2969 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2970 assert(InIdx >= 1 &&
"No previous SGPR");
2971 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2972 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2976 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2977 unsigned PaddingSGPRs =
alignTo(Padding, 4) / 4;
2980 InPreloadSequence =
false;
2986 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2988 Info.addPreloadedKernArg(
TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2990 if (PreloadRegs->
size() > 1)
2991 RC = &AMDGPU::SGPR_32RegClass;
2992 for (
auto &Reg : *PreloadRegs) {
2998 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3007 if (Info.hasLDSKernelId()) {
3008 Register Reg = Info.addLDSKernelId();
3009 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3018 bool IsShader)
const {
3019 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3020 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
3026 assert(!HasArchitectedSGPRs &&
"Unhandled feature for the subtarget");
3028 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3032 unsigned NumRequiredSystemSGPRs =
3033 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3034 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3035 for (
unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3036 Register Reg = Info.addReservedUserSGPR();
3037 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3042 if (!HasArchitectedSGPRs) {
3043 if (Info.hasWorkGroupIDX()) {
3044 Register Reg = Info.addWorkGroupIDX();
3045 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3049 if (Info.hasWorkGroupIDY()) {
3050 Register Reg = Info.addWorkGroupIDY();
3051 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3055 if (Info.hasWorkGroupIDZ()) {
3056 Register Reg = Info.addWorkGroupIDZ();
3057 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3062 if (Info.hasWorkGroupInfo()) {
3063 Register Reg = Info.addWorkGroupInfo();
3064 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3068 if (Info.hasPrivateSegmentWaveByteOffset()) {
3070 unsigned PrivateSegmentWaveByteOffsetReg;
3073 PrivateSegmentWaveByteOffsetReg =
3074 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3078 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3080 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3083 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3085 MF.
addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3086 CCInfo.
AllocateReg(PrivateSegmentWaveByteOffsetReg);
3089 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
3090 Info.getNumPreloadedSGPRs() >= 16);
3105 if (HasStackObjects)
3106 Info.setHasNonSpillStackObjects(
true);
3111 HasStackObjects =
true;
3115 bool RequiresStackAccess = HasStackObjects || MFI.
hasCalls();
3117 if (!ST.enableFlatScratch()) {
3118 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.
getFunction())) {
3125 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3127 unsigned ReservedBufferReg =
TRI.reservedPrivateSegmentBufferReg(MF);
3137 Info.setScratchRSrcReg(ReservedBufferReg);
3156 if (!
MRI.isLiveIn(AMDGPU::SGPR32)) {
3157 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3164 for (
unsigned Reg : AMDGPU::SGPR_32RegClass) {
3165 if (!
MRI.isLiveIn(
Reg)) {
3166 Info.setStackPtrOffsetReg(
Reg);
3171 if (
Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3178 if (ST.getFrameLowering()->hasFP(MF)) {
3179 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3195 const MCPhysReg *IStart =
TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3204 if (AMDGPU::SReg_64RegClass.
contains(*
I))
3205 RC = &AMDGPU::SGPR_64RegClass;
3206 else if (AMDGPU::SReg_32RegClass.
contains(*
I))
3207 RC = &AMDGPU::SGPR_32RegClass;
3213 Entry->addLiveIn(*
I);
3218 for (
auto *Exit : Exits)
3220 TII->get(TargetOpcode::COPY), *
I)
3235 bool IsError =
false;
3239 Fn,
"unsupported non-compute shaders with HSA",
DL.getDebugLoc()));
3257 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3258 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3260 if (!Subtarget->enableFlatScratch())
3265 !Subtarget->hasArchitectedSGPRs())
3266 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3267 !Info->hasWorkGroupIDZ());
3270 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3288 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3289 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3292 Info->markPSInputAllocated(0);
3293 Info->markPSInputEnabled(0);
3295 if (Subtarget->isAmdPalOS()) {
3304 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3305 if ((PsInputBits & 0x7F) == 0 ||
3306 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3309 }
else if (IsKernel) {
3310 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3322 if (IsKernel && Subtarget->hasKernargPreload())
3326 }
else if (!IsGraphics) {
3331 if (!Subtarget->enableFlatScratch())
3343 Info->setNumWaveDispatchSGPRs(
3345 Info->setNumWaveDispatchVGPRs(
3347 }
else if (Info->getNumKernargPreloadedSGPRs()) {
3348 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3353 if (IsWholeWaveFunc) {
3355 {MVT::i1, MVT::Other}, Chain);
3367 for (
unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.
size(), ArgIdx = 0; i != e;
3378 if (IsEntryFunc && VA.
isMemLoc()) {
3401 if (Arg.
isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3405 int64_t OffsetDiff =
Offset - AlignDownOffset;
3412 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3423 NewArg = convertArgType(DAG, VT, MemVT,
DL, ArgVal,
3424 Ins[i].Flags.isSExt(), &Ins[i]);
3432 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3435 if (PreloadRegs.
size() == 1) {
3436 Register VReg =
MRI.getLiveInVirtReg(PreloadRegs[0]);
3441 TRI->getRegSizeInBits(*RC)));
3449 for (
auto Reg : PreloadRegs) {
3456 PreloadRegs.size()),
3473 NewArg = convertArgType(DAG, VT, MemVT,
DL, NewArg,
3474 Ins[i].Flags.isSExt(), &Ins[i]);
3486 "hidden argument in kernel signature was not preloaded",
3492 lowerKernargMemParameter(DAG, VT, MemVT,
DL, Chain,
Offset,
3493 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3513 if (!IsEntryFunc && VA.
isMemLoc()) {
3514 SDValue Val = lowerStackParameter(DAG, VA,
DL, Chain, Arg);
3525 if (AMDGPU::VGPR_32RegClass.
contains(Reg))
3526 RC = &AMDGPU::VGPR_32RegClass;
3527 else if (AMDGPU::SGPR_32RegClass.
contains(Reg))
3528 RC = &AMDGPU::SGPR_32RegClass;
3548 Val = convertABITypeToValueType(DAG, Val, VA,
DL);
3557 auto &ArgUsageInfo =
3560 }
else if (
auto *MFAM = DAG.
getMFAM()) {
3562 auto *ArgUsageInfo =
3564 .getCachedResult<AMDGPUArgumentUsageAnalysis>(M);
3566 ArgUsageInfo->setFuncArgInfo(Fn, Info->getArgInfo());
3570 Info->setBytesInStackArgArea(StackArgSize);
3572 return Chains.
empty() ? Chain
3581 const Type *RetTy)
const {
3589 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3594 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3595 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3596 for (
unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3597 if (CCInfo.
isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3620 Info->setIfReturnsVoid(Outs.
empty());
3621 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3640 for (
unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.
size();
I != E;
3641 ++
I, ++RealRVLocIdx) {
3645 SDValue Arg = OutVals[RealRVLocIdx];
3668 ReadFirstLane, Arg);
3675 if (!Info->isEntryFunction()) {
3681 if (AMDGPU::SReg_64RegClass.
contains(*
I))
3683 else if (AMDGPU::SReg_32RegClass.
contains(*
I))
3696 unsigned Opc = AMDGPUISD::ENDPGM;
3698 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3699 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3700 : AMDGPUISD::RET_GLUE;
3782 auto &ArgUsageInfo =
3785 &ArgUsageInfo.getArgUsageInfo().lookupFuncArgInfo(*CalleeFunc);
3786 }
else if (
auto *MFAM = DAG.
getMFAM()) {
3788 auto *ArgUsageInfo =
3793 CalleeArgInfo = &ArgUsageInfo->lookupFuncArgInfo(*CalleeFunc);
3821 const auto [OutgoingArg, ArgRC, ArgTy] =
3826 const auto [IncomingArg, IncomingArgRC, Ty] =
3828 assert(IncomingArgRC == ArgRC);
3831 EVT ArgVT =
TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3839 InputReg = getImplicitArgPtr(DAG,
DL);
3841 std::optional<uint32_t> Id =
3843 if (Id.has_value()) {
3854 if (OutgoingArg->isRegister()) {
3855 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3856 if (!CCInfo.
AllocateReg(OutgoingArg->getRegister()))
3859 unsigned SpecialArgOffset =
3870 auto [OutgoingArg, ArgRC, Ty] =
3873 std::tie(OutgoingArg, ArgRC, Ty) =
3876 std::tie(OutgoingArg, ArgRC, Ty) =
3891 const bool NeedWorkItemIDX = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-x");
3892 const bool NeedWorkItemIDY = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-y");
3893 const bool NeedWorkItemIDZ = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-z");
3898 if (Subtarget->getMaxWorkitemID(
F, 0) != 0) {
3906 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(
F, 1) != 0) {
3916 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(
F, 2) != 0) {
3925 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3926 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3937 : IncomingArgY ? *IncomingArgY
3944 if (OutgoingArg->isRegister()) {
3946 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3972 if (Callee->isDivergent())
3979 const uint32_t *CallerPreserved =
TRI->getCallPreservedMask(MF, CallerCC);
3983 if (!CallerPreserved)
3986 bool CCMatch = CallerCC == CalleeCC;
3999 if (Arg.hasByValAttr())
4013 const uint32_t *CalleePreserved =
TRI->getCallPreservedMask(MF, CalleeCC);
4014 if (!
TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4023 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4036 for (
const auto &[CCVA, ArgVal] :
zip_equal(ArgLocs, OutVals)) {
4038 if (!CCVA.isRegLoc())
4043 if (ArgVal->
isDivergent() &&
TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4045 dbgs() <<
"Cannot tail call due to divergent outgoing argument in "
4069enum ChainCallArgIdx {
4091 bool UsesDynamicVGPRs =
false;
4092 if (IsChainCallConv) {
4097 auto RequestedExecIt =
4099 return Arg.OrigArgIndex == 2;
4101 assert(RequestedExecIt != CLI.
Outs.end() &&
"No node for EXEC");
4103 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.
Outs.begin();
4106 CLI.
Outs.erase(RequestedExecIt, CLI.
Outs.end());
4109 "Haven't popped all the special args");
4112 CLI.
Args[ChainCallArgIdx::Exec];
4113 if (!RequestedExecArg.
Ty->
isIntegerTy(Subtarget->getWavefrontSize()))
4121 ArgNode->getAPIntValue(),
DL, ArgNode->getValueType(0)));
4123 ChainCallSpecialArgs.
push_back(Arg.Node);
4126 PushNodeOrTargetConstant(RequestedExecArg);
4132 if (FlagsValue.
isZero()) {
4133 if (CLI.
Args.size() > ChainCallArgIdx::Flags + 1)
4135 "no additional args allowed if flags == 0");
4137 if (CLI.
Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4141 if (!Subtarget->isWave32()) {
4143 CLI, InVals,
"dynamic VGPR mode is only supported for wave32");
4146 UsesDynamicVGPRs =
true;
4147 std::for_each(CLI.
Args.begin() + ChainCallArgIdx::NumVGPRs,
4148 CLI.
Args.end(), PushNodeOrTargetConstant);
4157 bool IsSibCall =
false;
4171 "unsupported call to variadic function ");
4179 "unsupported required tail call to function ");
4184 Outs, OutVals, Ins, DAG);
4188 "site marked musttail or on llvm.amdgcn.cs.chain");
4195 if (!TailCallOpt && IsTailCall)
4235 auto *
TRI = Subtarget->getRegisterInfo();
4242 if (!IsSibCall || IsChainCallConv) {
4243 if (!Subtarget->enableFlatScratch()) {
4249 RegsToPass.emplace_back(IsChainCallConv
4250 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4251 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4258 const unsigned NumSpecialInputs = RegsToPass.size();
4260 MVT PtrVT = MVT::i32;
4263 for (
unsigned i = 0, e = ArgLocs.
size(); i != e; ++i) {
4291 RegsToPass.push_back(std::pair(VA.
getLocReg(), Arg));
4299 int32_t
Offset = LocMemOffset;
4306 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4312 ? Flags.getNonZeroByValAlign()
4339 if (Outs[i].Flags.isByVal()) {
4341 DAG.
getConstant(Outs[i].Flags.getByValSize(),
DL, MVT::i32);
4344 Outs[i].Flags.getNonZeroByValAlign(),
4346 nullptr, std::nullopt, DstInfo,
4352 DAG.
getStore(Chain,
DL, Arg, DstAddr, DstInfo, Alignment);
4358 if (!MemOpChains.
empty())
4374 unsigned ArgIdx = 0;
4375 for (
auto [Reg, Val] : RegsToPass) {
4376 if (ArgIdx++ >= NumSpecialInputs &&
4377 (IsChainCallConv || !Val->
isDivergent()) &&
TRI->isSGPRPhysReg(Reg)) {
4403 if (IsTailCall && !IsSibCall) {
4408 std::vector<SDValue>
Ops({Chain});
4414 Ops.push_back(Callee);
4431 Ops.push_back(Callee);
4442 if (IsChainCallConv)
4447 for (
auto &[Reg, Val] : RegsToPass)
4451 const uint32_t *Mask =
TRI->getCallPreservedMask(MF, CallConv);
4452 assert(Mask &&
"Missing call preserved mask for calling convention");
4462 MVT::Glue, GlueOps),
4467 Ops.push_back(InGlue);
4473 unsigned OPC = AMDGPUISD::TC_RETURN;
4476 OPC = AMDGPUISD::TC_RETURN_GFX;
4480 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4481 : AMDGPUISD::TC_RETURN_CHAIN;
4487 if (Info->isWholeWaveFunction())
4488 OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;
4495 Chain =
Call.getValue(0);
4496 InGlue =
Call.getValue(1);
4498 uint64_t CalleePopBytes = NumBytes;
4519 EVT VT =
Op.getValueType();
4533 "Stack grows upwards for AMDGPU");
4535 Chain = BaseAddr.getValue(1);
4537 if (Alignment > StackAlign) {
4539 << Subtarget->getWavefrontSizeLog2();
4540 uint64_t StackAlignMask = ScaledAlignment - 1;
4547 assert(
Size.getValueType() == MVT::i32 &&
"Size must be 32-bit");
4553 DAG.
getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4564 DAG.
getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4580 if (
Op.getValueType() != MVT::i32)
4599 assert(
Op.getValueType() == MVT::i32);
4608 Op.getOperand(0), IntrinID, GetRoundBothImm);
4642 SDValue RoundModeTimesNumBits =
4662 TableEntry, EnumOffset);
4678 static_cast<uint32_t>(ConstMode->getZExtValue()),
4690 if (UseReducedTable) {
4696 SDValue RoundModeTimesNumBits =
4716 SDValue RoundModeTimesNumBits =
4725 NewMode = TruncTable;
4734 ReadFirstLaneID, NewMode);
4747 IntrinID, RoundBothImm, NewMode);
4753 if (
Op->isDivergent() &&
4754 (!Subtarget->hasVmemPrefInsts() || !
Op.getConstantOperandVal(4)))
4764 if (Subtarget->hasSafeSmemPrefetch())
4772 if (!Subtarget->hasSafeSmemPrefetch() && !
Op.getConstantOperandVal(4))
4781 SDValue Src =
Op.getOperand(IsStrict ? 1 : 0);
4782 EVT SrcVT = Src.getValueType();
4791 EVT DstVT =
Op.getValueType();
4800 if (
Op.getValueType() != MVT::i64)
4814 Op.getOperand(0), IntrinID, ModeHwRegImm);
4816 Op.getOperand(0), IntrinID, TrapHwRegImm);
4830 if (
Op.getOperand(1).getValueType() != MVT::i64)
4842 ReadFirstLaneID, NewModeReg);
4844 ReadFirstLaneID, NewTrapReg);
4846 unsigned ModeHwReg =
4849 unsigned TrapHwReg =
4857 IntrinID, ModeHwRegImm, NewModeReg);
4860 IntrinID, TrapHwRegImm, NewTrapReg);
4869 .
Case(
"m0", AMDGPU::M0)
4870 .
Case(
"exec", AMDGPU::EXEC)
4871 .
Case(
"exec_lo", AMDGPU::EXEC_LO)
4872 .
Case(
"exec_hi", AMDGPU::EXEC_HI)
4873 .
Case(
"flat_scratch", AMDGPU::FLAT_SCR)
4874 .
Case(
"flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4875 .
Case(
"flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4880 if (!Subtarget->hasFlatScrRegister() &&
4881 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4883 "\" for subtarget."));
4888 case AMDGPU::EXEC_LO:
4889 case AMDGPU::EXEC_HI:
4890 case AMDGPU::FLAT_SCR_LO:
4891 case AMDGPU::FLAT_SCR_HI:
4896 case AMDGPU::FLAT_SCR:
4915 MI.setDesc(
TII->getKillTerminatorFromPseudo(
MI.getOpcode()));
4924static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4946 auto Next = std::next(
I);
4957 MBB.addSuccessor(LoopBB);
4959 return std::pair(LoopBB, RemainderBB);
4966 auto I =
MI.getIterator();
4967 auto E = std::next(
I);
4989 Src->setIsKill(
false);
4999 BuildMI(*LoopBB, LoopBB->begin(),
DL,
TII->get(AMDGPU::S_SETREG_IMM32_B32))
5005 Register Reg =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5008 BuildMI(*LoopBB,
I,
DL,
TII->get(AMDGPU::S_GETREG_B32), Reg)
5032 unsigned InitReg,
unsigned ResultReg,
unsigned PhiReg,
5033 unsigned InitSaveExecReg,
int Offset,
bool UseGPRIdxMode,
5043 Register PhiExec =
MRI.createVirtualRegister(BoolRC);
5044 Register NewExec =
MRI.createVirtualRegister(BoolRC);
5046 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5047 Register CondReg =
MRI.createVirtualRegister(BoolRC);
5055 BuildMI(LoopBB,
I,
DL,
TII->get(TargetOpcode::PHI), PhiExec)
5062 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5066 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5074 MRI.setSimpleHint(NewExec, CondReg);
5076 if (UseGPRIdxMode) {
5078 SGPRIdxReg = CurrentIdxReg;
5080 SGPRIdxReg =
MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5081 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5091 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5122 unsigned InitResultReg,
unsigned PhiReg,
int Offset,
5123 bool UseGPRIdxMode,
Register &SGPRIdxReg) {
5131 const auto *BoolXExecRC =
TRI->getWaveMaskRegClass();
5133 Register SaveExec =
MRI.createVirtualRegister(BoolXExecRC);
5134 Register TmpExec =
MRI.createVirtualRegister(BoolXExecRC);
5150 InitResultReg, DstReg, PhiReg, TmpExec,
5151 Offset, UseGPRIdxMode, SGPRIdxReg);
5157 LoopBB->removeSuccessor(RemainderBB);
5159 LoopBB->addSuccessor(LandingPad);
5170static std::pair<unsigned, int>
5174 int NumElts =
TRI.getRegSizeInBits(*SuperRC) / 32;
5179 return std::pair(AMDGPU::sub0,
Offset);
5219 Register Tmp =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5236 Register SrcReg =
TII->getNamedOperand(
MI, AMDGPU::OpName::src)->getReg();
5237 int Offset =
TII->getNamedOperand(
MI, AMDGPU::OpName::offset)->getImm();
5246 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5249 if (
TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5253 if (UseGPRIdxMode) {
5260 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
true);
5273 MI.eraseFromParent();
5282 Register PhiReg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5283 Register InitReg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5289 UseGPRIdxMode, SGPRIdxReg);
5293 if (UseGPRIdxMode) {
5295 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
true);
5297 BuildMI(*LoopBB, InsPt,
DL, GPRIDXDesc, Dst)
5302 BuildMI(*LoopBB, InsPt,
DL,
TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5307 MI.eraseFromParent();
5324 int Offset =
TII->getNamedOperand(
MI, AMDGPU::OpName::offset)->getImm();
5334 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5336 if (Idx->
getReg() == AMDGPU::NoRegister) {
5347 MI.eraseFromParent();
5352 if (
TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5356 if (UseGPRIdxMode) {
5360 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
false);
5369 const MCInstrDesc &MovRelDesc =
TII->getIndirectRegWriteMovRelPseudo(
5370 TRI.getRegSizeInBits(*VecRC), 32,
false);
5376 MI.eraseFromParent();
5386 Register PhiReg =
MRI.createVirtualRegister(VecRC);
5390 UseGPRIdxMode, SGPRIdxReg);
5393 if (UseGPRIdxMode) {
5395 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
false);
5397 BuildMI(*LoopBB, InsPt,
DL, GPRIDXDesc, Dst)
5403 const MCInstrDesc &MovRelDesc =
TII->getIndirectRegWriteMovRelPseudo(
5404 TRI.getRegSizeInBits(*VecRC), 32,
false);
5405 BuildMI(*LoopBB, InsPt,
DL, MovRelDesc, Dst)
5411 MI.eraseFromParent();
5427 bool IsAdd = (
MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5428 if (ST.hasScalarAddSub64()) {
5429 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5439 Register DestSub0 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5440 Register DestSub1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5443 MI,
MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5445 MI,
MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5448 MI,
MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5450 MI,
MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5452 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5453 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5462 MI.eraseFromParent();
5468 case AMDGPU::S_MIN_U32:
5469 return std::numeric_limits<uint32_t>::max();
5470 case AMDGPU::S_MIN_I32:
5471 return std::numeric_limits<int32_t>::max();
5472 case AMDGPU::S_MAX_U32:
5473 return std::numeric_limits<uint32_t>::min();
5474 case AMDGPU::S_MAX_I32:
5475 return std::numeric_limits<int32_t>::min();
5476 case AMDGPU::V_ADD_F32_e64:
5478 case AMDGPU::V_SUB_F32_e64:
5480 case AMDGPU::S_ADD_I32:
5481 case AMDGPU::S_SUB_I32:
5482 case AMDGPU::S_OR_B32:
5483 case AMDGPU::S_XOR_B32:
5484 return std::numeric_limits<uint32_t>::min();
5485 case AMDGPU::S_AND_B32:
5486 return std::numeric_limits<uint32_t>::max();
5487 case AMDGPU::V_MIN_F32_e64:
5488 case AMDGPU::V_MAX_F32_e64:
5492 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5498 case AMDGPU::V_CMP_LT_U64_e64:
5499 return std::numeric_limits<uint64_t>::max();
5500 case AMDGPU::V_CMP_LT_I64_e64:
5501 return std::numeric_limits<int64_t>::max();
5502 case AMDGPU::V_CMP_GT_U64_e64:
5503 return std::numeric_limits<uint64_t>::min();
5504 case AMDGPU::V_CMP_GT_I64_e64:
5505 return std::numeric_limits<int64_t>::min();
5506 case AMDGPU::S_ADD_U64_PSEUDO:
5507 case AMDGPU::S_SUB_U64_PSEUDO:
5508 case AMDGPU::S_OR_B64:
5509 case AMDGPU::S_XOR_B64:
5510 return std::numeric_limits<uint64_t>::min();
5511 case AMDGPU::S_AND_B64:
5512 return std::numeric_limits<uint64_t>::max();
5515 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5520 return Opc == AMDGPU::S_MIN_U32 ||
Opc == AMDGPU::S_MIN_I32 ||
5521 Opc == AMDGPU::S_MAX_U32 ||
Opc == AMDGPU::S_MAX_I32 ||
5522 Opc == AMDGPU::S_ADD_I32 ||
Opc == AMDGPU::S_SUB_I32 ||
5523 Opc == AMDGPU::S_AND_B32 ||
Opc == AMDGPU::S_OR_B32 ||
5524 Opc == AMDGPU::S_XOR_B32 ||
Opc == AMDGPU::V_MIN_F32_e64 ||
5525 Opc == AMDGPU::V_MAX_F32_e64 ||
Opc == AMDGPU::V_ADD_F32_e64 ||
5526 Opc == AMDGPU::V_SUB_F32_e64;
5530 return Opc == AMDGPU::V_MIN_F32_e64 ||
Opc == AMDGPU::V_MAX_F32_e64 ||
5531 Opc == AMDGPU::V_ADD_F32_e64 ||
Opc == AMDGPU::V_SUB_F32_e64;
5545 bool isSGPR =
TRI->isSGPRClass(
MRI.getRegClass(SrcReg));
5550 case AMDGPU::S_MIN_U32:
5551 case AMDGPU::S_MIN_I32:
5552 case AMDGPU::V_MIN_F32_e64:
5553 case AMDGPU::S_MAX_U32:
5554 case AMDGPU::S_MAX_I32:
5555 case AMDGPU::V_MAX_F32_e64:
5556 case AMDGPU::S_AND_B32:
5557 case AMDGPU::S_OR_B32: {
5563 case AMDGPU::V_CMP_LT_U64_e64:
5564 case AMDGPU::V_CMP_LT_I64_e64:
5565 case AMDGPU::V_CMP_GT_U64_e64:
5566 case AMDGPU::V_CMP_GT_I64_e64:
5567 case AMDGPU::S_AND_B64:
5568 case AMDGPU::S_OR_B64: {
5574 case AMDGPU::S_XOR_B32:
5575 case AMDGPU::S_XOR_B64:
5576 case AMDGPU::S_ADD_I32:
5577 case AMDGPU::S_ADD_U64_PSEUDO:
5578 case AMDGPU::V_ADD_F32_e64:
5579 case AMDGPU::S_SUB_I32:
5580 case AMDGPU::S_SUB_U64_PSEUDO:
5581 case AMDGPU::V_SUB_F32_e64: {
5584 Register ExecMask =
MRI.createVirtualRegister(WaveMaskRegClass);
5586 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5588 bool IsWave32 = ST.isWave32();
5589 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5590 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5591 unsigned BitCountOpc =
5592 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5596 auto NewAccumulator =
5601 case AMDGPU::S_XOR_B32:
5602 case AMDGPU::S_XOR_B64: {
5608 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5611 .
addReg(NewAccumulator->getOperand(0).getReg())
5614 if (
Opc == AMDGPU::S_XOR_B32) {
5620 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5622 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5626 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5629 MI,
MRI,
MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5631 MI,
MRI,
MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5641 BuildMI(BB,
MI,
DL,
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5649 case AMDGPU::S_SUB_I32: {
5650 Register NegatedVal =
MRI.createVirtualRegister(DstRegClass);
5658 .
addReg(NewAccumulator->getOperand(0).getReg());
5661 case AMDGPU::S_ADD_I32: {
5664 .
addReg(NewAccumulator->getOperand(0).getReg());
5667 case AMDGPU::S_ADD_U64_PSEUDO:
5668 case AMDGPU::S_SUB_U64_PSEUDO: {
5669 Register DestSub0 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5670 Register DestSub1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5672 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5674 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5675 Register CarryReg =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5676 Register AddReg =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5678 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5680 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5684 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5687 MI,
MRI,
MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5689 MI,
MRI,
MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5691 if (
Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5694 .
addReg(NewAccumulator->getOperand(0).getReg())
5704 Register LowOpcode =
Opc == AMDGPU::S_SUB_U64_PSEUDO
5706 : NewAccumulator->getOperand(0).getReg();
5717 Register HiVal =
Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5723 if (
Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5729 BuildMI(BB,
MI,
DL,
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5736 case AMDGPU::V_ADD_F32_e64:
5737 case AMDGPU::V_SUB_F32_e64: {
5739 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5740 Register DstVreg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5744 .
addReg(NewAccumulator->getOperand(0).getReg())
5749 unsigned srcMod =
Opc == AMDGPU::V_SUB_F32_e64 ? 1 : 0;
5757 BuildMI(BB,
MI,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5786 Register LoopIterator =
MRI.createVirtualRegister(WaveMaskRegClass);
5787 Register IdentityValReg =
MRI.createVirtualRegister(DstRegClass);
5788 Register AccumulatorReg =
MRI.createVirtualRegister(DstRegClass);
5789 Register ActiveBitsReg =
MRI.createVirtualRegister(WaveMaskRegClass);
5790 Register NewActiveBitsReg =
MRI.createVirtualRegister(WaveMaskRegClass);
5791 Register FF1Reg =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5792 Register LaneValueReg =
MRI.createVirtualRegister(DstRegClass);
5794 bool IsWave32 = ST.isWave32();
5795 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5796 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5803 BuildMI(BB,
I,
DL,
TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5807 BuildMI(BB,
I,
DL,
TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
5816 I = ComputeLoop->begin();
5818 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::PHI), AccumulatorReg)
5822 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::PHI), ActiveBitsReg)
5826 I = ComputeLoop->end();
5829 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5833 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_READLANE_B32),
5839 MRI.createVirtualRegister(
MRI.getRegClass(SrcReg));
5840 Register DstVreg =
MRI.createVirtualRegister(
MRI.getRegClass(SrcReg));
5842 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_MOV_B32_e32),
5852 NewAccumulator =
BuildMI(*ComputeLoop,
I,
DL,
5853 TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5862 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5864 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5865 Register LaneValReg =
MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5868 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5870 MI,
MRI,
MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5872 MI,
MRI,
MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5874 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_READLANE_B32),
5878 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_READLANE_B32),
5882 auto LaneValue =
BuildMI(*ComputeLoop,
I,
DL,
5883 TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
5889 case AMDGPU::S_OR_B64:
5890 case AMDGPU::S_AND_B64:
5891 case AMDGPU::S_XOR_B64: {
5894 .
addReg(LaneValue->getOperand(0).getReg())
5898 case AMDGPU::V_CMP_GT_I64_e64:
5899 case AMDGPU::V_CMP_GT_U64_e64:
5900 case AMDGPU::V_CMP_LT_I64_e64:
5901 case AMDGPU::V_CMP_LT_U64_e64: {
5902 Register LaneMaskReg =
MRI.createVirtualRegister(WaveMaskRegClass);
5904 MRI.createVirtualRegister(WaveMaskRegClass);
5907 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
5908 Register AccumulatorVReg =
MRI.createVirtualRegister(VregClass);
5911 VregClass, AMDGPU::sub0, VSubRegClass);
5914 VregClass, AMDGPU::sub1, VSubRegClass);
5915 BuildMI(*ComputeLoop,
I,
DL,
TII->get(TargetOpcode::REG_SEQUENCE),
5922 .
addReg(LaneValue->getOperand(0).getReg())
5923 .
addReg(AccumulatorVReg);
5925 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
5926 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AndOpc), ComparisonResultReg)
5930 NewAccumulator =
BuildMI(*ComputeLoop,
I,
DL,
5931 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
5932 .
addReg(LaneValue->getOperand(0).getReg())
5936 case AMDGPU::S_ADD_U64_PSEUDO:
5937 case AMDGPU::S_SUB_U64_PSEUDO: {
5940 .
addReg(LaneValue->getOperand(0).getReg());
5947 unsigned BITSETOpc =
5948 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5949 BuildMI(*ComputeLoop,
I,
DL,
TII->get(BITSETOpc), NewActiveBitsReg)
5955 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
5958 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5960 .
addReg(NewActiveBitsReg)
5962 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::S_CBRANCH_SCC1))
5967 MI.eraseFromParent();
5982 switch (
MI.getOpcode()) {
5983 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5985 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
5987 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5989 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
5991 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
5993 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5995 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
5997 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5999 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
6001 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
6003 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
6005 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
6007 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
6009 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
6011 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
6013 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
6015 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
6017 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
6019 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
6021 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
6023 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
6025 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
6027 case AMDGPU::S_UADDO_PSEUDO:
6028 case AMDGPU::S_USUBO_PSEUDO: {
6034 unsigned Opc = (
MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
6036 : AMDGPU::S_SUB_U32;
6044 Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6047 MI.eraseFromParent();
6050 case AMDGPU::S_ADD_U64_PSEUDO:
6051 case AMDGPU::S_SUB_U64_PSEUDO: {
6054 case AMDGPU::V_ADD_U64_PSEUDO:
6055 case AMDGPU::V_SUB_U64_PSEUDO: {
6056 bool IsAdd = (
MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
6062 if (ST.hasAddSubU64Insts()) {
6064 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
6065 : AMDGPU::V_SUB_U64_e64),
6070 TII->legalizeOperands(*
I);
6071 MI.eraseFromParent();
6075 if (IsAdd && ST.hasLshlAddU64Inst()) {
6081 TII->legalizeOperands(*
Add);
6082 MI.eraseFromParent();
6086 const auto *CarryRC =
TRI->getWaveMaskRegClass();
6088 Register DestSub0 =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6089 Register DestSub1 =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6091 Register CarryReg =
MRI.createVirtualRegister(CarryRC);
6092 Register DeadCarryReg =
MRI.createVirtualRegister(CarryRC);
6096 : &AMDGPU::VReg_64RegClass;
6099 : &AMDGPU::VReg_64RegClass;
6102 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6104 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6107 MI,
MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6109 MI,
MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6112 MI,
MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6114 MI,
MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6117 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6124 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6138 TII->legalizeOperands(*LoHalf);
6139 TII->legalizeOperands(*HiHalf);
6140 MI.eraseFromParent();
6143 case AMDGPU::S_ADD_CO_PSEUDO:
6144 case AMDGPU::S_SUB_CO_PSEUDO: {
6155 Register RegOp0 =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6156 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6161 Register RegOp1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6162 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6166 Register RegOp2 =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6168 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6173 if (ST.isWave64()) {
6174 if (ST.hasScalarCompareEq64()) {
6181 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6183 MII,
MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6185 MII,
MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6186 Register Src2_32 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6188 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::S_OR_B32), Src2_32)
6202 unsigned Opc =
MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
6203 ? AMDGPU::S_ADDC_U32
6204 : AMDGPU::S_SUBB_U32;
6209 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6215 MI.eraseFromParent();
6218 case AMDGPU::SI_INIT_M0: {
6221 TII->get(M0Init.
isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6224 MI.eraseFromParent();
6227 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6230 TII->get(AMDGPU::S_CMP_EQ_U32))
6235 case AMDGPU::GET_GROUPSTATICSIZE: {
6239 .
add(
MI.getOperand(0))
6241 MI.eraseFromParent();
6244 case AMDGPU::GET_SHADERCYCLESHILO: {
6257 Register RegHi1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6259 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6260 Register RegLo1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6262 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6263 Register RegHi2 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6265 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6269 Register RegLo =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6274 .
add(
MI.getOperand(0))
6279 MI.eraseFromParent();
6282 case AMDGPU::SI_INDIRECT_SRC_V1:
6283 case AMDGPU::SI_INDIRECT_SRC_V2:
6284 case AMDGPU::SI_INDIRECT_SRC_V3:
6285 case AMDGPU::SI_INDIRECT_SRC_V4:
6286 case AMDGPU::SI_INDIRECT_SRC_V5:
6287 case AMDGPU::SI_INDIRECT_SRC_V6:
6288 case AMDGPU::SI_INDIRECT_SRC_V7:
6289 case AMDGPU::SI_INDIRECT_SRC_V8:
6290 case AMDGPU::SI_INDIRECT_SRC_V9:
6291 case AMDGPU::SI_INDIRECT_SRC_V10:
6292 case AMDGPU::SI_INDIRECT_SRC_V11:
6293 case AMDGPU::SI_INDIRECT_SRC_V12:
6294 case AMDGPU::SI_INDIRECT_SRC_V16:
6295 case AMDGPU::SI_INDIRECT_SRC_V32:
6297 case AMDGPU::SI_INDIRECT_DST_V1:
6298 case AMDGPU::SI_INDIRECT_DST_V2:
6299 case AMDGPU::SI_INDIRECT_DST_V3:
6300 case AMDGPU::SI_INDIRECT_DST_V4:
6301 case AMDGPU::SI_INDIRECT_DST_V5:
6302 case AMDGPU::SI_INDIRECT_DST_V6:
6303 case AMDGPU::SI_INDIRECT_DST_V7:
6304 case AMDGPU::SI_INDIRECT_DST_V8:
6305 case AMDGPU::SI_INDIRECT_DST_V9:
6306 case AMDGPU::SI_INDIRECT_DST_V10:
6307 case AMDGPU::SI_INDIRECT_DST_V11:
6308 case AMDGPU::SI_INDIRECT_DST_V12:
6309 case AMDGPU::SI_INDIRECT_DST_V16:
6310 case AMDGPU::SI_INDIRECT_DST_V32:
6312 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6313 case AMDGPU::SI_KILL_I1_PSEUDO:
6315 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6319 Register SrcCond =
MI.getOperand(3).getReg();
6321 Register DstLo =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6322 Register DstHi =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6323 const auto *CondRC =
TRI->getWaveMaskRegClass();
6324 Register SrcCondCopy =
MRI.createVirtualRegister(CondRC);
6328 : &AMDGPU::VReg_64RegClass;
6331 : &AMDGPU::VReg_64RegClass;
6334 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6336 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6339 MI,
MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6341 MI,
MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6344 MI,
MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6346 MI,
MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6367 MI.eraseFromParent();
6370 case AMDGPU::SI_BR_UNDEF: {
6372 .
add(
MI.getOperand(0));
6374 MI.eraseFromParent();
6377 case AMDGPU::ADJCALLSTACKUP:
6378 case AMDGPU::ADJCALLSTACKDOWN: {
6385 case AMDGPU::SI_CALL_ISEL: {
6386 unsigned ReturnAddrReg =
TII->getRegisterInfo().getReturnAddressReg(*MF);
6389 MIB =
BuildMI(*BB,
MI,
DL,
TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6395 MI.eraseFromParent();
6398 case AMDGPU::V_ADD_CO_U32_e32:
6399 case AMDGPU::V_SUB_CO_U32_e32:
6400 case AMDGPU::V_SUBREV_CO_U32_e32: {
6402 unsigned Opc =
MI.getOpcode();
6404 bool NeedClampOperand =
false;
6405 if (
TII->pseudoToMCOpcode(
Opc) == -1) {
6407 NeedClampOperand =
true;
6411 if (
TII->isVOP3(*
I)) {
6414 I.add(
MI.getOperand(1)).add(
MI.getOperand(2));
6415 if (NeedClampOperand)
6418 TII->legalizeOperands(*
I);
6420 MI.eraseFromParent();
6423 case AMDGPU::V_ADDC_U32_e32:
6424 case AMDGPU::V_SUBB_U32_e32:
6425 case AMDGPU::V_SUBBREV_U32_e32:
6428 TII->legalizeOperands(
MI);
6430 case AMDGPU::DS_GWS_INIT:
6431 case AMDGPU::DS_GWS_SEMA_BR:
6432 case AMDGPU::DS_GWS_BARRIER:
6433 case AMDGPU::DS_GWS_SEMA_V:
6434 case AMDGPU::DS_GWS_SEMA_P:
6435 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6443 case AMDGPU::S_SETREG_B32: {
6459 const unsigned SetMask = WidthMask <<
Offset;
6462 unsigned SetDenormOp = 0;
6463 unsigned SetRoundOp = 0;
6471 SetRoundOp = AMDGPU::S_ROUND_MODE;
6472 SetDenormOp = AMDGPU::S_DENORM_MODE;
6474 SetRoundOp = AMDGPU::S_ROUND_MODE;
6476 SetDenormOp = AMDGPU::S_DENORM_MODE;
6479 if (SetRoundOp || SetDenormOp) {
6481 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6482 unsigned ImmVal = Def->getOperand(1).getImm();
6496 MI.eraseFromParent();
6505 MI.setDesc(
TII->get(AMDGPU::S_SETREG_B32_mode));
6509 case AMDGPU::S_INVERSE_BALLOT_U32:
6510 case AMDGPU::S_INVERSE_BALLOT_U64:
6513 MI.setDesc(
TII->get(AMDGPU::COPY));
6515 case AMDGPU::ENDPGM_TRAP: {
6517 MI.setDesc(
TII->get(AMDGPU::S_ENDPGM));
6537 MI.eraseFromParent();
6540 case AMDGPU::SIMULATED_TRAP: {
6541 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6543 TII->insertSimulatedTrap(
MRI, *BB,
MI,
MI.getDebugLoc());
6544 MI.eraseFromParent();
6547 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6548 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6554 assert(Setup &&
"Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6555 Register OriginalExec = Setup->getOperand(0).getReg();
6557 MI.getOperand(0).setReg(OriginalExec);
6594 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6598 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6625 if (!Subtarget->hasMadMacF32Insts())
6626 return Subtarget->hasFastFMAF32();
6632 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6635 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6651 switch (Ty.getScalarSizeInBits()) {
6669 if (Ty.getScalarSizeInBits() == 16)
6671 if (Ty.getScalarSizeInBits() == 32)
6672 return Subtarget->hasMadMacF32Insts() &&
6682 EVT VT =
N->getValueType(0);
6684 return Subtarget->hasMadMacF32Insts() &&
6686 if (VT == MVT::f16) {
6687 return Subtarget->hasMadF16() &&
6702 unsigned Opc =
Op.getOpcode();
6703 EVT VT =
Op.getValueType();
6704 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6705 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6706 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6707 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6708 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6709 VT == MVT::v32bf16);
6725 [[maybe_unused]]
EVT VT =
Op.getValueType();
6727 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6728 VT == MVT::v16i32) &&
6729 "Unexpected ValueType.");
6738 unsigned Opc =
Op.getOpcode();
6739 EVT VT =
Op.getValueType();
6740 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6741 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6742 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6743 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6744 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6745 VT == MVT::v32bf16);
6753 DAG.
getNode(
Opc, SL, Lo0.getValueType(), Lo0, Lo1,
Op->getFlags());
6755 DAG.
getNode(
Opc, SL, Hi0.getValueType(), Hi0, Hi1,
Op->getFlags());
6762 unsigned Opc =
Op.getOpcode();
6763 EVT VT =
Op.getValueType();
6764 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6765 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6766 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6767 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6768 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6769 VT == MVT::v32bf16);
6774 : std::pair(Op0, Op0);
6783 DAG.
getNode(
Opc, SL, ResVT.first, Lo0, Lo1, Lo2,
Op->getFlags());
6785 DAG.
getNode(
Opc, SL, ResVT.second, Hi0, Hi1, Hi2,
Op->getFlags());
6791 switch (
Op.getOpcode()) {
6795 return LowerBRCOND(
Op, DAG);
6797 return LowerRETURNADDR(
Op, DAG);
6800 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6801 "Load should return a value and a chain");
6805 EVT VT =
Op.getValueType();
6807 return lowerFSQRTF32(
Op, DAG);
6809 return lowerFSQRTF64(
Op, DAG);
6814 return LowerTrig(
Op, DAG);
6816 return LowerSELECT(
Op, DAG);
6818 return LowerFDIV(
Op, DAG);
6820 return LowerFFREXP(
Op, DAG);
6822 return LowerATOMIC_CMP_SWAP(
Op, DAG);
6824 return LowerSTORE(
Op, DAG);
6828 return LowerGlobalAddress(MFI,
Op, DAG);
6831 return LowerExternalSymbol(
Op, DAG);
6833 return LowerINTRINSIC_WO_CHAIN(
Op, DAG);
6835 return LowerINTRINSIC_W_CHAIN(
Op, DAG);
6837 return LowerINTRINSIC_VOID(
Op, DAG);
6839 return lowerADDRSPACECAST(
Op, DAG);
6841 return lowerINSERT_SUBVECTOR(
Op, DAG);
6843 return lowerINSERT_VECTOR_ELT(
Op, DAG);
6845 return lowerEXTRACT_VECTOR_ELT(
Op, DAG);
6847 return lowerVECTOR_SHUFFLE(
Op, DAG);
6849 return lowerSCALAR_TO_VECTOR(
Op, DAG);
6851 return lowerBUILD_VECTOR(
Op, DAG);
6854 return lowerFP_ROUND(
Op, DAG);
6856 return lowerTRAP(
Op, DAG);
6858 return lowerDEBUGTRAP(
Op, DAG);
6867 return lowerFMINNUM_FMAXNUM(
Op, DAG);
6870 return lowerFMINIMUMNUM_FMAXIMUMNUM(
Op, DAG);
6873 return lowerFMINIMUM_FMAXIMUM(
Op, DAG);
6876 return lowerFLDEXP(
Op, DAG);
6882 Op.getValueType() == MVT::i16 &&
6883 Op.getOperand(0).getValueType() == MVT::f32) {
6907 return lowerFCOPYSIGN(
Op, DAG);
6909 return lowerMUL(
Op, DAG);
6912 return lowerXMULO(
Op, DAG);
6915 return lowerXMUL_LOHI(
Op, DAG);
6950 EVT FittingLoadVT = LoadVT;
6982SDValue SITargetLowering::adjustLoadValueType(
unsigned Opcode,
MemSDNode *M,
6985 bool IsIntrinsic)
const {
6988 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6989 EVT LoadVT =
M->getValueType(0);
6991 EVT EquivLoadVT = LoadVT;
7005 SDVTList VTList = DAG.
getVTList(EquivLoadVT, MVT::Other);
7009 M->getMemoryVT(),
M->getMemOperand());
7020 EVT LoadVT =
M->getValueType(0);
7026 assert(
M->getNumValues() == 2 ||
M->getNumValues() == 3);
7027 bool IsTFE =
M->getNumValues() == 3;
7029 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
7030 : AMDGPUISD::BUFFER_LOAD_FORMAT)
7031 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
7032 : AMDGPUISD::BUFFER_LOAD;
7035 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG,
Ops);
7040 return handleByteShortBufferLoads(DAG, LoadVT,
DL,
Ops,
M->getMemOperand(),
7044 return getMemIntrinsicNode(
Opc,
DL,
M->getVTList(),
Ops, IntVT,
7045 M->getMemOperand(), DAG);
7049 SDVTList VTList = DAG.
getVTList(CastVT, MVT::Other);
7051 M->getMemOperand(), DAG);
7059 EVT VT =
N->getValueType(0);
7060 unsigned CondCode =
N->getConstantOperandVal(3);
7071 EVT CmpVT =
LHS.getValueType();
7072 if (CmpVT == MVT::i16 && !TLI.
isTypeLegal(MVT::i16)) {
7073 unsigned PromoteOp =
7093 EVT VT =
N->getValueType(0);
7095 unsigned CondCode =
N->getConstantOperandVal(3);
7104 if (CmpVT == MVT::f16 && !TLI.
isTypeLegal(CmpVT)) {
7113 SDValue SetCC = DAG.
getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7122 EVT VT =
N->getValueType(0);
7146 Exec = AMDGPU::EXEC_LO;
7148 Exec = AMDGPU::EXEC;
7165 EVT VT =
N->getValueType(0);
7167 unsigned IID =
N->getConstantOperandVal(0);
7168 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7169 IID == Intrinsic::amdgcn_permlanex16;
7170 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7171 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7175 unsigned SplitSize = 32;
7176 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7177 ST->hasDPALU_DPP() &&
7185 case Intrinsic::amdgcn_permlane16:
7186 case Intrinsic::amdgcn_permlanex16:
7187 case Intrinsic::amdgcn_update_dpp:
7192 case Intrinsic::amdgcn_writelane:
7195 case Intrinsic::amdgcn_readlane:
7196 case Intrinsic::amdgcn_set_inactive:
7197 case Intrinsic::amdgcn_set_inactive_chain_arg:
7198 case Intrinsic::amdgcn_mov_dpp8:
7201 case Intrinsic::amdgcn_readfirstlane:
7202 case Intrinsic::amdgcn_permlane64:
7210 std::reverse(Operands.
begin(), Operands.
end());
7212 if (
SDNode *GL =
N->getGluedNode()) {
7214 GL = GL->getOperand(0).getNode();
7224 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7225 IID == Intrinsic::amdgcn_mov_dpp8 ||
7226 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7227 Src1 =
N->getOperand(2);
7228 if (IID == Intrinsic::amdgcn_writelane ||
7229 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7230 Src2 =
N->getOperand(3);
7233 if (ValSize == SplitSize) {
7243 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7248 if (IID == Intrinsic::amdgcn_writelane) {
7253 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7255 return IsFloat ? DAG.
getBitcast(VT, Trunc) : Trunc;
7258 if (ValSize % SplitSize != 0)
7262 EVT VT =
N->getValueType(0);
7266 unsigned NumOperands =
N->getNumOperands();
7268 SDNode *GL =
N->getGluedNode();
7273 for (
unsigned i = 0; i != NE; ++i) {
7274 for (
unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7276 SDValue Operand =
N->getOperand(j);
7285 Operands[j] = Operand;
7290 Operands[NumOperands - 1] =
7306 if (SplitSize == 32) {
7308 return unrollLaneOp(LaneOp.
getNode());
7314 unsigned SubVecNumElt =
7318 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7319 for (
unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7323 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7328 if (IID == Intrinsic::amdgcn_writelane)
7333 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7334 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7335 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7336 EltIdx += SubVecNumElt;
7350 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7353 if (IID == Intrinsic::amdgcn_writelane)
7356 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7363 EVT VT =
N->getValueType(0);
7381 auto MakeIntrinsic = [&DAG, &SL](
unsigned IID,
MVT RetVT,
7385 Operands.
append(IntrinArgs);
7391 SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
7392 {ShiftedIndex, ValueI32});
7402 SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
7403 {ValueI32, PoisonVal});
7404 SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
7405 {ShiftedIndex, PoisonVal});
7408 MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});
7411 SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
7412 {WWMIndex, WWMValue});
7413 SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
7414 MVT::i32, {WWMIndex, Swapped});
7416 MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});
7424 MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
7432 DAG.
getSetCC(SL, MVT::i1, SameOrOtherHalf,
7442 switch (
N->getOpcode()) {
7454 unsigned IID =
N->getConstantOperandVal(0);
7456 case Intrinsic::amdgcn_make_buffer_rsrc:
7457 Results.push_back(lowerPointerAsRsrcIntrin(
N, DAG));
7459 case Intrinsic::amdgcn_cvt_pkrtz: {
7464 DAG.
getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
7468 case Intrinsic::amdgcn_cvt_pknorm_i16:
7469 case Intrinsic::amdgcn_cvt_pknorm_u16:
7470 case Intrinsic::amdgcn_cvt_pk_i16:
7471 case Intrinsic::amdgcn_cvt_pk_u16: {
7477 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7478 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
7479 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7480 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
7481 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7482 Opcode = AMDGPUISD::CVT_PK_I16_I32;
7484 Opcode = AMDGPUISD::CVT_PK_U16_U32;
7486 EVT VT =
N->getValueType(0);
7495 case Intrinsic::amdgcn_s_buffer_load: {
7501 if (!Subtarget->hasScalarSubwordLoads())
7507 EVT VT =
Op.getValueType();
7508 assert(VT == MVT::i8 &&
"Expected 8-bit s_buffer_load intrinsics.\n");
7520 if (!
Offset->isDivergent()) {
7539 LoadVal = handleByteShortBufferLoads(DAG, VT,
DL,
Ops, MMO);
7544 case Intrinsic::amdgcn_dead: {
7545 for (
unsigned I = 0, E =
N->getNumValues();
I < E; ++
I)
7556 for (
unsigned I = 0;
I < Res.getNumOperands();
I++) {
7557 Results.push_back(Res.getOperand(
I));
7561 Results.push_back(Res.getValue(1));
7570 EVT VT =
N->getValueType(0);
7575 EVT SelectVT = NewVT;
7576 if (NewVT.
bitsLT(MVT::i32)) {
7579 SelectVT = MVT::i32;
7585 if (NewVT != SelectVT)
7591 if (
N->getValueType(0) != MVT::v2f16)
7603 if (
N->getValueType(0) != MVT::v2f16)
7615 if (
N->getValueType(0) != MVT::f16)
7630 if (U.get() !=
Value)
7633 if (U.getUser()->getOpcode() == Opcode)
7639unsigned SITargetLowering::isCFIntrinsic(
const SDNode *Intr)
const {
7642 case Intrinsic::amdgcn_if:
7643 return AMDGPUISD::IF;
7644 case Intrinsic::amdgcn_else:
7645 return AMDGPUISD::ELSE;
7646 case Intrinsic::amdgcn_loop:
7647 return AMDGPUISD::LOOP;
7648 case Intrinsic::amdgcn_end_cf:
7668 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7695 SDNode *Intr = BRCOND.getOperand(1).getNode();
7712 Intr =
LHS.getNode();
7720 assert(BR &&
"brcond missing unconditional branch user");
7725 unsigned CFNode = isCFIntrinsic(Intr);
7745 Ops.push_back(Target);
7768 for (
unsigned i = 1, e = Intr->
getNumValues() - 1; i != e; ++i) {
7787 MVT VT =
Op.getSimpleValueType();
7790 if (
Op.getConstantOperandVal(0) != 0)
7794 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
7796 if (
Info->isEntryFunction())
7813 return Op.getValueType().bitsLE(VT)
7821 EVT DstVT =
Op.getValueType();
7828 unsigned Opc =
Op.getOpcode();
7840 EVT SrcVT = Src.getValueType();
7841 EVT DstVT =
Op.getValueType();
7844 assert(Subtarget->hasCvtPkF16F32Inst() &&
"support v_cvt_pk_f16_f32");
7847 return SrcVT == MVT::v2f32 ?
Op : splitFP_ROUNDVectorOp(
Op, DAG);
7854 if (DstVT == MVT::f16) {
7859 if (!Subtarget->has16BitInsts()) {
7864 if (
Op->getFlags().hasApproximateFuncs()) {
7875 "custom lower FP_ROUND for f16 or bf16");
7876 assert(Subtarget->hasBF16ConversionInsts() &&
"f32 -> bf16 is legal");
7888 EVT VT =
Op.getValueType();
7890 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
7891 bool IsIEEEMode =
Info->getMode().IEEE;
7900 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7907SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(
SDValue Op,
7909 EVT VT =
Op.getValueType();
7911 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
7912 bool IsIEEEMode =
Info->getMode().IEEE;
7917 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7925 EVT VT =
Op.getValueType();
7929 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7930 !Subtarget->hasMinimum3Maximum3F16() &&
7931 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7932 "should not need to widen f16 minimum/maximum to v2f16");
7946 DAG.
getNode(
Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7954 EVT VT =
Op.getValueType();
7958 EVT ExpVT =
Exp.getValueType();
7959 if (ExpVT == MVT::i16)
7980 {
Op.getOperand(0),
Op.getOperand(1), TruncExp});
7987 switch (
Op->getOpcode()) {
8017 DAGCombinerInfo &DCI)
const {
8018 const unsigned Opc =
Op.getOpcode();
8026 :
Op->getOperand(0).getValueType();
8027 auto &DAG = DCI.DAG;
8030 if (DCI.isBeforeLegalizeOps() ||
8038 LHS =
Op->getOperand(1);
8039 RHS =
Op->getOperand(2);
8041 LHS =
Op->getOperand(0);
8042 RHS =
Op->getOperand(1);
8081 if (MagVT == SignVT)
8098 EVT VT =
Op.getValueType();
8104 assert(VT == MVT::i64 &&
"The following code is a special for s_mul_u64");
8131 if (
Op->isDivergent())
8144 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
8146 DAG.
getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
8149 if (Op0SignBits >= 33 && Op1SignBits >= 33)
8151 DAG.
getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
8157 EVT VT =
Op.getValueType();
8164 const APInt &
C = RHSC->getAPIntValue();
8166 if (
C.isPowerOf2()) {
8168 bool UseArithShift = isSigned && !
C.isMinSignedValue();
8195 if (
Op->isDivergent()) {
8199 if (Subtarget->hasSMulHi()) {
8210 if (!Subtarget->isTrapHandlerEnabled() ||
8212 return lowerTrapEndpgm(
Op, DAG);
8214 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(
Op, DAG)
8215 : lowerTrapHsaQueuePtr(
Op, DAG);
8221 return DAG.
getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8225SITargetLowering::loadImplicitKernelArgument(
SelectionDAG &DAG,
MVT VT,
8227 ImplicitParameter Param)
const {
8231 MachinePointerInfo PtrInfo =
8248 loadImplicitKernelArgument(DAG, MVT::i64, SL,
Align(8),
QUEUE_PTR);
8251 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8254 if (UserSGPR == AMDGPU::NoRegister) {
8271 return DAG.
getNode(AMDGPUISD::TRAP, SL, MVT::Other,
Ops);
8280 if (Subtarget->hasPrivEnabledTrap2NopBug())
8281 return DAG.
getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
8285 return DAG.
getNode(AMDGPUISD::TRAP, SL, MVT::Other,
Ops);
8293 if (!Subtarget->isTrapHandlerEnabled() ||
8297 "debugtrap handler not supported",
8305 return DAG.
getNode(AMDGPUISD::TRAP, SL, MVT::Other,
Ops);
8308SDValue SITargetLowering::getSegmentAperture(
unsigned AS,
const SDLoc &
DL,
8310 if (Subtarget->hasApertureRegs()) {
8312 ? AMDGPU::SRC_SHARED_BASE
8313 : AMDGPU::SRC_PRIVATE_BASE;
8314 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8315 !Subtarget->hasGloballyAddressableScratch()) &&
8316 "Cannot use src_private_base with globally addressable scratch!");
8337 return loadImplicitKernelArgument(DAG, MVT::i32,
DL,
Align(4), Param);
8341 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8343 if (UserSGPR == AMDGPU::NoRegister) {
8388 const AMDGPUTargetMachine &TM =
8391 unsigned DestAS, SrcAS;
8393 bool IsNonNull =
false;
8395 SrcAS = ASC->getSrcAddressSpace();
8396 Src = ASC->getOperand(0);
8397 DestAS = ASC->getDestAddressSpace();
8400 Op.getConstantOperandVal(0) ==
8401 Intrinsic::amdgcn_addrspacecast_nonnull);
8402 Src =
Op->getOperand(1);
8403 SrcAS =
Op->getConstantOperandVal(2);
8404 DestAS =
Op->getConstantOperandVal(3);
8417 Subtarget->hasGloballyAddressableScratch()) {
8422 AMDGPU::S_MOV_B32, SL, MVT::i32,
8423 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8431 unsigned NullVal = TM.getNullPointerValue(DestAS);
8446 Subtarget->hasGloballyAddressableScratch()) {
8455 if (Subtarget->isWave64())
8461 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
8469 AMDGPU::S_MOV_B64, SL, MVT::i64,
8470 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8472 CvtPtr = DAG.
getNode(
ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8474 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8482 unsigned NullVal = TM.getNullPointerValue(SrcAS);
8494 Op.getValueType() == MVT::i64) {
8495 const SIMachineFunctionInfo *
Info =
8497 if (
Info->get32BitAddressHighBits() == 0)
8506 Src.getValueType() == MVT::i64)
8534 assert(InsNumElts % 2 == 0 &&
"expect legal vector types");
8539 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8541 MVT::i32, InsNumElts / 2);
8546 for (
unsigned I = 0;
I != InsNumElts / 2; ++
I) {
8548 if (InsNumElts == 2) {
8561 for (
unsigned I = 0;
I != InsNumElts; ++
I) {
8584 if (NumElts == 4 && EltSize == 16 && KIdx) {
8595 unsigned Idx = KIdx->getZExtValue();
8596 bool InsertLo = Idx < 2;
8600 DAG.
getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8606 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8619 assert(VecSize <= 64 &&
"Expected target vector size to be <= 64 bits");
8654 EVT ResultVT =
Op.getValueType();
8667 if (
SDValue Combined = performExtractVectorEltCombine(
Op.getNode(), DCI))
8670 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8674 if (VecSize == 128) {
8682 }
else if (VecSize == 256) {
8685 for (
unsigned P = 0;
P < 4; ++
P) {
8691 Parts[0], Parts[1]));
8693 Parts[2], Parts[3]));
8699 for (
unsigned P = 0;
P < 8; ++
P) {
8706 Parts[0], Parts[1], Parts[2], Parts[3]));
8709 Parts[4], Parts[5], Parts[6], Parts[7]));
8729 Src = DAG.
getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8744 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8754 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8759 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8760 !(Mask[Elt + 1] & 1);
8766 EVT ResultVT =
Op.getValueType();
8769 const int NewSrcNumElts = 2;
8771 int SrcNumElts =
Op.getOperand(0).getValueType().getVectorNumElements();
8787 const bool ShouldUseConsecutiveExtract = EltVT.
getSizeInBits() == 16;
8809 if (ShouldUseConsecutiveExtract &&
8812 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8813 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8825 if (Idx0 >= SrcNumElts) {
8830 if (Idx1 >= SrcNumElts) {
8835 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8836 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8844 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8845 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8850 if (SubVec0 != SubVec1) {
8851 NewMaskIdx1 += NewSrcNumElts;
8858 {NewMaskIdx0, NewMaskIdx1});
8863 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8864 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8865 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8866 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8885 EVT ResultVT =
Op.getValueType();
8901 EVT VT =
Op.getValueType();
8903 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8904 assert(!Subtarget->hasVOP3PInsts() &&
"this should be legal");
8938 for (
unsigned P = 0;
P < NumParts; ++
P) {
8940 PartVT, SL, {
Op.getOperand(
P * 2),
Op.getOperand(
P * 2 + 1)});
8959 if (!Subtarget->isAmdHsaOS())
9002 return DAG.
getNode(AMDGPUISD::PC_ADD_REL_OFFSET64,
DL, PtrVT, Ptr);
9011 return DAG.
getNode(AMDGPUISD::PC_ADD_REL_OFFSET,
DL, PtrVT, PtrLo, PtrHi);
9019 EVT PtrVT =
Op.getValueType();
9021 const GlobalValue *GV = GSD->
getGlobal();
9035 assert(PtrVT == MVT::i32 &&
"32-bit pointer is expected.");
9050 return DAG.
getNode(AMDGPUISD::LDS,
DL, MVT::i32, GA);
9053 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
9054 if (Subtarget->has64BitLiterals()) {
9085 MachinePointerInfo PtrInfo =
9098 Fn,
"unsupported external symbol",
Op.getDebugLoc()));
9122 SDValue Param = lowerKernargMemParameter(
9133 "non-hsa intrinsic with hsa target",
DL.getDebugLoc()));
9141 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
9149 unsigned NumElts = Elts.
size();
9151 if (NumElts <= 12) {
9160 for (
unsigned i = 0; i < Elts.
size(); ++i) {
9166 for (
unsigned i = Elts.
size(); i < NumElts; ++i)
9176 EVT SrcVT = Src.getValueType();
9197 bool Unpacked,
bool IsD16,
int DMaskPop,
9198 int NumVDataDwords,
bool IsAtomicPacked16Bit,
9202 EVT ReqRetVT = ResultTypes[0];
9204 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9205 ? (ReqRetNumElts + 1) / 2
9208 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9219 if (DMaskPop > 0 &&
Data.getValueType() != MaskPopVT) {
9230 if (DataDwordVT.
isVector() && !IsAtomicPacked16Bit)
9232 NumDataDwords - MaskPopDwords);
9237 EVT LegalReqRetVT = ReqRetVT;
9239 if (!
Data.getValueType().isInteger())
9241 Data.getValueType().changeTypeToInteger(),
Data);
9262 if (Result->getNumValues() == 1)
9269 SDValue *LWE,
bool &IsTexFail) {
9289 unsigned DimIdx,
unsigned EndIdx,
9290 unsigned NumGradients) {
9292 for (
unsigned I = DimIdx;
I < EndIdx;
I++) {
9300 if (((
I + 1) >= EndIdx) ||
9301 ((NumGradients / 2) % 2 == 1 && (
I == DimIdx + (NumGradients / 2) - 1 ||
9302 I == DimIdx + NumGradients - 1))) {
9324 !
Op.getNode()->hasAnyUseOfValue(0))
9326 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9336 ResultTypes.erase(&ResultTypes[0]);
9342 int NumVDataDwords = 0;
9343 bool AdjustRetType =
false;
9344 bool IsAtomicPacked16Bit =
false;
9347 const unsigned ArgOffset = WithChain ? 2 : 1;
9350 unsigned DMaskLanes = 0;
9352 if (BaseOpcode->
Atomic) {
9353 VData =
Op.getOperand(2);
9355 IsAtomicPacked16Bit =
9356 (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9357 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
9358 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
9359 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
9370 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9372 DMask = Is64Bit ? 0xf : 0x3;
9373 NumVDataDwords = Is64Bit ? 4 : 2;
9375 DMask = Is64Bit ? 0x3 : 0x1;
9376 NumVDataDwords = Is64Bit ? 2 : 1;
9379 DMask =
Op->getConstantOperandVal(ArgOffset + Intr->
DMaskIndex);
9382 if (BaseOpcode->
Store) {
9383 VData =
Op.getOperand(2);
9387 if (!Subtarget->hasD16Images() || !BaseOpcode->
HasD16)
9391 VData = handleD16VData(VData, DAG,
true);
9394 NumVDataDwords = (VData.
getValueType().getSizeInBits() + 31) / 32;
9395 }
else if (!BaseOpcode->
NoReturn) {
9400 if (!Subtarget->hasD16Images() || !BaseOpcode->
HasD16)
9408 (!LoadVT.
isVector() && DMaskLanes > 1))
9414 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9415 !(BaseOpcode->
Gather4 && Subtarget->hasImageGather4D16Bug()))
9416 NumVDataDwords = (DMaskLanes + 1) / 2;
9418 NumVDataDwords = DMaskLanes;
9420 AdjustRetType =
true;
9424 unsigned VAddrEnd = ArgOffset + Intr->
VAddrEnd;
9431 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9432 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9434 VAddrVT =
Op.getOperand(ArgOffset + Intr->
CoordStart).getSimpleValueType();
9436 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9437 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9441 if (IsA16 && (
Op.getOperand(ArgOffset +
I).getValueType() == MVT::f16)) {
9447 {
Op.getOperand(ArgOffset +
I), DAG.
getPOISON(MVT::f16)});
9451 "Bias needs to be converted to 16 bit in A16 mode");
9456 if (BaseOpcode->
Gradients && !
ST->hasG16() && (IsA16 != IsG16)) {
9460 dbgs() <<
"Failed to lower image intrinsic: 16 bit addresses "
9461 "require 16 bit args for both gradients and addresses");
9466 if (!
ST->hasA16()) {
9467 LLVM_DEBUG(
dbgs() <<
"Failed to lower image intrinsic: Target does not "
9468 "support 16 bit addresses\n");
9478 if (BaseOpcode->
Gradients && IsG16 &&
ST->hasG16()) {
9480 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9482 IntrOpcode = G16MappingInfo->
G16;
9505 for (
unsigned I = ArgOffset + Intr->
CoordStart;
I < VAddrEnd;
I++)
9523 const unsigned NSAMaxSize =
ST->getNSAMaxSize(BaseOpcode->
Sampler);
9524 const bool HasPartialNSAEncoding =
ST->hasPartialNSAEncoding();
9525 const bool UseNSA =
ST->hasNSAEncoding() &&
9526 VAddrs.
size() >=
ST->getNSAThreshold(MF) &&
9527 (VAddrs.
size() <= NSAMaxSize || HasPartialNSAEncoding);
9528 const bool UsePartialNSA =
9529 UseNSA && HasPartialNSAEncoding && VAddrs.
size() > NSAMaxSize;
9532 if (UsePartialNSA) {
9534 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9535 }
else if (!UseNSA) {
9545 uint64_t UnormConst =
9546 Op.getConstantOperandVal(ArgOffset + Intr->
UnormIndex);
9548 Unorm = UnormConst ? True : False;
9554 bool IsTexFail =
false;
9555 if (!
parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9566 NumVDataDwords += 1;
9567 AdjustRetType =
true;
9572 if (AdjustRetType) {
9575 if (DMaskLanes == 0 && !BaseOpcode->
Store) {
9584 MVT::i32, NumVDataDwords)
9587 ResultTypes[0] = NewVT;
9588 if (ResultTypes.size() == 3) {
9592 ResultTypes.erase(&ResultTypes[1]);
9606 Ops.push_back(VData);
9607 if (UsePartialNSA) {
9609 Ops.push_back(VAddr);
9613 Ops.push_back(VAddr);
9616 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9618 Ops.push_back(Rsrc);
9623 Ops.push_back(Samp);
9628 if (!IsGFX12Plus || BaseOpcode->
Sampler || BaseOpcode->
MSAA)
9629 Ops.push_back(Unorm);
9631 Ops.push_back(IsA16 &&
9632 ST->hasFeature(AMDGPU::FeatureR128A16)
9636 Ops.push_back(IsA16 ? True : False);
9638 if (!Subtarget->hasGFX90AInsts())
9643 "TFE is not supported on this GPU",
DL.getDebugLoc()));
9646 if (!IsGFX12Plus || BaseOpcode->
Sampler || BaseOpcode->
MSAA)
9649 Ops.push_back(DimInfo->
DA ? True : False);
9651 Ops.push_back(IsD16 ? True : False);
9653 Ops.push_back(
Op.getOperand(0));
9655 int NumVAddrDwords =
9661 NumVDataDwords, NumVAddrDwords);
9662 }
else if (IsGFX11Plus) {
9664 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9665 : AMDGPU::MIMGEncGfx11Default,
9666 NumVDataDwords, NumVAddrDwords);
9667 }
else if (IsGFX10Plus) {
9669 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9670 : AMDGPU::MIMGEncGfx10Default,
9671 NumVDataDwords, NumVAddrDwords);
9673 if (Subtarget->hasGFX90AInsts()) {
9675 NumVDataDwords, NumVAddrDwords);
9679 "requested image instruction is not supported on this GPU",
9684 for (EVT VT : OrigResultTypes) {
9685 if (VT == MVT::Other)
9686 RetValues[Idx++] =
Op.getOperand(0);
9697 NumVDataDwords, NumVAddrDwords);
9700 NumVDataDwords, NumVAddrDwords);
9707 MachineMemOperand *MemRef = MemOp->getMemOperand();
9726 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9727 NumVDataDwords, IsAtomicPacked16Bit,
DL);
9740 MachinePointerInfo(),
9745 if (!
Offset->isDivergent()) {
9752 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9761 !Subtarget->hasScalarDwordx3Loads()) {
9765 AMDGPUISD::SBUFFER_LOAD,
DL, DAG.
getVTList(WidenedVT),
Ops, WidenedVT,
9788 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9790 return handleByteShortBufferLoads(DAG, VT,
DL,
Ops, MMO);
9794 unsigned NumLoads = 1;
9800 if (NumElts == 8 || NumElts == 16) {
9801 NumLoads = NumElts / 4;
9805 SDVTList VTList = DAG.
getVTList({LoadVT, MVT::Other});
9810 NumLoads > 1 ?
Align(16 * NumLoads) :
Align(4));
9812 uint64_t InstOffset =
Ops[5]->getAsZExtVal();
9813 for (
unsigned i = 0; i < NumLoads; ++i) {
9815 Loads.
push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD,
DL, VTList,
Ops,
9819 if (NumElts == 8 || NumElts == 16)
9827 if (!Subtarget->hasArchitectedSGPRs())
9832 return DAG.
getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
9839 unsigned Width)
const {
9841 using namespace AMDGPU::Hwreg;
9843 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
9882 auto *MFI = MF.
getInfo<SIMachineFunctionInfo>();
9884 EVT VT =
Op.getValueType();
9886 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
9890 switch (IntrinsicID) {
9891 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9894 return getPreloadedValue(DAG, *MFI, VT,
9897 case Intrinsic::amdgcn_dispatch_ptr:
9898 case Intrinsic::amdgcn_queue_ptr: {
9899 if (!Subtarget->isAmdHsaOrMesa(MF.
getFunction())) {
9901 MF.
getFunction(),
"unsupported hsa intrinsic without hsa target",
9906 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9909 return getPreloadedValue(DAG, *MFI, VT, RegID);
9911 case Intrinsic::amdgcn_implicitarg_ptr: {
9913 return getImplicitArgPtr(DAG,
DL);
9914 return getPreloadedValue(DAG, *MFI, VT,
9917 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9923 return getPreloadedValue(DAG, *MFI, VT,
9926 case Intrinsic::amdgcn_dispatch_id: {
9929 case Intrinsic::amdgcn_rcp:
9930 return DAG.
getNode(AMDGPUISD::RCP,
DL, VT,
Op.getOperand(1));
9931 case Intrinsic::amdgcn_rsq:
9932 return DAG.
getNode(AMDGPUISD::RSQ,
DL, VT,
Op.getOperand(1));
9933 case Intrinsic::amdgcn_rsq_legacy:
9937 case Intrinsic::amdgcn_rcp_legacy:
9940 return DAG.
getNode(AMDGPUISD::RCP_LEGACY,
DL, VT,
Op.getOperand(1));
9941 case Intrinsic::amdgcn_rsq_clamp: {
9943 return DAG.
getNode(AMDGPUISD::RSQ_CLAMP,
DL, VT,
Op.getOperand(1));
9955 case Intrinsic::r600_read_ngroups_x:
9956 if (Subtarget->isAmdHsaOS())
9959 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9962 case Intrinsic::r600_read_ngroups_y:
9963 if (Subtarget->isAmdHsaOS())
9966 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9969 case Intrinsic::r600_read_ngroups_z:
9970 if (Subtarget->isAmdHsaOS())
9973 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9976 case Intrinsic::r600_read_local_size_x:
9977 if (Subtarget->isAmdHsaOS())
9980 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9982 case Intrinsic::r600_read_local_size_y:
9983 if (Subtarget->isAmdHsaOS())
9986 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9988 case Intrinsic::r600_read_local_size_z:
9989 if (Subtarget->isAmdHsaOS())
9992 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9994 case Intrinsic::amdgcn_workgroup_id_x:
9995 return lowerWorkGroupId(DAG, *MFI, VT,
9999 case Intrinsic::amdgcn_workgroup_id_y:
10000 return lowerWorkGroupId(DAG, *MFI, VT,
10004 case Intrinsic::amdgcn_workgroup_id_z:
10005 return lowerWorkGroupId(DAG, *MFI, VT,
10009 case Intrinsic::amdgcn_cluster_id_x:
10010 return Subtarget->hasClusters()
10011 ? getPreloadedValue(DAG, *MFI, VT,
10013 : DAG.getPOISON(VT);
10014 case Intrinsic::amdgcn_cluster_id_y:
10015 return Subtarget->hasClusters()
10016 ? getPreloadedValue(DAG, *MFI, VT,
10019 case Intrinsic::amdgcn_cluster_id_z:
10020 return Subtarget->hasClusters()
10021 ? getPreloadedValue(DAG, *MFI, VT,
10024 case Intrinsic::amdgcn_cluster_workgroup_id_x:
10025 return Subtarget->hasClusters()
10026 ? getPreloadedValue(
10030 case Intrinsic::amdgcn_cluster_workgroup_id_y:
10031 return Subtarget->hasClusters()
10032 ? getPreloadedValue(
10036 case Intrinsic::amdgcn_cluster_workgroup_id_z:
10037 return Subtarget->hasClusters()
10038 ? getPreloadedValue(
10042 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
10043 return Subtarget->hasClusters()
10046 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
10047 return Subtarget->hasClusters()
10048 ? getPreloadedValue(
10052 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
10053 return Subtarget->hasClusters()
10054 ? getPreloadedValue(
10058 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
10059 return Subtarget->hasClusters()
10060 ? getPreloadedValue(
10064 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
10065 return Subtarget->hasClusters()
10066 ? getPreloadedValue(
10070 case Intrinsic::amdgcn_wave_id:
10071 return lowerWaveID(DAG,
Op);
10072 case Intrinsic::amdgcn_lds_kernel_id: {
10074 return getLDSKernelId(DAG,
DL);
10075 return getPreloadedValue(DAG, *MFI, VT,
10078 case Intrinsic::amdgcn_workitem_id_x:
10079 return lowerWorkitemID(DAG,
Op, 0, MFI->getArgInfo().WorkItemIDX);
10080 case Intrinsic::amdgcn_workitem_id_y:
10081 return lowerWorkitemID(DAG,
Op, 1, MFI->getArgInfo().WorkItemIDY);
10082 case Intrinsic::amdgcn_workitem_id_z:
10083 return lowerWorkitemID(DAG,
Op, 2, MFI->getArgInfo().WorkItemIDZ);
10084 case Intrinsic::amdgcn_wavefrontsize:
10086 SDLoc(
Op), MVT::i32);
10087 case Intrinsic::amdgcn_s_buffer_load: {
10088 unsigned CPol =
Op.getConstantOperandVal(3);
10095 return lowerSBuffer(VT,
DL,
Op.getOperand(1),
Op.getOperand(2),
10096 Op.getOperand(3), DAG);
10098 case Intrinsic::amdgcn_fdiv_fast:
10099 return lowerFDIV_FAST(
Op, DAG);
10100 case Intrinsic::amdgcn_sin:
10101 return DAG.
getNode(AMDGPUISD::SIN_HW,
DL, VT,
Op.getOperand(1));
10103 case Intrinsic::amdgcn_cos:
10104 return DAG.
getNode(AMDGPUISD::COS_HW,
DL, VT,
Op.getOperand(1));
10106 case Intrinsic::amdgcn_mul_u24:
10107 return DAG.
getNode(AMDGPUISD::MUL_U24,
DL, VT,
Op.getOperand(1),
10109 case Intrinsic::amdgcn_mul_i24:
10110 return DAG.
getNode(AMDGPUISD::MUL_I24,
DL, VT,
Op.getOperand(1),
10113 case Intrinsic::amdgcn_log_clamp: {
10119 case Intrinsic::amdgcn_fract:
10120 return DAG.
getNode(AMDGPUISD::FRACT,
DL, VT,
Op.getOperand(1));
10122 case Intrinsic::amdgcn_class:
10123 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, VT,
Op.getOperand(1),
10125 case Intrinsic::amdgcn_div_fmas:
10126 return DAG.
getNode(AMDGPUISD::DIV_FMAS,
DL, VT,
Op.getOperand(1),
10127 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
10129 case Intrinsic::amdgcn_div_fixup:
10130 return DAG.
getNode(AMDGPUISD::DIV_FIXUP,
DL, VT,
Op.getOperand(1),
10131 Op.getOperand(2),
Op.getOperand(3));
10133 case Intrinsic::amdgcn_div_scale: {
10139 SDValue Denominator =
Op.getOperand(2);
10146 SDValue Src0 =
Param->isAllOnes() ? Numerator : Denominator;
10148 return DAG.
getNode(AMDGPUISD::DIV_SCALE,
DL,
Op->getVTList(), Src0,
10149 Denominator, Numerator);
10151 case Intrinsic::amdgcn_icmp: {
10153 if (
Op.getOperand(1).getValueType() == MVT::i1 &&
10154 Op.getConstantOperandVal(2) == 0 &&
10159 case Intrinsic::amdgcn_fcmp: {
10162 case Intrinsic::amdgcn_ballot:
10164 case Intrinsic::amdgcn_fmed3:
10165 return DAG.
getNode(AMDGPUISD::FMED3,
DL, VT,
Op.getOperand(1),
10166 Op.getOperand(2),
Op.getOperand(3));
10167 case Intrinsic::amdgcn_fdot2:
10168 return DAG.
getNode(AMDGPUISD::FDOT2,
DL, VT,
Op.getOperand(1),
10169 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
10170 case Intrinsic::amdgcn_fmul_legacy:
10171 return DAG.
getNode(AMDGPUISD::FMUL_LEGACY,
DL, VT,
Op.getOperand(1),
10173 case Intrinsic::amdgcn_sffbh:
10174 return DAG.
getNode(AMDGPUISD::FFBH_I32,
DL, VT,
Op.getOperand(1));
10175 case Intrinsic::amdgcn_sbfe:
10176 return DAG.
getNode(AMDGPUISD::BFE_I32,
DL, VT,
Op.getOperand(1),
10177 Op.getOperand(2),
Op.getOperand(3));
10178 case Intrinsic::amdgcn_ubfe:
10179 return DAG.
getNode(AMDGPUISD::BFE_U32,
DL, VT,
Op.getOperand(1),
10180 Op.getOperand(2),
Op.getOperand(3));
10181 case Intrinsic::amdgcn_cvt_pkrtz:
10182 case Intrinsic::amdgcn_cvt_pknorm_i16:
10183 case Intrinsic::amdgcn_cvt_pknorm_u16:
10184 case Intrinsic::amdgcn_cvt_pk_i16:
10185 case Intrinsic::amdgcn_cvt_pk_u16: {
10187 EVT VT =
Op.getValueType();
10190 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10191 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
10192 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
10193 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
10194 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10195 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
10196 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10197 Opcode = AMDGPUISD::CVT_PK_I16_I32;
10199 Opcode = AMDGPUISD::CVT_PK_U16_U32;
10202 return DAG.
getNode(Opcode,
DL, VT,
Op.getOperand(1),
Op.getOperand(2));
10205 DAG.
getNode(Opcode,
DL, MVT::i32,
Op.getOperand(1),
Op.getOperand(2));
10208 case Intrinsic::amdgcn_fmad_ftz:
10209 return DAG.
getNode(AMDGPUISD::FMAD_FTZ,
DL, VT,
Op.getOperand(1),
10210 Op.getOperand(2),
Op.getOperand(3));
10212 case Intrinsic::amdgcn_if_break:
10214 Op->getOperand(1),
Op->getOperand(2)),
10217 case Intrinsic::amdgcn_groupstaticsize: {
10223 const GlobalValue *GV =
10229 case Intrinsic::amdgcn_is_shared:
10230 case Intrinsic::amdgcn_is_private: {
10237 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10241 Subtarget->hasGloballyAddressableScratch()) {
10244 AMDGPU::S_MOV_B32,
DL, MVT::i32,
10245 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10254 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10257 case Intrinsic::amdgcn_perm:
10258 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
Op.getOperand(1),
10259 Op.getOperand(2),
Op.getOperand(3));
10260 case Intrinsic::amdgcn_reloc_constant: {
10270 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10271 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10272 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10273 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10274 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10275 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10276 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10277 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10278 if (
Op.getOperand(4).getValueType() == MVT::i32)
10284 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
10285 Op.getOperand(3), IndexKeyi32);
10287 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10288 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10289 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10290 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10291 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10292 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10293 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10294 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10295 if (
Op.getOperand(4).getValueType() == MVT::i64)
10301 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10302 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10303 Op.getOperand(6)});
10305 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10306 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10307 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10308 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10309 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10310 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10311 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10314 if (
Op.getOperand(6).getValueType() == IndexKeyTy)
10320 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
10321 Op.getOperand(3),
Op.getOperand(4),
Op.getOperand(5),
10322 IndexKey,
Op.getOperand(7),
Op.getOperand(8)};
10323 if (IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8)
10324 Args.push_back(
Op.getOperand(9));
10327 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10328 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10329 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10330 if (
Op.getOperand(6).getValueType() == MVT::i32)
10336 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10337 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10338 IndexKeyi32, Op.getOperand(7)});
10340 case Intrinsic::amdgcn_addrspacecast_nonnull:
10341 return lowerADDRSPACECAST(
Op, DAG);
10342 case Intrinsic::amdgcn_readlane:
10343 case Intrinsic::amdgcn_readfirstlane:
10344 case Intrinsic::amdgcn_writelane:
10345 case Intrinsic::amdgcn_permlane16:
10346 case Intrinsic::amdgcn_permlanex16:
10347 case Intrinsic::amdgcn_permlane64:
10348 case Intrinsic::amdgcn_set_inactive:
10349 case Intrinsic::amdgcn_set_inactive_chain_arg:
10350 case Intrinsic::amdgcn_mov_dpp8:
10351 case Intrinsic::amdgcn_update_dpp:
10353 case Intrinsic::amdgcn_dead: {
10355 for (
const EVT ValTy :
Op.getNode()->values())
10359 case Intrinsic::amdgcn_wave_shuffle:
10362 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10364 return lowerImage(
Op, ImageDimIntr, DAG,
false);
10375 return DAG.
getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10381 unsigned NewOpcode)
const {
10385 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10386 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10404 M->getMemOperand());
10409 unsigned NewOpcode)
const {
10413 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10414 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
10432 M->getMemOperand());
10437 unsigned IntrID =
Op.getConstantOperandVal(1);
10441 case Intrinsic::amdgcn_ds_ordered_add:
10442 case Intrinsic::amdgcn_ds_ordered_swap: {
10447 unsigned IndexOperand =
M->getConstantOperandVal(7);
10448 unsigned WaveRelease =
M->getConstantOperandVal(8);
10449 unsigned WaveDone =
M->getConstantOperandVal(9);
10451 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10452 IndexOperand &= ~0x3f;
10453 unsigned CountDw = 0;
10456 CountDw = (IndexOperand >> 24) & 0xf;
10457 IndexOperand &= ~(0xf << 24);
10459 if (CountDw < 1 || CountDw > 4) {
10462 Fn,
"ds_ordered_count: dword count must be between 1 and 4",
10463 DL.getDebugLoc()));
10468 if (IndexOperand) {
10471 Fn,
"ds_ordered_count: bad index operand",
DL.getDebugLoc()));
10474 if (WaveDone && !WaveRelease) {
10478 Fn,
"ds_ordered_count: wave_done requires wave_release",
10479 DL.getDebugLoc()));
10482 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10483 unsigned ShaderType =
10485 unsigned Offset0 = OrderedCountIndex << 2;
10486 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10489 Offset1 |= (CountDw - 1) << 6;
10492 Offset1 |= ShaderType << 2;
10494 unsigned Offset = Offset0 | (Offset1 << 8);
10501 M->getVTList(),
Ops,
M->getMemoryVT(),
10502 M->getMemOperand());
10504 case Intrinsic::amdgcn_raw_buffer_load:
10505 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10506 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10507 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10508 case Intrinsic::amdgcn_raw_buffer_load_format:
10509 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10510 const bool IsFormat =
10511 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10512 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10514 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10515 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
10529 return lowerIntrinsicLoad(M, IsFormat, DAG,
Ops);
10531 case Intrinsic::amdgcn_struct_buffer_load:
10532 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10533 case Intrinsic::amdgcn_struct_buffer_load_format:
10534 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10535 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10536 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10537 const bool IsFormat =
10538 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10539 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10541 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10542 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10557 case Intrinsic::amdgcn_raw_tbuffer_load:
10558 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10560 EVT LoadVT =
Op.getValueType();
10561 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10562 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
10578 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10580 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT,
DL,
10581 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
10584 case Intrinsic::amdgcn_struct_tbuffer_load:
10585 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10587 EVT LoadVT =
Op.getValueType();
10588 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10589 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10605 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10607 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT,
DL,
10608 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
10611 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10612 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10613 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
10614 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10615 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10616 return lowerStructBufferAtomicIntrin(
Op, DAG,
10617 AMDGPUISD::BUFFER_ATOMIC_FADD);
10618 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10619 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10620 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
10621 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10622 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10623 return lowerStructBufferAtomicIntrin(
Op, DAG,
10624 AMDGPUISD::BUFFER_ATOMIC_FMIN);
10625 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10626 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10627 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
10628 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10629 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10630 return lowerStructBufferAtomicIntrin(
Op, DAG,
10631 AMDGPUISD::BUFFER_ATOMIC_FMAX);
10632 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10633 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10634 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
10635 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10636 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10637 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10638 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10639 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10640 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10641 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10642 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10643 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
10644 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10645 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10646 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
10647 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10648 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10649 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
10650 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10651 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10652 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
10653 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10654 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10655 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10656 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10657 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10658 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10659 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10660 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10661 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10662 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10663 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10664 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10665 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10666 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10667 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10668 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10669 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10670 return lowerStructBufferAtomicIntrin(
Op, DAG,
10671 AMDGPUISD::BUFFER_ATOMIC_SWAP);
10672 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10673 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10674 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10675 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10676 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10677 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10678 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10679 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10680 return lowerStructBufferAtomicIntrin(
Op, DAG,
10681 AMDGPUISD::BUFFER_ATOMIC_SMIN);
10682 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10683 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10684 return lowerStructBufferAtomicIntrin(
Op, DAG,
10685 AMDGPUISD::BUFFER_ATOMIC_UMIN);
10686 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10687 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10688 return lowerStructBufferAtomicIntrin(
Op, DAG,
10689 AMDGPUISD::BUFFER_ATOMIC_SMAX);
10690 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10691 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10692 return lowerStructBufferAtomicIntrin(
Op, DAG,
10693 AMDGPUISD::BUFFER_ATOMIC_UMAX);
10694 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10695 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10696 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10697 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10698 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10699 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10700 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10701 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10702 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10703 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10704 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10705 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10706 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10707 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10708 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10709 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
10710 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
10711 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB);
10712 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
10713 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
10714 return lowerStructBufferAtomicIntrin(
Op, DAG,
10715 AMDGPUISD::BUFFER_ATOMIC_CSUB);
10716 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10717 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
10718 return lowerRawBufferAtomicIntrin(
Op, DAG,
10719 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10720 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10721 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
10722 return lowerStructBufferAtomicIntrin(
Op, DAG,
10723 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10724 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10725 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10726 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(4), DAG);
10727 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
10741 EVT VT =
Op.getValueType();
10745 Op->getVTList(),
Ops, VT,
10746 M->getMemOperand());
10748 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10749 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10750 SDValue Rsrc = bufferRsrcPtrToVector(
Op->getOperand(4), DAG);
10751 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(6), DAG);
10765 EVT VT =
Op.getValueType();
10769 Op->getVTList(),
Ops, VT,
10770 M->getMemOperand());
10772 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10773 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10775 SDValue NodePtr =
M->getOperand(2);
10776 SDValue RayExtent =
M->getOperand(3);
10777 SDValue InstanceMask =
M->getOperand(4);
10778 SDValue RayOrigin =
M->getOperand(5);
10779 SDValue RayDir =
M->getOperand(6);
10781 SDValue TDescr =
M->getOperand(8);
10786 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10791 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10792 const unsigned NumVDataDwords = 10;
10793 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10795 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10796 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10797 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10801 Ops.push_back(NodePtr);
10804 {DAG.getBitcast(MVT::i32, RayExtent),
10805 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10806 Ops.push_back(RayOrigin);
10807 Ops.push_back(RayDir);
10808 Ops.push_back(Offsets);
10809 Ops.push_back(TDescr);
10810 Ops.push_back(
M->getChain());
10813 MachineMemOperand *MemRef =
M->getMemOperand();
10817 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10819 SDValue NodePtr =
M->getOperand(2);
10820 SDValue RayExtent =
M->getOperand(3);
10821 SDValue RayOrigin =
M->getOperand(4);
10822 SDValue RayDir =
M->getOperand(5);
10823 SDValue RayInvDir =
M->getOperand(6);
10824 SDValue TDescr =
M->getOperand(7);
10831 if (!Subtarget->hasGFX10_AEncoding()) {
10841 const unsigned NumVDataDwords = 4;
10842 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10843 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10844 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10847 const unsigned BaseOpcodes[2][2] = {
10848 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10849 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10850 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10854 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10855 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10856 : AMDGPU::MIMGEncGfx10NSA,
10857 NumVDataDwords, NumVAddrDwords);
10861 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10862 : AMDGPU::MIMGEncGfx10Default,
10863 NumVDataDwords, NumVAddrDwords);
10869 auto packLanes = [&DAG, &
Ops, &
DL](
SDValue Op,
bool IsAligned) {
10872 if (Lanes[0].getValueSizeInBits() == 32) {
10873 for (
unsigned I = 0;
I < 3; ++
I)
10880 Ops.push_back(Lanes[2]);
10892 if (UseNSA && IsGFX11Plus) {
10893 Ops.push_back(NodePtr);
10895 Ops.push_back(RayOrigin);
10900 for (
unsigned I = 0;
I < 3; ++
I) {
10903 {DirLanes[I], InvDirLanes[I]})));
10907 Ops.push_back(RayDir);
10908 Ops.push_back(RayInvDir);
10915 Ops.push_back(NodePtr);
10918 packLanes(RayOrigin,
true);
10919 packLanes(RayDir,
true);
10920 packLanes(RayInvDir,
false);
10925 if (NumVAddrDwords > 12) {
10927 Ops.append(16 -
Ops.size(), Undef);
10933 Ops.push_back(MergedOps);
10936 Ops.push_back(TDescr);
10938 Ops.push_back(
M->getChain());
10941 MachineMemOperand *MemRef =
M->getMemOperand();
10945 case Intrinsic::amdgcn_global_atomic_fmin_num:
10946 case Intrinsic::amdgcn_global_atomic_fmax_num:
10947 case Intrinsic::amdgcn_flat_atomic_fmin_num:
10948 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10955 unsigned Opcode = 0;
10957 case Intrinsic::amdgcn_global_atomic_fmin_num:
10958 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
10962 case Intrinsic::amdgcn_global_atomic_fmax_num:
10963 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10970 return DAG.
getAtomic(Opcode, SDLoc(
Op),
M->getMemoryVT(),
M->getVTList(),
10971 Ops,
M->getMemOperand());
10973 case Intrinsic::amdgcn_s_get_barrier_state:
10974 case Intrinsic::amdgcn_s_get_named_barrier_state: {
10981 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
10982 BarID = (BarID >> 4) & 0x3F;
10983 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
10986 Ops.push_back(Chain);
10988 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
10989 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
10997 Ops.push_back(
copyToM0(DAG, Chain,
DL, M0Val).getValue(0));
11005 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
11006 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
11007 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
11011 EVT VT =
Op->getValueType(0);
11017 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11019 return lowerImage(
Op, ImageDimIntr, DAG,
true);
11027SDValue SITargetLowering::getMemIntrinsicNode(
unsigned Opcode,
const SDLoc &
DL,
11034 EVT VT = VTList.
VTs[0];
11037 bool IsTFE = VTList.
NumVTs == 3;
11040 unsigned NumOpDWords = NumValueDWords + 1;
11042 SDVTList OpDWordsVTList = DAG.
getVTList(OpDWordsVT, VTList.
VTs[2]);
11043 MachineMemOperand *OpDWordsMMO =
11045 SDValue Op = getMemIntrinsicNode(Opcode,
DL, OpDWordsVTList,
Ops,
11046 OpDWordsVT, OpDWordsMMO, DAG);
11051 NumValueDWords == 1
11060 if (!Subtarget->hasDwordx3LoadStores() &&
11061 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
11065 SDVTList WidenedVTList = DAG.
getVTList(WidenedVT, VTList.
VTs[1]);
11067 WidenedMemVT, WidenedMMO);
11077 bool ImageStore)
const {
11087 if (Subtarget->hasUnpackedD16VMem()) {
11101 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
11112 for (
unsigned I = 0;
I < Elts.
size() / 2;
I += 1) {
11118 if ((NumElements % 2) == 1) {
11120 unsigned I = Elts.
size() / 2;
11136 if (NumElements == 3) {
11157 unsigned IntrinsicID =
Op.getConstantOperandVal(1);
11160 switch (IntrinsicID) {
11161 case Intrinsic::amdgcn_exp_compr: {
11162 if (!Subtarget->hasCompressedExport()) {
11165 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
11187 unsigned Opc =
Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
11191 case Intrinsic::amdgcn_struct_tbuffer_store:
11192 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
11194 bool IsD16 = (VData.
getValueType().getScalarType() == MVT::f16);
11196 VData = handleD16VData(VData, DAG);
11197 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
11198 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
11212 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11213 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11216 M->getMemoryVT(),
M->getMemOperand());
11219 case Intrinsic::amdgcn_raw_tbuffer_store:
11220 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11222 bool IsD16 = (VData.
getValueType().getScalarType() == MVT::f16);
11224 VData = handleD16VData(VData, DAG);
11225 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
11226 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
11240 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11241 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11244 M->getMemoryVT(),
M->getMemOperand());
11247 case Intrinsic::amdgcn_raw_buffer_store:
11248 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11249 case Intrinsic::amdgcn_raw_buffer_store_format:
11250 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11251 const bool IsFormat =
11252 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11253 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11260 VData = handleD16VData(VData, DAG);
11270 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
11271 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
11285 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
11286 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 :
Opc;
11291 return handleByteShortBufferStores(DAG, VDataVT,
DL,
Ops, M);
11294 M->getMemoryVT(),
M->getMemOperand());
11297 case Intrinsic::amdgcn_struct_buffer_store:
11298 case Intrinsic::amdgcn_struct_ptr_buffer_store:
11299 case Intrinsic::amdgcn_struct_buffer_store_format:
11300 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11301 const bool IsFormat =
11302 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
11303 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11311 VData = handleD16VData(VData, DAG);
11321 auto Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
11322 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
11336 !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
11337 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 :
Opc;
11341 EVT VDataType = VData.getValueType().getScalarType();
11343 return handleByteShortBufferStores(DAG, VDataType,
DL,
Ops, M);
11346 M->getMemoryVT(),
M->getMemOperand());
11348 case Intrinsic::amdgcn_raw_buffer_load_lds:
11349 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11350 case Intrinsic::amdgcn_struct_buffer_load_lds:
11351 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
11352 if (!Subtarget->hasVMemToLDSLoad())
11356 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
11357 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
11358 unsigned OpOffset = HasVIndex ? 1 : 0;
11359 SDValue VOffset =
Op.getOperand(5 + OpOffset);
11361 unsigned Size =
Op->getConstantOperandVal(4);
11367 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11368 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11369 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11370 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11373 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11374 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11375 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11376 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11379 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11380 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11381 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11382 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11385 if (!Subtarget->hasLDSLoadB96_B128())
11387 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11388 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11389 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11390 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11393 if (!Subtarget->hasLDSLoadB96_B128())
11395 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11396 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11397 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11398 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
11406 if (HasVIndex && HasVOffset)
11410 else if (HasVIndex)
11411 Ops.push_back(
Op.getOperand(5));
11412 else if (HasVOffset)
11413 Ops.push_back(VOffset);
11415 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
11416 Ops.push_back(Rsrc);
11417 Ops.push_back(
Op.getOperand(6 + OpOffset));
11418 Ops.push_back(
Op.getOperand(7 + OpOffset));
11420 unsigned Aux =
Op.getConstantOperandVal(8 + OpOffset);
11433 MachineMemOperand *LoadMMO =
M->getMemOperand();
11438 MachinePointerInfo StorePtrI = LoadPtrI;
11462 case Intrinsic::amdgcn_load_to_lds:
11463 case Intrinsic::amdgcn_global_load_lds: {
11464 if (!Subtarget->hasVMemToLDSLoad())
11468 unsigned Size =
Op->getConstantOperandVal(4);
11473 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11476 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11479 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11482 if (!Subtarget->hasLDSLoadB96_B128())
11484 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11487 if (!Subtarget->hasLDSLoadB96_B128())
11489 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11505 if (
LHS->isDivergent())
11509 RHS.getOperand(0).getValueType() == MVT::i32) {
11512 VOffset =
RHS.getOperand(0);
11516 Ops.push_back(Addr);
11524 Ops.push_back(VOffset);
11527 Ops.push_back(
Op.getOperand(5));
11529 unsigned Aux =
Op.getConstantOperandVal(6);
11537 MachineMemOperand *LoadMMO =
M->getMemOperand();
11539 LoadPtrI.
Offset =
Op->getConstantOperandVal(5);
11540 MachinePointerInfo StorePtrI = LoadPtrI;
11559 case Intrinsic::amdgcn_end_cf:
11561 Op->getOperand(2), Chain),
11563 case Intrinsic::amdgcn_s_barrier_init:
11564 case Intrinsic::amdgcn_s_barrier_signal_var: {
11571 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11572 ? AMDGPU::S_BARRIER_INIT_M0
11573 : AMDGPU::S_BARRIER_SIGNAL_M0;
11588 constexpr unsigned ShAmt = 16;
11595 Ops.push_back(
copyToM0(DAG, Chain,
DL, M0Val).getValue(0));
11600 case Intrinsic::amdgcn_s_wakeup_barrier: {
11601 if (!Subtarget->hasSWakeupBarrier())
11605 case Intrinsic::amdgcn_s_barrier_join: {
11614 switch (IntrinsicID) {
11617 case Intrinsic::amdgcn_s_barrier_join:
11618 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11620 case Intrinsic::amdgcn_s_wakeup_barrier:
11621 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
11625 unsigned BarID = (BarVal >> 4) & 0x3F;
11628 Ops.push_back(Chain);
11630 switch (IntrinsicID) {
11633 case Intrinsic::amdgcn_s_barrier_join:
11634 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11636 case Intrinsic::amdgcn_s_wakeup_barrier:
11637 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
11648 Ops.push_back(
copyToM0(DAG, Chain,
DL, M0Val).getValue(0));
11654 case Intrinsic::amdgcn_s_prefetch_data: {
11657 return Op.getOperand(0);
11660 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11662 Chain, bufferRsrcPtrToVector(
Op.getOperand(2), DAG),
11669 Op->getVTList(),
Ops,
M->getMemoryVT(),
11670 M->getMemOperand());
11672 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11673 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11674 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11683 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11685 return lowerImage(
Op, ImageDimIntr, DAG,
true);
11701 return PtrVT == MVT::i64;
11715std::pair<SDValue, SDValue>
11745 unsigned Overflow = ImmOffset & ~MaxImm;
11746 ImmOffset -= Overflow;
11747 if ((int32_t)Overflow < 0) {
11748 Overflow += ImmOffset;
11753 auto OverflowVal = DAG.
getConstant(Overflow,
DL, MVT::i32);
11772void SITargetLowering::setBufferOffsets(
SDValue CombinedOffset,
11774 Align Alignment)
const {
11776 SDLoc
DL(CombinedOffset);
11778 uint32_t
Imm =
C->getZExtValue();
11779 uint32_t SOffset, ImmOffset;
11780 if (
TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
11790 uint32_t SOffset, ImmOffset;
11793 TII->splitMUBUFOffset(
Offset, SOffset, ImmOffset, Alignment)) {
11801 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
11810SDValue SITargetLowering::bufferRsrcPtrToVector(
SDValue MaybePointer,
11813 return MaybePointer;
11827 SDValue NumRecords =
Op->getOperand(3);
11833 if (Subtarget->has45BitNumRecordsBufferResource()) {
11852 SDValue ExtShiftedStrideVec =
11864 DAG.
getNode(
ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
11866 DAG.
getNode(
ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
11871 auto [LowHalf, HighHalf] =
11872 DAG.
SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11882 NumRecords, Flags);
11894 bool IsTFE)
const {
11899 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
11900 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
11903 SDVTList VTs = DAG.
getVTList(MVT::v2i32, MVT::Other);
11915 ? AMDGPUISD::BUFFER_LOAD_UBYTE
11916 : AMDGPUISD::BUFFER_LOAD_USHORT;
11918 SDVTList ResList = DAG.
getVTList(MVT::i32, MVT::Other);
11932 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11936 Ops[1] = BufferStoreExt;
11937 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
11938 : AMDGPUISD::BUFFER_STORE_SHORT;
11941 M->getMemOperand());
11966 DAGCombinerInfo &DCI)
const {
11967 SelectionDAG &DAG = DCI.DAG;
11982 if ((MemVT.
isSimple() && !DCI.isAfterLegalizeDAG()) ||
11989 "unexpected vector extload");
12002 "unexpected fp extload");
12020 DCI.AddToWorklist(Cvt.
getNode());
12025 DCI.AddToWorklist(Cvt.
getNode());
12036 if (
Info.isEntryFunction())
12037 return Info.getUserSGPRInfo().hasFlatScratchInit();
12045 EVT MemVT =
Load->getMemoryVT();
12046 MachineMemOperand *MMO =
Load->getMemOperand();
12058 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
12086 assert(
Op.getValueType().getVectorElementType() == MVT::i32 &&
12087 "Custom lowering for non-i32 vectors hasn't been implemented.");
12090 unsigned AS =
Load->getAddressSpace();
12097 SIMachineFunctionInfo *MFI = MF.
getInfo<SIMachineFunctionInfo>();
12101 !Subtarget->hasMultiDwordFlatScratchAddressing())
12111 Subtarget->getScalarizeGlobalBehavior() &&
Load->isSimple() &&
12114 Alignment >=
Align(4) && NumElements < 32) {
12116 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
12128 if (NumElements > 4)
12131 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12141 switch (Subtarget->getMaxPrivateElementSize()) {
12147 if (NumElements > 2)
12152 if (NumElements > 4)
12155 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12164 auto Flags =
Load->getMemOperand()->getFlags();
12166 Load->getAlign(), Flags, &
Fast) &&
12175 MemVT, *
Load->getMemOperand())) {
12184 EVT VT =
Op.getValueType();
12221 EVT VT =
Op.getValueType();
12222 const SDNodeFlags
Flags =
Op->getFlags();
12224 bool AllowInaccurateRcp =
Flags.hasApproximateFuncs();
12230 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
12233 if (CLHS->isExactlyValue(1.0)) {
12246 return DAG.
getNode(AMDGPUISD::RCP, SL, VT,
RHS);
12250 if (CLHS->isExactlyValue(-1.0)) {
12253 return DAG.
getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
12259 if (!AllowInaccurateRcp &&
12260 ((VT != MVT::f16 && VT != MVT::bf16) || !
Flags.hasAllowReciprocal()))
12274 EVT VT =
Op.getValueType();
12275 const SDNodeFlags
Flags =
Op->getFlags();
12277 bool AllowInaccurateDiv =
Flags.hasApproximateFuncs();
12278 if (!AllowInaccurateDiv)
12299 return DAG.
getNode(Opcode, SL, VT,
A,
B, Flags);
12309 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12313 return DAG.
getNode(Opcode, SL, VTList,
12322 return DAG.
getNode(Opcode, SL, VT, {
A,
B,
C}, Flags);
12332 Opcode = AMDGPUISD::FMA_W_CHAIN;
12336 return DAG.
getNode(Opcode, SL, VTList,
12342 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
12343 return FastLowered;
12346 EVT VT =
Op.getValueType();
12353 if (VT == MVT::bf16) {
12376 unsigned FMADOpCode =
12380 DAG.
getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt,
Op->getFlags());
12383 SDValue Err = DAG.
getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12385 Quot = DAG.
getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot,
Op->getFlags());
12386 Err = DAG.
getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12396 return DAG.
getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst,
RHS,
LHS,
12402 SDNodeFlags
Flags =
Op->getFlags();
12412 const APFloat K0Val(0x1p+96f);
12415 const APFloat K1Val(0x1p-32f);
12442 assert(ST->hasDenormModeInst() &&
"Requires S_DENORM_MODE");
12443 uint32_t DPDenormModeDefault =
Info->getMode().fpDenormModeDPValue();
12444 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
12449 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
12450 return FastLowered;
12456 SDNodeFlags
Flags =
Op->getFlags();
12457 Flags.setNoFPExcept(
true);
12465 SDVTList ScaleVT = DAG.
getVTList(MVT::f32, MVT::i1);
12474 DAG.
getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
12478 using namespace AMDGPU::Hwreg;
12479 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
12483 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
12484 const DenormalMode DenormMode =
Info->getMode().FP32Denormals;
12487 const bool HasDynamicDenormals =
12493 if (!PreservesDenormals) {
12498 SDVTList BindParamVTs = DAG.
getVTList(MVT::Other, MVT::Glue);
12501 if (HasDynamicDenormals) {
12505 SavedDenormMode =
SDValue(GetReg, 0);
12511 SDNode *EnableDenorm;
12512 if (Subtarget->hasDenormModeInst()) {
12513 const SDValue EnableDenormValue =
12516 EnableDenorm = DAG.
getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
12520 const SDValue EnableDenormValue =
12522 EnableDenorm = DAG.
getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12523 {EnableDenormValue,
BitField, Glue});
12533 ApproxRcp, One, NegDivScale0, Flags);
12536 ApproxRcp, Fma0, Flags);
12542 NumeratorScaled,
Mul, Flags);
12548 NumeratorScaled, Fma3, Flags);
12550 if (!PreservesDenormals) {
12551 SDNode *DisableDenorm;
12552 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12556 SDVTList BindParamVTs = DAG.
getVTList(MVT::Other, MVT::Glue);
12558 DAG.
getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
12562 assert(HasDynamicDenormals == (
bool)SavedDenormMode);
12563 const SDValue DisableDenormValue =
12564 HasDynamicDenormals
12569 AMDGPU::S_SETREG_B32, SL, MVT::Other,
12580 {Fma4, Fma1, Fma3, Scale},
Flags);
12582 return DAG.
getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas,
RHS,
LHS, Flags);
12586 if (
SDValue FastLowered = lowerFastUnsafeFDIV64(
Op, DAG))
12587 return FastLowered;
12595 SDVTList ScaleVT = DAG.
getVTList(MVT::f64, MVT::i1);
12601 SDValue Rcp = DAG.
getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
12619 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12649 DAG.
getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3,
Mul, Scale);
12651 return DAG.
getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas,
Y,
X);
12655 EVT VT =
Op.getValueType();
12657 if (VT == MVT::f32)
12658 return LowerFDIV32(
Op, DAG);
12660 if (VT == MVT::f64)
12661 return LowerFDIV64(
Op, DAG);
12663 if (VT == MVT::f16 || VT == MVT::bf16)
12664 return LowerFDIV16(
Op, DAG);
12673 EVT ResultExpVT =
Op->getValueType(1);
12674 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12684 if (Subtarget->hasFractBug()) {
12702 EVT VT =
Store->getMemoryVT();
12704 if (VT == MVT::i1) {
12708 Store->getBasePtr(), MVT::i1,
Store->getMemOperand());
12712 Store->getValue().getValueType().getScalarType() == MVT::i32);
12714 unsigned AS =
Store->getAddressSpace();
12722 SIMachineFunctionInfo *MFI = MF.
getInfo<SIMachineFunctionInfo>();
12726 !Subtarget->hasMultiDwordFlatScratchAddressing())
12733 if (NumElements > 4)
12736 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12740 VT, *
Store->getMemOperand()))
12746 switch (Subtarget->getMaxPrivateElementSize()) {
12750 if (NumElements > 2)
12754 if (NumElements > 4 ||
12755 (NumElements == 3 && !Subtarget->enableFlatScratch()))
12763 auto Flags =
Store->getMemOperand()->getFlags();
12782 assert(!Subtarget->has16BitInsts());
12783 SDNodeFlags
Flags =
Op->getFlags();
12797 SDNodeFlags
Flags =
Op->getFlags();
12798 MVT VT =
Op.getValueType().getSimpleVT();
12906 SDNodeFlags
Flags =
Op->getFlags();
12969 EVT VT =
Op.getValueType();
12979 if (Subtarget->hasTrigReducedRange()) {
12981 TrigVal = DAG.
getNode(AMDGPUISD::FRACT,
DL, VT, MulVal, Flags);
12986 switch (
Op.getOpcode()) {
12988 return DAG.
getNode(AMDGPUISD::COS_HW, SDLoc(
Op), VT, TrigVal, Flags);
12990 return DAG.
getNode(AMDGPUISD::SIN_HW, SDLoc(
Op), VT, TrigVal, Flags);
13013 EVT VT =
Op.getValueType();
13021 Op->getVTList(),
Ops, VT,
13030SITargetLowering::performUCharToFloatCombine(
SDNode *
N,
13031 DAGCombinerInfo &DCI)
const {
13032 EVT VT =
N->getValueType(0);
13034 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
13037 SelectionDAG &DAG = DCI.DAG;
13041 EVT SrcVT = Src.getValueType();
13047 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
13050 DCI.AddToWorklist(Cvt.
getNode());
13053 if (ScalarVT != MVT::f32) {
13065 DAGCombinerInfo &DCI)
const {
13076 SelectionDAG &DAG = DCI.DAG;
13095 for (
unsigned I = 0;
I != NumElts; ++
I) {
13119 if (NewElts.
size() == 1)
13141 for (
unsigned I = 0;
I != NumElts; ++
I) {
13176SDValue SITargetLowering::performSHLPtrCombine(
SDNode *
N,
unsigned AddrSpace,
13178 DAGCombinerInfo &DCI)
const {
13195 SelectionDAG &DAG = DCI.DAG;
13208 AM.BaseOffs =
Offset.getSExtValue();
13213 EVT VT =
N->getValueType(0);
13219 Flags.setNoUnsignedWrap(
13220 N->getFlags().hasNoUnsignedWrap() &&
13232 switch (
N->getOpcode()) {
13243 DAGCombinerInfo &DCI)
const {
13244 SelectionDAG &DAG = DCI.DAG;
13251 SDValue NewPtr = performSHLPtrCombine(Ptr.
getNode(),
N->getAddressSpace(),
13252 N->getMemoryVT(), DCI);
13256 NewOps[PtrIdx] = NewPtr;
13265 return (
Opc ==
ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13266 (
Opc ==
ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13275SDValue SITargetLowering::splitBinaryBitConstantOp(
13279 uint32_t ValLo =
Lo_32(Val);
13280 uint32_t ValHi =
Hi_32(Val);
13287 if (Subtarget->has64BitLiterals() && CRHS->
hasOneUse() &&
13301 if (V.getValueType() != MVT::i1)
13303 switch (V.getOpcode()) {
13308 case AMDGPUISD::FP_CLASS:
13320 return V.getResNo() == 1;
13322 unsigned IntrinsicID = V.getConstantOperandVal(0);
13323 switch (IntrinsicID) {
13324 case Intrinsic::amdgcn_is_shared:
13325 case Intrinsic::amdgcn_is_private:
13342 if (!(
C & 0x000000ff))
13343 ZeroByteMask |= 0x000000ff;
13344 if (!(
C & 0x0000ff00))
13345 ZeroByteMask |= 0x0000ff00;
13346 if (!(
C & 0x00ff0000))
13347 ZeroByteMask |= 0x00ff0000;
13348 if (!(
C & 0xff000000))
13349 ZeroByteMask |= 0xff000000;
13350 uint32_t NonZeroByteMask = ~ZeroByteMask;
13351 if ((NonZeroByteMask &
C) != NonZeroByteMask)
13364 assert(V.getValueSizeInBits() == 32);
13366 if (V.getNumOperands() != 2)
13375 switch (V.getOpcode()) {
13380 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13385 return (0x03020100 & ~ConstMask) | ConstMask;
13392 return uint32_t((0x030201000c0c0c0cull <<
C) >> 32);
13398 return uint32_t(0x0c0c0c0c03020100ull >>
C);
13405 DAGCombinerInfo &DCI)
const {
13406 if (DCI.isBeforeLegalize())
13409 SelectionDAG &DAG = DCI.DAG;
13410 EVT VT =
N->getValueType(0);
13415 if (VT == MVT::i64 && CRHS) {
13417 splitBinaryBitConstantOp(DCI, SDLoc(
N),
ISD::AND,
LHS, CRHS))
13421 if (CRHS && VT == MVT::i32) {
13431 unsigned Shift = CShift->getZExtValue();
13433 unsigned Offset = NB + Shift;
13434 if ((
Offset & (Bits - 1)) == 0) {
13437 DAG.
getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
LHS->getOperand(0),
13458 Sel = (
LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
13460 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
13473 if (
Y.getOpcode() !=
ISD::FABS ||
Y.getOperand(0) !=
X ||
13478 if (
X !=
LHS.getOperand(1))
13482 const ConstantFPSDNode *C1 =
13499 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, MVT::i1,
X,
13505 if (
RHS.getOpcode() ==
ISD::SETCC &&
LHS.getOpcode() == AMDGPUISD::FP_CLASS)
13508 if (
LHS.getOpcode() ==
ISD::SETCC &&
RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13516 (
RHS.getOperand(0) ==
LHS.getOperand(0) &&
13517 LHS.getOperand(0) ==
LHS.getOperand(1))) {
13519 unsigned NewMask = LCC ==
ISD::SETO ?
Mask->getZExtValue() & ~OrdMask
13520 :
Mask->getZExtValue() & OrdMask;
13523 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, MVT::i1,
RHS.getOperand(0),
13541 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13544 if (LHSMask != ~0u && RHSMask != ~0u) {
13547 if (LHSMask > RHSMask) {
13554 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13555 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13558 if (!(LHSUsedLanes & RHSUsedLanes) &&
13561 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13567 uint32_t
Mask = LHSMask & RHSMask;
13568 for (
unsigned I = 0;
I < 32;
I += 8) {
13569 uint32_t ByteSel = 0xff <<
I;
13570 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13571 Mask &= (0x0c <<
I) & 0xffffffff;
13576 uint32_t Sel =
Mask | (LHSUsedLanes & 0x04040404);
13579 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
13629static const std::optional<ByteProvider<SDValue>>
13631 unsigned Depth = 0) {
13634 return std::nullopt;
13636 if (
Op.getValueSizeInBits() < 8)
13637 return std::nullopt;
13639 if (
Op.getValueType().isVector())
13642 switch (
Op->getOpcode()) {
13654 NarrowVT = VTSign->getVT();
13657 return std::nullopt;
13660 if (SrcIndex >= NarrowByteWidth)
13661 return std::nullopt;
13669 return std::nullopt;
13671 uint64_t BitShift = ShiftOp->getZExtValue();
13673 if (BitShift % 8 != 0)
13674 return std::nullopt;
13676 SrcIndex += BitShift / 8;
13694static const std::optional<ByteProvider<SDValue>>
13696 unsigned StartingIndex = 0) {
13700 return std::nullopt;
13702 unsigned BitWidth =
Op.getScalarValueSizeInBits();
13704 return std::nullopt;
13706 return std::nullopt;
13708 bool IsVec =
Op.getValueType().isVector();
13709 switch (
Op.getOpcode()) {
13712 return std::nullopt;
13717 return std::nullopt;
13721 return std::nullopt;
13724 if (!
LHS->isConstantZero() && !
RHS->isConstantZero())
13725 return std::nullopt;
13726 if (!
LHS ||
LHS->isConstantZero())
13728 if (!
RHS ||
RHS->isConstantZero())
13730 return std::nullopt;
13735 return std::nullopt;
13739 return std::nullopt;
13741 uint32_t BitMask = BitMaskOp->getZExtValue();
13743 uint32_t IndexMask = 0xFF << (Index * 8);
13745 if ((IndexMask & BitMask) != IndexMask) {
13748 if (IndexMask & BitMask)
13749 return std::nullopt;
13758 return std::nullopt;
13762 if (!ShiftOp ||
Op.getValueType().isVector())
13763 return std::nullopt;
13765 uint64_t BitsProvided =
Op.getValueSizeInBits();
13766 if (BitsProvided % 8 != 0)
13767 return std::nullopt;
13769 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13771 return std::nullopt;
13773 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13774 uint64_t ByteShift = BitShift / 8;
13776 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13777 uint64_t BytesProvided = BitsProvided / 8;
13778 SDValue NextOp =
Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13779 NewIndex %= BytesProvided;
13786 return std::nullopt;
13790 return std::nullopt;
13792 uint64_t BitShift = ShiftOp->getZExtValue();
13794 return std::nullopt;
13796 auto BitsProvided =
Op.getScalarValueSizeInBits();
13797 if (BitsProvided % 8 != 0)
13798 return std::nullopt;
13800 uint64_t BytesProvided = BitsProvided / 8;
13801 uint64_t ByteShift = BitShift / 8;
13806 return BytesProvided - ByteShift > Index
13814 return std::nullopt;
13818 return std::nullopt;
13820 uint64_t BitShift = ShiftOp->getZExtValue();
13821 if (BitShift % 8 != 0)
13822 return std::nullopt;
13823 uint64_t ByteShift = BitShift / 8;
13829 return Index < ByteShift
13832 Depth + 1, StartingIndex);
13841 return std::nullopt;
13849 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13851 if (NarrowBitWidth % 8 != 0)
13852 return std::nullopt;
13853 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13855 if (Index >= NarrowByteWidth)
13857 ? std::optional<ByteProvider<SDValue>>(
13865 return std::nullopt;
13869 if (NarrowByteWidth >= Index) {
13874 return std::nullopt;
13881 return std::nullopt;
13887 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13888 if (NarrowBitWidth % 8 != 0)
13889 return std::nullopt;
13890 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13895 if (Index >= NarrowByteWidth) {
13897 ? std::optional<ByteProvider<SDValue>>(
13902 if (NarrowByteWidth > Index) {
13906 return std::nullopt;
13911 return std::nullopt;
13914 Depth + 1, StartingIndex);
13920 return std::nullopt;
13921 auto VecIdx = IdxOp->getZExtValue();
13922 auto ScalarSize =
Op.getScalarValueSizeInBits();
13923 if (ScalarSize < 32)
13924 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13926 StartingIndex, Index);
13929 case AMDGPUISD::PERM: {
13931 return std::nullopt;
13935 return std::nullopt;
13938 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13939 if (IdxMask > 0x07 && IdxMask != 0x0c)
13940 return std::nullopt;
13942 auto NextOp =
Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13943 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13945 return IdxMask != 0x0c ?
calculateSrcByte(NextOp, StartingIndex, NextIndex)
13951 return std::nullopt;
13966 return !OpVT.
isVector() && OpVT.getSizeInBits() == 16;
13973 auto MemVT = L->getMemoryVT();
13976 return L->getMemoryVT().getSizeInBits() == 16;
13986 int Low8 = Mask & 0xff;
13987 int Hi8 = (Mask & 0xff00) >> 8;
13989 assert(Low8 < 8 && Hi8 < 8);
13991 bool IsConsecutive = (Hi8 - Low8 == 1);
13996 bool Is16Aligned = !(Low8 % 2);
13998 return IsConsecutive && Is16Aligned;
14006 int Low16 = PermMask & 0xffff;
14007 int Hi16 = (PermMask & 0xffff0000) >> 16;
14017 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
14019 if (!OtherOpIs16Bit)
14027 unsigned DWordOffset) {
14032 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
14037 if (Src.getValueType().isVector()) {
14038 auto ScalarTySize = Src.getScalarValueSizeInBits();
14039 auto ScalarTy = Src.getValueType().getScalarType();
14040 if (ScalarTySize == 32) {
14044 if (ScalarTySize > 32) {
14047 DAG.
getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
14048 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
14055 assert(ScalarTySize < 32);
14056 auto NumElements =
TypeSize / ScalarTySize;
14057 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
14058 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
14059 auto NumElementsIn32 = 32 / ScalarTySize;
14060 auto NumAvailElements = DWordOffset < Trunc32Elements
14062 : NumElements - NormalizedTrunc;
14075 auto ShiftVal = 32 * DWordOffset;
14083 [[maybe_unused]]
EVT VT =
N->getValueType(0);
14088 for (
int i = 0; i < 4; i++) {
14090 std::optional<ByteProvider<SDValue>>
P =
14093 if (!
P ||
P->isConstantZero())
14098 if (PermNodes.
size() != 4)
14101 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
14102 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
14104 for (
size_t i = 0; i < PermNodes.
size(); i++) {
14105 auto PermOp = PermNodes[i];
14108 int SrcByteAdjust = 4;
14112 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
14113 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
14115 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
14116 ((PermOp.SrcOffset / 4) != SecondSrc->second))
14120 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
14121 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
14124 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
14126 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
14129 SDValue Op = *PermNodes[FirstSrc.first].Src;
14131 assert(
Op.getValueSizeInBits() == 32);
14135 int Low16 = PermMask & 0xffff;
14136 int Hi16 = (PermMask & 0xffff0000) >> 16;
14138 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
14139 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
14142 if (WellFormedLow && WellFormedHi)
14146 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src :
Op;
14155 (
N->getOperand(0) ==
Op ||
N->getOperand(0) == OtherOp) &&
14156 (
N->getOperand(1) ==
Op ||
N->getOperand(1) == OtherOp))
14161 assert(
Op.getValueType().isByteSized() &&
14172 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
Op, OtherOp,
14179 DAGCombinerInfo &DCI)
const {
14180 SelectionDAG &DAG = DCI.DAG;
14184 EVT VT =
N->getValueType(0);
14185 if (VT == MVT::i1) {
14187 if (
LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
14188 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
14190 if (Src !=
RHS.getOperand(0))
14195 if (!CLHS || !CRHS)
14199 static const uint32_t MaxMask = 0x3ff;
14204 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, MVT::i1, Src,
14213 LHS.getOpcode() == AMDGPUISD::PERM &&
14219 Sel |=
LHS.getConstantOperandVal(2);
14221 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
14228 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14232 auto usesCombinedOperand = [](SDNode *OrUse) {
14235 !OrUse->getValueType(0).isVector())
14239 for (
auto *VUser : OrUse->users()) {
14240 if (!VUser->getValueType(0).isVector())
14247 if (VUser->getOpcode() == VectorwiseOp)
14253 if (!
any_of(
N->users(), usesCombinedOperand))
14259 if (LHSMask != ~0u && RHSMask != ~0u) {
14262 if (LHSMask > RHSMask) {
14269 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14270 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14273 if (!(LHSUsedLanes & RHSUsedLanes) &&
14276 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14278 LHSMask &= ~RHSUsedLanes;
14279 RHSMask &= ~LHSUsedLanes;
14281 LHSMask |= LHSUsedLanes & 0x04040404;
14283 uint32_t Sel = LHSMask | RHSMask;
14286 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
14291 if (LHSMask == ~0u || RHSMask == ~0u) {
14332 return IdentitySrc;
14338 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14353 if (SrcVT == MVT::i32) {
14358 DCI.AddToWorklist(LowOr.
getNode());
14359 DCI.AddToWorklist(HiBits.getNode());
14370 N->getOperand(0), CRHS))
14378 DAGCombinerInfo &DCI)
const {
14379 if (
SDValue RV = reassociateScalarOps(
N, DCI.DAG))
14386 SelectionDAG &DAG = DCI.DAG;
14388 EVT VT =
N->getValueType(0);
14389 if (CRHS && VT == MVT::i64) {
14391 splitBinaryBitConstantOp(DCI, SDLoc(
N),
ISD::XOR,
LHS, CRHS))
14398 unsigned Opc =
LHS.getOpcode();
14428 LHS->getOperand(0), FNegLHS, FNegRHS);
14437 DAGCombinerInfo &DCI)
const {
14438 if (!Subtarget->has16BitInsts() ||
14442 EVT VT =
N->getValueType(0);
14443 if (VT != MVT::i32)
14447 if (Src.getValueType() != MVT::i16)
14454SITargetLowering::performSignExtendInRegCombine(
SDNode *
N,
14455 DAGCombinerInfo &DCI)
const {
14461 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
14462 VTSign->getVT() == MVT::i8) ||
14463 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
14464 VTSign->getVT() == MVT::i16))) {
14465 assert(Subtarget->hasScalarSubwordLoads() &&
14466 "s_buffer_load_{u8, i8} are supported "
14467 "in GFX12 (or newer) architectures.");
14468 EVT VT = Src.getValueType();
14469 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
14470 ? AMDGPUISD::SBUFFER_LOAD_BYTE
14471 : AMDGPUISD::SBUFFER_LOAD_SHORT;
14473 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
14480 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
14481 Opc,
DL, ResList,
Ops,
M->getMemoryVT(),
M->getMemOperand());
14485 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
14486 VTSign->getVT() == MVT::i8) ||
14487 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
14488 VTSign->getVT() == MVT::i16)) &&
14497 Src.getOperand(6), Src.getOperand(7)};
14500 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
14501 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
14502 ? AMDGPUISD::BUFFER_LOAD_BYTE
14503 : AMDGPUISD::BUFFER_LOAD_SHORT;
14504 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
14505 Opc, SDLoc(
N), ResList,
Ops,
M->getMemoryVT(),
M->getMemOperand());
14506 return DCI.DAG.getMergeValues(
14507 {BufferLoadSignExt, BufferLoadSignExt.
getValue(1)}, SDLoc(
N));
14513 DAGCombinerInfo &DCI)
const {
14514 SelectionDAG &DAG = DCI.DAG;
14521 if (
N->getOperand(0).isUndef())
14528 DAGCombinerInfo &DCI)
const {
14529 EVT VT =
N->getValueType(0);
14539 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(
N), VT, N0,
14546 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(
N), VT, N0.
getOperand(0),
14554 unsigned MaxDepth)
const {
14555 unsigned Opcode =
Op.getOpcode();
14560 const auto &
F = CFP->getValueAPF();
14561 if (
F.isNaN() &&
F.isSignaling())
14563 if (!
F.isDenormal())
14595 case AMDGPUISD::FMUL_LEGACY:
14596 case AMDGPUISD::FMAD_FTZ:
14597 case AMDGPUISD::RCP:
14598 case AMDGPUISD::RSQ:
14599 case AMDGPUISD::RSQ_CLAMP:
14600 case AMDGPUISD::RCP_LEGACY:
14601 case AMDGPUISD::RCP_IFLAG:
14602 case AMDGPUISD::LOG:
14603 case AMDGPUISD::EXP:
14604 case AMDGPUISD::DIV_SCALE:
14605 case AMDGPUISD::DIV_FMAS:
14606 case AMDGPUISD::DIV_FIXUP:
14607 case AMDGPUISD::FRACT:
14608 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14609 case AMDGPUISD::CVT_F32_UBYTE0:
14610 case AMDGPUISD::CVT_F32_UBYTE1:
14611 case AMDGPUISD::CVT_F32_UBYTE2:
14612 case AMDGPUISD::CVT_F32_UBYTE3:
14613 case AMDGPUISD::FP_TO_FP16:
14614 case AMDGPUISD::SIN_HW:
14615 case AMDGPUISD::COS_HW:
14626 if (
Op.getValueType() == MVT::i32) {
14632 if (RHS->getZExtValue() == 0xffff0000) {
14642 return Op.getValueType().getScalarType() != MVT::f16;
14652 case AMDGPUISD::CLAMP:
14653 case AMDGPUISD::FMED3:
14654 case AMDGPUISD::FMAX3:
14655 case AMDGPUISD::FMIN3:
14656 case AMDGPUISD::FMAXIMUM3:
14657 case AMDGPUISD::FMINIMUM3: {
14663 if (Subtarget->supportsMinMaxDenormModes() ||
14673 for (
unsigned I = 0, E =
Op.getNumOperands();
I != E; ++
I) {
14685 for (
unsigned i = 0, e =
Op.getNumOperands(); i != e; ++i) {
14712 if (
Op.getValueType() == MVT::i16) {
14723 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
14725 switch (IntrinsicID) {
14726 case Intrinsic::amdgcn_cvt_pkrtz:
14727 case Intrinsic::amdgcn_cubeid:
14728 case Intrinsic::amdgcn_frexp_mant:
14729 case Intrinsic::amdgcn_fdot2:
14730 case Intrinsic::amdgcn_rcp:
14731 case Intrinsic::amdgcn_rsq:
14732 case Intrinsic::amdgcn_rsq_clamp:
14733 case Intrinsic::amdgcn_rcp_legacy:
14734 case Intrinsic::amdgcn_rsq_legacy:
14735 case Intrinsic::amdgcn_trig_preop:
14736 case Intrinsic::amdgcn_tanh:
14737 case Intrinsic::amdgcn_log:
14738 case Intrinsic::amdgcn_exp2:
14739 case Intrinsic::amdgcn_sqrt:
14757 unsigned MaxDepth)
const {
14760 unsigned Opcode =
MI->getOpcode();
14762 if (Opcode == AMDGPU::G_FCANONICALIZE)
14765 std::optional<FPValueAndVReg> FCR;
14768 if (FCR->Value.isSignaling())
14770 if (!FCR->Value.isDenormal())
14781 case AMDGPU::G_FADD:
14782 case AMDGPU::G_FSUB:
14783 case AMDGPU::G_FMUL:
14784 case AMDGPU::G_FCEIL:
14785 case AMDGPU::G_FFLOOR:
14786 case AMDGPU::G_FRINT:
14787 case AMDGPU::G_FNEARBYINT:
14788 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
14789 case AMDGPU::G_INTRINSIC_TRUNC:
14790 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
14791 case AMDGPU::G_FMA:
14792 case AMDGPU::G_FMAD:
14793 case AMDGPU::G_FSQRT:
14794 case AMDGPU::G_FDIV:
14795 case AMDGPU::G_FREM:
14796 case AMDGPU::G_FPOW:
14797 case AMDGPU::G_FPEXT:
14798 case AMDGPU::G_FLOG:
14799 case AMDGPU::G_FLOG2:
14800 case AMDGPU::G_FLOG10:
14801 case AMDGPU::G_FPTRUNC:
14802 case AMDGPU::G_AMDGPU_RCP_IFLAG:
14803 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
14804 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
14805 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
14806 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14808 case AMDGPU::G_FNEG:
14809 case AMDGPU::G_FABS:
14810 case AMDGPU::G_FCOPYSIGN:
14812 case AMDGPU::G_FMINNUM:
14813 case AMDGPU::G_FMAXNUM:
14814 case AMDGPU::G_FMINNUM_IEEE:
14815 case AMDGPU::G_FMAXNUM_IEEE:
14816 case AMDGPU::G_FMINIMUM:
14817 case AMDGPU::G_FMAXIMUM:
14818 case AMDGPU::G_FMINIMUMNUM:
14819 case AMDGPU::G_FMAXIMUMNUM: {
14820 if (Subtarget->supportsMinMaxDenormModes() ||
14827 case AMDGPU::G_BUILD_VECTOR:
14832 case AMDGPU::G_INTRINSIC:
14833 case AMDGPU::G_INTRINSIC_CONVERGENT:
14835 case Intrinsic::amdgcn_fmul_legacy:
14836 case Intrinsic::amdgcn_fmad_ftz:
14837 case Intrinsic::amdgcn_sqrt:
14838 case Intrinsic::amdgcn_fmed3:
14839 case Intrinsic::amdgcn_sin:
14840 case Intrinsic::amdgcn_cos:
14841 case Intrinsic::amdgcn_log:
14842 case Intrinsic::amdgcn_exp2:
14843 case Intrinsic::amdgcn_log_clamp:
14844 case Intrinsic::amdgcn_rcp:
14845 case Intrinsic::amdgcn_rcp_legacy:
14846 case Intrinsic::amdgcn_rsq:
14847 case Intrinsic::amdgcn_rsq_clamp:
14848 case Intrinsic::amdgcn_rsq_legacy:
14849 case Intrinsic::amdgcn_div_scale:
14850 case Intrinsic::amdgcn_div_fmas:
14851 case Intrinsic::amdgcn_div_fixup:
14852 case Intrinsic::amdgcn_fract:
14853 case Intrinsic::amdgcn_cvt_pkrtz:
14854 case Intrinsic::amdgcn_cubeid:
14855 case Intrinsic::amdgcn_cubema:
14856 case Intrinsic::amdgcn_cubesc:
14857 case Intrinsic::amdgcn_cubetc:
14858 case Intrinsic::amdgcn_frexp_mant:
14859 case Intrinsic::amdgcn_fdot2:
14860 case Intrinsic::amdgcn_trig_preop:
14861 case Intrinsic::amdgcn_tanh:
14880 if (
C.isDenormal()) {
14894 if (
C.isSignaling()) {
14917SITargetLowering::performFCanonicalizeCombine(
SDNode *
N,
14918 DAGCombinerInfo &DCI)
const {
14919 SelectionDAG &DAG = DCI.DAG;
14921 EVT VT =
N->getValueType(0);
14930 EVT VT =
N->getValueType(0);
14931 return getCanonicalConstantFP(DAG, SDLoc(
N), VT, CFP->getValueAPF());
14947 EVT EltVT =
Lo.getValueType();
14950 for (
unsigned I = 0;
I != 2; ++
I) {
14954 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14955 }
else if (
Op.isUndef()) {
14991 return AMDGPUISD::FMAX3;
14993 return AMDGPUISD::FMAXIMUM3;
14995 return AMDGPUISD::SMAX3;
14997 return AMDGPUISD::UMAX3;
15001 return AMDGPUISD::FMIN3;
15003 return AMDGPUISD::FMINIMUM3;
15005 return AMDGPUISD::SMIN3;
15007 return AMDGPUISD::UMIN3;
15028 if (!MinK || !MaxK)
15040 unsigned Med3Opc =
Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
15041 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
15042 return DAG.
getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
15101 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
15107 if (
Info->getMode().DX10Clamp) {
15116 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
15148 case AMDGPUISD::FMIN_LEGACY:
15149 case AMDGPUISD::FMAX_LEGACY:
15150 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.
hasMin3Max3_16()) ||
15161 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.
hasMin3Max3_16());
15170 DAGCombinerInfo &DCI)
const {
15171 SelectionDAG &DAG = DCI.DAG;
15203 if (
SDValue Med3 = performIntMed3ImmCombine(
15208 if (
SDValue Med3 = performIntMed3ImmCombine(
15214 if (
SDValue Med3 = performIntMed3ImmCombine(
15219 if (
SDValue Med3 = performIntMed3ImmCombine(
15232 (
Opc == AMDGPUISD::FMIN_LEGACY &&
15233 Op0.
getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
15234 (VT == MVT::f32 || VT == MVT::f64 ||
15235 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
15236 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
15237 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
15238 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
15240 if (
SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(
N), Op0, Op1))
15247 const SDNodeFlags
Flags =
N->getFlags();
15249 !Subtarget->hasIEEEMinimumMaximumInsts() &&
Flags.hasNoNaNs()) {
15252 return DAG.
getNode(NewOpc, SDLoc(
N), VT, Op0, Op1, Flags);
15262 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
15263 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
15272 DAGCombinerInfo &DCI)
const {
15273 EVT VT =
N->getValueType(0);
15277 SelectionDAG &DAG = DCI.DAG;
15288 return DAG.
getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
15292 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
15296 if (
Info->getMode().DX10Clamp) {
15309 return DAG.
getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
15316 DAGCombinerInfo &DCI)
const {
15320 return DCI.DAG.getUNDEF(
N->getValueType(0));
15328 bool IsDivergentIdx,
15333 unsigned VecSize = EltSize * NumElem;
15336 if (VecSize <= 64 && EltSize < 32)
15345 if (IsDivergentIdx)
15349 unsigned NumInsts = NumElem +
15350 ((EltSize + 31) / 32) * NumElem ;
15354 if (Subtarget->useVGPRIndexMode())
15355 return NumInsts <= 16;
15359 if (Subtarget->hasMovrel())
15360 return NumInsts <= 15;
15366 SDValue Idx =
N->getOperand(
N->getNumOperands() - 1);
15381SITargetLowering::performExtractVectorEltCombine(
SDNode *
N,
15382 DAGCombinerInfo &DCI)
const {
15388 EVT ResVT =
N->getValueType(0);
15412 if (!
C ||
C->getZExtValue() != 0x1f)
15428 if (Vec.
hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15456 DCI.AddToWorklist(Elt0.
getNode());
15457 DCI.AddToWorklist(Elt1.
getNode());
15479 if (!DCI.isBeforeLegalize())
15487 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15490 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15491 unsigned EltIdx = BitIndex / 32;
15492 unsigned LeftoverBitIdx = BitIndex % 32;
15496 DCI.AddToWorklist(Cast.
getNode());
15500 DCI.AddToWorklist(Elt.
getNode());
15503 DCI.AddToWorklist(Srl.
getNode());
15507 DCI.AddToWorklist(Trunc.
getNode());
15509 if (VecEltVT == ResVT) {
15521SITargetLowering::performInsertVectorEltCombine(
SDNode *
N,
15522 DAGCombinerInfo &DCI)
const {
15533 SelectionDAG &DAG = DCI.DAG;
15553 Src.getOperand(0).getValueType() == MVT::f16) {
15554 return Src.getOperand(0);
15558 APFloat Val = CFP->getValueAPF();
15559 bool LosesInfo =
true;
15569 DAGCombinerInfo &DCI)
const {
15570 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15571 "combine only useful on gfx8");
15573 SDValue TruncSrc =
N->getOperand(0);
15574 EVT VT =
N->getValueType(0);
15575 if (VT != MVT::f16)
15578 if (TruncSrc.
getOpcode() != AMDGPUISD::FMED3 ||
15582 SelectionDAG &DAG = DCI.DAG;
15613unsigned SITargetLowering::getFusedOpcode(
const SelectionDAG &DAG,
15615 const SDNode *N1)
const {
15620 if (((VT == MVT::f32 &&
15622 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15642 EVT VT =
N->getValueType(0);
15643 if (VT != MVT::i32 && VT != MVT::i64)
15649 unsigned Opc =
N->getOpcode();
15704 if (!Const ||
Hi_32(Const->getZExtValue()) !=
uint32_t(-1))
15723 DAGCombinerInfo &DCI)
const {
15726 SelectionDAG &DAG = DCI.DAG;
15727 EVT VT =
N->getValueType(0);
15737 if (!
N->isDivergent() && Subtarget->hasSMulHi())
15741 if (NumBits <= 32 || NumBits > 64)
15752 if (!Subtarget->hasFullRate64Ops()) {
15753 unsigned NumUsers = 0;
15754 for (SDNode *User :
LHS->
users()) {
15757 if (!
User->isAnyAdd())
15781 bool MulSignedLo =
false;
15782 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
15791 if (VT != MVT::i64) {
15814 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
15816 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
15817 auto [AccumLo, AccumHi] = DAG.
SplitScalar(Accum, SL, MVT::i32, MVT::i32);
15819 if (!MulLHSUnsigned32) {
15826 if (!MulRHSUnsigned32) {
15837 if (VT != MVT::i64)
15843SITargetLowering::foldAddSub64WithZeroLowBitsTo32(
SDNode *
N,
15844 DAGCombinerInfo &DCI)
const {
15854 SelectionDAG &DAG = DCI.DAG;
15869 unsigned Opcode =
N->getOpcode();
15873 DAG.
getNode(Opcode, SL, MVT::i32,
Hi, ConstHi32,
N->getFlags());
15884static std::optional<ByteProvider<SDValue>>
15887 if (!Byte0 || Byte0->isConstantZero()) {
15888 return std::nullopt;
15891 if (Byte1 && !Byte1->isConstantZero()) {
15892 return std::nullopt;
15898 unsigned FirstCs =
First & 0x0c0c0c0c;
15899 unsigned SecondCs = Second & 0x0c0c0c0c;
15900 unsigned FirstNoCs =
First & ~0x0c0c0c0c;
15901 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
15903 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
15904 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
15905 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
15906 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
15908 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
15932 for (
int BPI = 0; BPI < 2; BPI++) {
15935 BPP = {Src1, Src0};
15937 unsigned ZeroMask = 0x0c0c0c0c;
15938 unsigned FMask = 0xFF << (8 * (3 - Step));
15940 unsigned FirstMask =
15941 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15942 unsigned SecondMask =
15943 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15947 int FirstGroup = -1;
15948 for (
int I = 0;
I < 2;
I++) {
15950 auto MatchesFirst = [&BPP](
DotSrc &IterElt) {
15951 return IterElt.SrcOp == *BPP.first.Src &&
15952 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15956 if (Match != Srcs.
end()) {
15957 Match->PermMask =
addPermMasks(FirstMask, Match->PermMask);
15962 if (FirstGroup != -1) {
15964 auto MatchesSecond = [&BPP](
DotSrc &IterElt) {
15965 return IterElt.SrcOp == *BPP.second.Src &&
15966 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15969 if (Match != Srcs.
end()) {
15970 Match->PermMask =
addPermMasks(SecondMask, Match->PermMask);
15972 Srcs.
push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15980 unsigned ZeroMask = 0x0c0c0c0c;
15981 unsigned FMask = 0xFF << (8 * (3 - Step));
15985 ((Src0.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15989 ((Src1.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15998 if (Srcs.
size() == 1) {
15999 auto *Elt = Srcs.
begin();
16003 if (Elt->PermMask == 0x3020100)
16006 return DAG.
getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
16010 auto *FirstElt = Srcs.
begin();
16011 auto *SecondElt = std::next(FirstElt);
16018 auto FirstMask = FirstElt->PermMask;
16019 auto SecondMask = SecondElt->PermMask;
16021 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
16022 unsigned FirstPlusFour = FirstMask | 0x04040404;
16025 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
16037 FirstElt = std::next(SecondElt);
16038 if (FirstElt == Srcs.
end())
16041 SecondElt = std::next(FirstElt);
16044 if (SecondElt == Srcs.
end()) {
16049 DAG.
getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
16050 DAG.
getConstant(FirstElt->PermMask, SL, MVT::i32)));
16056 return Perms.
size() == 2
16062 for (
auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
16063 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
16064 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
16065 EntryMask += ZeroMask;
16070 auto Opcode =
Op.getOpcode();
16072 return (Opcode ==
ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
16073 Opcode == AMDGPUISD::MUL_I24);
16076static std::optional<bool>
16087 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
16090 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
16092 assert(!(S0IsUnsigned && S0IsSigned));
16093 assert(!(S1IsUnsigned && S1IsSigned));
16101 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
16107 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
16108 return std::nullopt;
16120 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
16121 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
16126 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
16132 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
16133 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
16134 return std::nullopt;
16140 DAGCombinerInfo &DCI)
const {
16141 SelectionDAG &DAG = DCI.DAG;
16142 EVT VT =
N->getValueType(0);
16148 if (Subtarget->hasMad64_32()) {
16149 if (
SDValue Folded = tryFoldToMad64_32(
N, DCI))
16154 if (
SDValue V = reassociateScalarOps(
N, DAG)) {
16158 if (VT == MVT::i64) {
16159 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
16164 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
16166 std::optional<bool> IsSigned;
16172 int ChainLength = 0;
16173 for (
int I = 0;
I < 4;
I++) {
16177 auto Src0 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
16180 auto Src1 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
16185 TempNode->getOperand(MulIdx), *Src0, *Src1,
16186 TempNode->getOperand(MulIdx)->getOperand(0),
16187 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
16191 IsSigned = *IterIsSigned;
16192 if (*IterIsSigned != *IsSigned)
16195 auto AddIdx = 1 - MulIdx;
16198 if (
I == 2 &&
isMul(TempNode->getOperand(AddIdx))) {
16199 Src2s.
push_back(TempNode->getOperand(AddIdx));
16209 TempNode->getOperand(AddIdx), *Src0, *Src1,
16210 TempNode->getOperand(AddIdx)->getOperand(0),
16211 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
16215 if (*IterIsSigned != *IsSigned)
16219 ChainLength =
I + 2;
16223 TempNode = TempNode->getOperand(AddIdx);
16225 ChainLength =
I + 1;
16226 if (TempNode->getNumOperands() < 2)
16228 LHS = TempNode->getOperand(0);
16229 RHS = TempNode->getOperand(1);
16232 if (ChainLength < 2)
16238 if (ChainLength < 4) {
16248 bool UseOriginalSrc =
false;
16249 if (ChainLength == 4 && Src0s.
size() == 1 && Src1s.
size() == 1 &&
16250 Src0s.
begin()->PermMask == Src1s.
begin()->PermMask &&
16251 Src0s.
begin()->SrcOp.getValueSizeInBits() >= 32 &&
16252 Src1s.
begin()->SrcOp.getValueSizeInBits() >= 32) {
16253 SmallVector<unsigned, 4> SrcBytes;
16254 auto Src0Mask = Src0s.
begin()->PermMask;
16255 SrcBytes.
push_back(Src0Mask & 0xFF000000);
16256 bool UniqueEntries =
true;
16257 for (
auto I = 1;
I < 4;
I++) {
16258 auto NextByte = Src0Mask & (0xFF << ((3 -
I) * 8));
16261 UniqueEntries =
false;
16267 if (UniqueEntries) {
16268 UseOriginalSrc =
true;
16270 auto *FirstElt = Src0s.
begin();
16274 auto *SecondElt = Src1s.
begin();
16276 SecondElt->DWordOffset);
16285 if (!UseOriginalSrc) {
16292 DAG.
getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
16295 : Intrinsic::amdgcn_udot4,
16305 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16310 unsigned Opc =
LHS.getOpcode();
16322 auto Cond =
RHS.getOperand(0);
16327 SDVTList VTList = DAG.
getVTList(MVT::i32, MVT::i1);
16344 DAGCombinerInfo &DCI)
const {
16345 SelectionDAG &DAG = DCI.DAG;
16347 EVT VT =
N->getValueType(0);
16360 SDNodeFlags ShlFlags = N1->
getFlags();
16364 SDNodeFlags NewShlFlags =
16369 DCI.AddToWorklist(Inner.
getNode());
16376 if (Subtarget->hasMad64_32()) {
16377 if (
SDValue Folded = tryFoldToMad64_32(
N, DCI))
16386 if (VT == MVT::i64) {
16387 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
16400 if (!YIsConstant && !ZIsConstant && !
X->isDivergent() &&
16401 Y->isDivergent() !=
Z->isDivergent()) {
16410 if (
Y->isDivergent())
16413 SDNodeFlags ReassocFlags =
16416 DCI.AddToWorklist(UniformInner.
getNode());
16424 DAGCombinerInfo &DCI)
const {
16425 SelectionDAG &DAG = DCI.DAG;
16426 EVT VT =
N->getValueType(0);
16428 if (VT == MVT::i64) {
16429 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
16433 if (VT != MVT::i32)
16442 unsigned Opc =
RHS.getOpcode();
16449 auto Cond =
RHS.getOperand(0);
16454 SDVTList VTList = DAG.
getVTList(MVT::i32, MVT::i1);
16472SITargetLowering::performAddCarrySubCarryCombine(
SDNode *
N,
16473 DAGCombinerInfo &DCI)
const {
16475 if (
N->getValueType(0) != MVT::i32)
16481 SelectionDAG &DAG = DCI.DAG;
16486 unsigned LHSOpc =
LHS.getOpcode();
16487 unsigned Opc =
N->getOpcode();
16491 return DAG.
getNode(
Opc, SDLoc(
N),
N->getVTList(), Args);
16497 DAGCombinerInfo &DCI)
const {
16501 SelectionDAG &DAG = DCI.DAG;
16502 EVT VT =
N->getValueType(0);
16514 if (
A ==
LHS.getOperand(1)) {
16515 unsigned FusedOp = getFusedOpcode(DAG,
N,
LHS.getNode());
16516 if (FusedOp != 0) {
16518 return DAG.
getNode(FusedOp, SL, VT,
A, Two,
RHS);
16526 if (
A ==
RHS.getOperand(1)) {
16527 unsigned FusedOp = getFusedOpcode(DAG,
N,
RHS.getNode());
16528 if (FusedOp != 0) {
16530 return DAG.
getNode(FusedOp, SL, VT,
A, Two,
LHS);
16539 DAGCombinerInfo &DCI)
const {
16543 SelectionDAG &DAG = DCI.DAG;
16545 EVT VT =
N->getValueType(0);
16558 if (
A ==
LHS.getOperand(1)) {
16559 unsigned FusedOp = getFusedOpcode(DAG,
N,
LHS.getNode());
16560 if (FusedOp != 0) {
16564 return DAG.
getNode(FusedOp, SL, VT,
A, Two, NegRHS);
16573 if (
A ==
RHS.getOperand(1)) {
16574 unsigned FusedOp = getFusedOpcode(DAG,
N,
RHS.getNode());
16575 if (FusedOp != 0) {
16577 return DAG.
getNode(FusedOp, SL, VT,
A, NegTwo,
LHS);
16586 DAGCombinerInfo &DCI)
const {
16587 SelectionDAG &DAG = DCI.DAG;
16589 EVT VT =
N->getValueType(0);
16598 SDNodeFlags
Flags =
N->getFlags();
16599 SDNodeFlags RHSFlags =
RHS->getFlags();
16605 bool IsNegative =
false;
16606 if (CLHS->isExactlyValue(1.0) ||
16607 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16613 DAG.
getNode(AMDGPUISD::RSQ, SL, VT,
RHS.getOperand(0), Flags);
16623 DAGCombinerInfo &DCI)
const {
16624 SelectionDAG &DAG = DCI.DAG;
16625 EVT VT =
N->getValueType(0);
16629 if (!
N->isDivergent() &&
getSubtarget()->hasSALUFloatInsts() &&
16630 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16645 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16650 const ConstantFPSDNode *FalseNode =
16660 if (ScalarVT == MVT::f32 &&
16666 if (TrueNodeExpVal == INT_MIN)
16669 if (FalseNodeExpVal == INT_MIN)
16689 DAGCombinerInfo &DCI)
const {
16690 SelectionDAG &DAG = DCI.DAG;
16691 EVT VT =
N->getValueType(0);
16694 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16712 (
N->getFlags().hasAllowContract() &&
16713 FMA->getFlags().hasAllowContract())) {
16747 if (Vec1 == Vec2 || Vec3 == Vec4)
16753 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16754 return DAG.
getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
16762 DAGCombinerInfo &DCI)
const {
16763 SelectionDAG &DAG = DCI.DAG;
16768 EVT VT =
LHS.getValueType();
16797 return LHS.getOperand(0);
16805 LHS.getConstantOperandVal(1) !=
LHS.getConstantOperandVal(2) &&
16812 const APInt &CT =
LHS.getConstantOperandAPInt(1);
16813 const APInt &CF =
LHS.getConstantOperandAPInt(2);
16821 return LHS.getOperand(0);
16853 DAG.
getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
16858 {Op0Hi, Op1Hi, CarryInHi});
16868 DCI.CombineTo(
LHS.getNode(), Result);
16872 if (VT != MVT::f32 && VT != MVT::f64 &&
16873 (!Subtarget->has16BitInsts() || VT != MVT::f16))
16888 const unsigned IsInfMask =
16890 const unsigned IsFiniteMask =
16895 return DAG.
getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1,
LHS.getOperand(0),
16904SITargetLowering::performCvtF32UByteNCombine(
SDNode *
N,
16905 DAGCombinerInfo &DCI)
const {
16906 SelectionDAG &DAG = DCI.DAG;
16908 unsigned Offset =
N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
16927 unsigned ShiftOffset = 8 *
Offset;
16929 ShiftOffset -=
C->getZExtValue();
16931 ShiftOffset +=
C->getZExtValue();
16933 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16934 return DAG.
getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
16935 MVT::f32, Shifted);
16946 DCI.AddToWorklist(
N);
16953 return DAG.
getNode(
N->getOpcode(), SL, MVT::f32, DemandedSrc);
16959 DAGCombinerInfo &DCI)
const {
16964 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16968 (
F.isNaN() && MF.
getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16969 return DCI.DAG.getConstantFP(Zero, SDLoc(
N),
N->getValueType(0));
16972 APFloat One(
F.getSemantics(),
"1.0");
16974 return DCI.DAG.getConstantFP(One, SDLoc(
N),
N->getValueType(0));
16980 DAGCombinerInfo &DCI)
const {
17001 bool isFloatingPoint =
LHS.getValueType().isFloatingPoint();
17002 bool isInteger =
LHS.getValueType().isInteger();
17005 if (!isFloatingPoint && !isInteger)
17010 if (!isEquality && !isNonEquality)
17027 if (isFloatingPoint) {
17029 if (!Val.
isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
17040 if (!(isEquality && TrueVal == ConstVal) &&
17041 !(isNonEquality && FalseVal == ConstVal))
17048 SelectLHS, SelectRHS);
17053 switch (
N->getOpcode()) {
17069 if (
auto Res = promoteUniformOpToI32(
SDValue(
N, 0), DCI))
17079 switch (
N->getOpcode()) {
17081 return performAddCombine(
N, DCI);
17083 return performPtrAddCombine(
N, DCI);
17085 return performSubCombine(
N, DCI);
17088 return performAddCarrySubCarryCombine(
N, DCI);
17090 return performFAddCombine(
N, DCI);
17092 return performFSubCombine(
N, DCI);
17094 return performFDivCombine(
N, DCI);
17096 return performFMulCombine(
N, DCI);
17098 return performSetCCCombine(
N, DCI);
17100 if (
auto Res = performSelectCombine(
N, DCI))
17115 case AMDGPUISD::FMIN_LEGACY:
17116 case AMDGPUISD::FMAX_LEGACY:
17117 return performMinMaxCombine(
N, DCI);
17119 return performFMACombine(
N, DCI);
17121 return performAndCombine(
N, DCI);
17123 return performOrCombine(
N, DCI);
17126 if (
N->getValueType(0) == MVT::i32 &&
N->isDivergent() &&
17127 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
17133 return performXorCombine(
N, DCI);
17135 return performZeroExtendCombine(
N, DCI);
17137 return performSignExtendInRegCombine(
N, DCI);
17138 case AMDGPUISD::FP_CLASS:
17139 return performClassCombine(
N, DCI);
17141 return performFCanonicalizeCombine(
N, DCI);
17142 case AMDGPUISD::RCP:
17143 return performRcpCombine(
N, DCI);
17145 case AMDGPUISD::FRACT:
17146 case AMDGPUISD::RSQ:
17147 case AMDGPUISD::RCP_LEGACY:
17148 case AMDGPUISD::RCP_IFLAG:
17149 case AMDGPUISD::RSQ_CLAMP: {
17158 return performUCharToFloatCombine(
N, DCI);
17160 return performFCopySignCombine(
N, DCI);
17161 case AMDGPUISD::CVT_F32_UBYTE0:
17162 case AMDGPUISD::CVT_F32_UBYTE1:
17163 case AMDGPUISD::CVT_F32_UBYTE2:
17164 case AMDGPUISD::CVT_F32_UBYTE3:
17165 return performCvtF32UByteNCombine(
N, DCI);
17166 case AMDGPUISD::FMED3:
17167 return performFMed3Combine(
N, DCI);
17168 case AMDGPUISD::CVT_PKRTZ_F16_F32:
17169 return performCvtPkRTZCombine(
N, DCI);
17170 case AMDGPUISD::CLAMP:
17171 return performClampCombine(
N, DCI);
17174 EVT VT =
N->getValueType(0);
17177 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
17180 EVT EltVT = Src.getValueType();
17181 if (EltVT != MVT::i16)
17191 return performExtractVectorEltCombine(
N, DCI);
17193 return performInsertVectorEltCombine(
N, DCI);
17195 return performFPRoundCombine(
N, DCI);
17204 return performMemSDNodeCombine(MemNode, DCI);
17235 unsigned Opcode =
Node->getMachineOpcode();
17238 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
17239 if (D16Idx >= 0 &&
Node->getConstantOperandVal(D16Idx))
17242 SDNode *
Users[5] = {
nullptr};
17244 unsigned DmaskIdx =
17245 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
17246 unsigned OldDmask =
Node->getConstantOperandVal(DmaskIdx);
17247 unsigned NewDmask = 0;
17248 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
17249 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
17250 bool UsesTFC = (int(TFEIdx) >= 0 &&
Node->getConstantOperandVal(TFEIdx)) ||
17251 (
int(LWEIdx) >= 0 &&
Node->getConstantOperandVal(LWEIdx));
17252 unsigned TFCLane = 0;
17253 bool HasChain =
Node->getNumValues() > 1;
17255 if (OldDmask == 0) {
17263 TFCLane = OldBitsSet;
17267 for (SDUse &Use :
Node->uses()) {
17270 if (
Use.getResNo() != 0)
17273 SDNode *
User =
Use.getUser();
17276 if (!
User->isMachineOpcode() ||
17277 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
17289 if (UsesTFC && Lane == TFCLane) {
17294 for (
unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17296 Dmask &= ~(1 << Comp);
17304 NewDmask |= 1 << Comp;
17309 bool NoChannels = !NewDmask;
17316 if (OldBitsSet == 1)
17322 if (NewDmask == OldDmask)
17331 unsigned NewChannels = BitsSet + UsesTFC;
17335 assert(NewOpcode != -1 &&
17336 NewOpcode !=
static_cast<int>(
Node->getMachineOpcode()) &&
17337 "failed to find equivalent MIMG op");
17345 MVT SVT =
Node->getValueType(0).getVectorElementType().getSimpleVT();
17347 MVT ResultVT = NewChannels == 1
17350 : NewChannels == 5 ? 8
17352 SDVTList NewVTList =
17355 MachineSDNode *NewNode =
17364 if (NewChannels == 1) {
17374 for (
unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17379 if (i || !NoChannels)
17384 if (NewUser != User) {
17394 Idx = AMDGPU::sub1;
17397 Idx = AMDGPU::sub2;
17400 Idx = AMDGPU::sub3;
17403 Idx = AMDGPU::sub4;
17414 Op =
Op.getOperand(0);
17435 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17439 Node->getOperand(0), SL, VReg, SrcVal,
17445 return ToResultReg.
getNode();
17450 for (
unsigned i = 0; i <
Node->getNumOperands(); ++i) {
17452 Ops.push_back(
Node->getOperand(i));
17458 Node->getOperand(i).getValueType(),
17459 Node->getOperand(i)),
17471 unsigned Opcode =
Node->getMachineOpcode();
17473 if (
TII->isImage(Opcode) && !
TII->get(Opcode).mayStore() &&
17474 !
TII->isGather4(Opcode) &&
17476 return adjustWritemask(
Node, DAG);
17479 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17485 case AMDGPU::V_DIV_SCALE_F32_e64:
17486 case AMDGPU::V_DIV_SCALE_F64_e64: {
17496 (Src0 == Src1 || Src0 == Src2))
17552 AMDGPU::getNamedOperandIdx(
MI.getOpcode(), AMDGPU::OpName::vdata);
17553 unsigned InitIdx = 0;
17555 if (
TII->isImage(
MI)) {
17563 unsigned TFEVal = TFE ? TFE->
getImm() : 0;
17564 unsigned LWEVal = LWE ? LWE->
getImm() : 0;
17565 unsigned D16Val = D16 ? D16->getImm() : 0;
17567 if (!TFEVal && !LWEVal)
17578 assert(MO_Dmask &&
"Expected dmask operand in instruction");
17580 unsigned dmask = MO_Dmask->
getImm();
17585 bool Packed = !Subtarget->hasUnpackedD16VMem();
17587 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
17594 uint32_t DstSize =
TRI.getRegSizeInBits(*DstRC) / 32;
17595 if (DstSize < InitIdx)
17599 InitIdx =
TRI.getRegSizeInBits(*DstRC) / 32;
17607 Register PrevDst =
MRI.cloneVirtualRegister(
MI.getOperand(DstIdx).getReg());
17608 unsigned NewDst = 0;
17613 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17614 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
17617 for (; SizeLeft; SizeLeft--, CurrIdx++) {
17618 NewDst =
MRI.createVirtualRegister(
TII->getOpRegClass(
MI, DstIdx));
17638 MI.tieOperands(DstIdx,
MI.getNumOperands() - 1);
17650 if (
TII->isVOP3(
MI.getOpcode())) {
17652 TII->legalizeOperandsVOP3(
MRI,
MI);
17654 if (
TII->isMAI(
MI)) {
17659 int Src0Idx = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
17660 AMDGPU::OpName::scale_src0);
17661 if (Src0Idx != -1) {
17662 int Src1Idx = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
17663 AMDGPU::OpName::scale_src1);
17664 if (
TII->usesConstantBus(
MRI,
MI, Src0Idx) &&
17665 TII->usesConstantBus(
MRI,
MI, Src1Idx))
17666 TII->legalizeOpWithMove(
MI, Src1Idx);
17673 if (
TII->isImage(
MI))
17674 TII->enforceOperandRCAlignment(
MI, AMDGPU::OpName::vaddr);
17748std::pair<unsigned, const TargetRegisterClass *>
17755 if (Constraint.
size() == 1) {
17759 if (VT == MVT::Other)
17762 switch (Constraint[0]) {
17769 RC = &AMDGPU::SReg_32RegClass;
17772 RC = &AMDGPU::SGPR_64RegClass;
17777 return std::pair(0U,
nullptr);
17784 return std::pair(0U,
nullptr);
17786 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17787 : &AMDGPU::VGPR_32_Lo256RegClass;
17790 RC = Subtarget->has1024AddressableVGPRs()
17791 ?
TRI->getAlignedLo256VGPRClassForBitWidth(
BitWidth)
17794 return std::pair(0U,
nullptr);
17799 if (!Subtarget->hasMAIInsts())
17803 return std::pair(0U,
nullptr);
17805 RC = &AMDGPU::AGPR_32RegClass;
17810 return std::pair(0U,
nullptr);
17815 }
else if (Constraint ==
"VA" && Subtarget->hasGFX90AInsts()) {
17819 RC = &AMDGPU::AV_32RegClass;
17822 RC =
TRI->getVectorSuperClassForBitWidth(
BitWidth);
17824 return std::pair(0U,
nullptr);
17833 return std::pair(0U, RC);
17836 if (Kind !=
'\0') {
17838 RC = &AMDGPU::VGPR_32_Lo256RegClass;
17839 }
else if (Kind ==
's') {
17840 RC = &AMDGPU::SGPR_32RegClass;
17841 }
else if (Kind ==
'a') {
17842 RC = &AMDGPU::AGPR_32RegClass;
17848 return std::pair(0U,
nullptr);
17854 return std::pair(0U,
nullptr);
17858 RC =
TRI->getVGPRClassForBitWidth(Width);
17860 RC =
TRI->getSGPRClassForBitWidth(Width);
17862 RC =
TRI->getAGPRClassForBitWidth(Width);
17864 Reg =
TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17869 return std::pair(0U,
nullptr);
17871 return std::pair(Reg, RC);
17877 return std::pair(0U,
nullptr);
17878 if (Idx < RC->getNumRegs())
17880 return std::pair(0U,
nullptr);
17886 Ret.second =
TRI->getPhysRegBaseClass(Ret.first);
17892 if (Constraint.
size() == 1) {
17893 switch (Constraint[0]) {
17903 }
else if (Constraint ==
"DA" || Constraint ==
"DB") {
17911 if (Constraint.
size() == 1) {
17912 switch (Constraint[0]) {
17920 }
else if (Constraint.
size() == 2) {
17921 if (Constraint ==
"VA")
17939 std::vector<SDValue> &
Ops,
17954 unsigned Size =
Op.getScalarValueSizeInBits();
17958 if (
Size == 16 && !Subtarget->has16BitInsts())
17962 Val =
C->getSExtValue();
17966 Val =
C->getValueAPF().bitcastToAPInt().getSExtValue();
17970 if (
Size != 16 ||
Op.getNumOperands() != 2)
17972 if (
Op.getOperand(0).isUndef() ||
Op.getOperand(1).isUndef())
17975 Val =
C->getSExtValue();
17979 Val =
C->getValueAPF().bitcastToAPInt().getSExtValue();
17989 if (Constraint.
size() == 1) {
17990 switch (Constraint[0]) {
18005 }
else if (Constraint.
size() == 2) {
18006 if (Constraint ==
"DA") {
18007 int64_t HiBits =
static_cast<int32_t
>(Val >> 32);
18008 int64_t LoBits =
static_cast<int32_t
>(Val);
18012 if (Constraint ==
"DB") {
18020 unsigned MaxSize)
const {
18021 unsigned Size = std::min<unsigned>(
Op.getScalarValueSizeInBits(), MaxSize);
18022 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
18024 MVT VT =
Op.getSimpleValueType();
18049 switch (UnalignedClassID) {
18050 case AMDGPU::VReg_64RegClassID:
18051 return AMDGPU::VReg_64_Align2RegClassID;
18052 case AMDGPU::VReg_96RegClassID:
18053 return AMDGPU::VReg_96_Align2RegClassID;
18054 case AMDGPU::VReg_128RegClassID:
18055 return AMDGPU::VReg_128_Align2RegClassID;
18056 case AMDGPU::VReg_160RegClassID:
18057 return AMDGPU::VReg_160_Align2RegClassID;
18058 case AMDGPU::VReg_192RegClassID:
18059 return AMDGPU::VReg_192_Align2RegClassID;
18060 case AMDGPU::VReg_224RegClassID:
18061 return AMDGPU::VReg_224_Align2RegClassID;
18062 case AMDGPU::VReg_256RegClassID:
18063 return AMDGPU::VReg_256_Align2RegClassID;
18064 case AMDGPU::VReg_288RegClassID:
18065 return AMDGPU::VReg_288_Align2RegClassID;
18066 case AMDGPU::VReg_320RegClassID:
18067 return AMDGPU::VReg_320_Align2RegClassID;
18068 case AMDGPU::VReg_352RegClassID:
18069 return AMDGPU::VReg_352_Align2RegClassID;
18070 case AMDGPU::VReg_384RegClassID:
18071 return AMDGPU::VReg_384_Align2RegClassID;
18072 case AMDGPU::VReg_512RegClassID:
18073 return AMDGPU::VReg_512_Align2RegClassID;
18074 case AMDGPU::VReg_1024RegClassID:
18075 return AMDGPU::VReg_1024_Align2RegClassID;
18076 case AMDGPU::AReg_64RegClassID:
18077 return AMDGPU::AReg_64_Align2RegClassID;
18078 case AMDGPU::AReg_96RegClassID:
18079 return AMDGPU::AReg_96_Align2RegClassID;
18080 case AMDGPU::AReg_128RegClassID:
18081 return AMDGPU::AReg_128_Align2RegClassID;
18082 case AMDGPU::AReg_160RegClassID:
18083 return AMDGPU::AReg_160_Align2RegClassID;
18084 case AMDGPU::AReg_192RegClassID:
18085 return AMDGPU::AReg_192_Align2RegClassID;
18086 case AMDGPU::AReg_256RegClassID:
18087 return AMDGPU::AReg_256_Align2RegClassID;
18088 case AMDGPU::AReg_512RegClassID:
18089 return AMDGPU::AReg_512_Align2RegClassID;
18090 case AMDGPU::AReg_1024RegClassID:
18091 return AMDGPU::AReg_1024_Align2RegClassID;
18107 if (Info->isEntryFunction()) {
18114 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
18116 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
18117 :
TRI->getAlignedHighSGPRForRC(MF, 2,
18118 &AMDGPU::SGPR_64RegClass);
18119 Info->setSGPRForEXECCopy(SReg);
18121 assert(!
TRI->isSubRegister(Info->getScratchRSrcReg(),
18122 Info->getStackPtrOffsetReg()));
18123 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
18124 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
18128 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
18129 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
18131 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
18132 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
18134 Info->limitOccupancy(MF);
18136 if (ST.isWave32() && !MF.
empty()) {
18137 for (
auto &
MBB : MF) {
18138 for (
auto &
MI :
MBB) {
18139 TII->fixImplicitOperands(
MI);
18149 if (ST.needsAlignedVGPRs()) {
18150 for (
unsigned I = 0, E =
MRI.getNumVirtRegs();
I != E; ++
I) {
18156 if (NewClassID != -1)
18157 MRI.setRegClass(Reg,
TRI->getRegClass(NewClassID));
18166 const APInt &DemandedElts,
18168 unsigned Depth)
const {
18170 unsigned Opc =
Op.getOpcode();
18173 unsigned IID =
Op.getConstantOperandVal(0);
18175 case Intrinsic::amdgcn_mbcnt_lo:
18176 case Intrinsic::amdgcn_mbcnt_hi: {
18182 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
18192 Op, Known, DemandedElts, DAG,
Depth);
18208 unsigned MaxValue =
18215 unsigned BFEWidth,
bool SExt,
unsigned Depth) {
18219 unsigned Src1Cst = 0;
18220 if (Src1.
isImm()) {
18221 Src1Cst = Src1.
getImm();
18222 }
else if (Src1.
isReg()) {
18226 Src1Cst = Cst->Value.getZExtValue();
18237 if (Width >= BFEWidth)
18246 Known = Known.
sext(BFEWidth);
18248 Known = Known.
zext(BFEWidth);
18254 unsigned Depth)
const {
18257 switch (
MI->getOpcode()) {
18258 case AMDGPU::S_BFE_I32:
18261 case AMDGPU::S_BFE_U32:
18264 case AMDGPU::S_BFE_I64:
18267 case AMDGPU::S_BFE_U64:
18270 case AMDGPU::G_INTRINSIC:
18271 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18274 case Intrinsic::amdgcn_workitem_id_x:
18277 case Intrinsic::amdgcn_workitem_id_y:
18280 case Intrinsic::amdgcn_workitem_id_z:
18283 case Intrinsic::amdgcn_mbcnt_lo:
18284 case Intrinsic::amdgcn_mbcnt_hi: {
18296 case Intrinsic::amdgcn_groupstaticsize: {
18307 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18310 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18313 case AMDGPU::G_AMDGPU_SMED3:
18314 case AMDGPU::G_AMDGPU_UMED3: {
18315 auto [Dst, Src0, Src1, Src2] =
MI->getFirst4Regs();
18342 unsigned Depth)
const {
18349 AttributeList Attrs =
18351 if (
MaybeAlign RetAlign = Attrs.getRetAlignment())
18378 if (Header->getAlignment() != PrefAlign)
18379 return Header->getAlignment();
18381 unsigned LoopSize = 0;
18386 LoopSize +=
MBB->getAlignment().value() / 2;
18389 LoopSize +=
TII->getInstSizeInBytes(
MI);
18390 if (LoopSize > 192)
18395 if (LoopSize <= 64)
18398 if (LoopSize <= 128)
18399 return CacheLineAlign;
18405 auto I = Exit->getFirstNonDebugInstr();
18406 if (
I != Exit->end() &&
I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18407 return CacheLineAlign;
18416 if (PreTerm == Pre->
begin() ||
18417 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18421 auto ExitHead = Exit->getFirstNonDebugInstr();
18422 if (ExitHead == Exit->end() ||
18423 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18428 return CacheLineAlign;
18436 N =
N->getOperand(0).getNode();
18446 switch (
N->getOpcode()) {
18454 if (Reg.isPhysical() ||
MRI.isLiveIn(Reg))
18455 return !
TRI->isSGPRReg(
MRI, Reg);
18461 return !
TRI->isSGPRReg(
MRI, Reg);
18465 unsigned AS = L->getAddressSpace();
18475 case AMDGPUISD::ATOMIC_CMP_SWAP:
18476 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
18477 case AMDGPUISD::BUFFER_ATOMIC_ADD:
18478 case AMDGPUISD::BUFFER_ATOMIC_SUB:
18479 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
18480 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
18481 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
18482 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
18483 case AMDGPUISD::BUFFER_ATOMIC_AND:
18484 case AMDGPUISD::BUFFER_ATOMIC_OR:
18485 case AMDGPUISD::BUFFER_ATOMIC_XOR:
18486 case AMDGPUISD::BUFFER_ATOMIC_INC:
18487 case AMDGPUISD::BUFFER_ATOMIC_DEC:
18488 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
18489 case AMDGPUISD::BUFFER_ATOMIC_FADD:
18490 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
18491 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
18497 return A->readMem() &&
A->writeMem();
18518 switch (Ty.getScalarSizeInBits()) {
18530 const APInt &DemandedElts,
18533 unsigned Depth)
const {
18534 if (
Op.getOpcode() == AMDGPUISD::CLAMP) {
18538 if (Info->getMode().DX10Clamp)
18550 if (RMW->
hasMetadata(
"amdgpu.ignore.denormal.mode"))
18570 <<
"Hardware instruction generated for atomic "
18572 <<
" operation at memory scope " << MemScope;
18577 Type *EltTy = VT->getElementType();
18578 return VT->getNumElements() == 2 &&
18598 unsigned BW =
IT->getBitWidth();
18599 return BW == 32 || BW == 64;
18613 unsigned BW =
DL.getPointerSizeInBits(PT->getAddressSpace());
18614 return BW == 32 || BW == 64;
18617 if (Ty->isFloatTy() || Ty->isDoubleTy())
18621 return VT->getNumElements() == 2 &&
18622 VT->getElementType()->getPrimitiveSizeInBits() == 16;
18632 bool HasSystemScope) {
18639 if (HasSystemScope) {
18648 return RMW->
hasMetadata(
"amdgpu.no.fine.grained.memory");
18661 const MDNode *MD =
I->getMetadata(LLVMContext::MD_noalias_addrspace);
18687 DL.getTypeSizeInBits(RMW->
getType()) == 64 &&
18700 bool HasSystemScope =
18732 if (!
IT ||
IT->getBitWidth() != 32)
18738 if (Subtarget->hasEmulatedSystemScopeAtomics())
18754 if (!HasSystemScope &&
18755 Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18767 if (RMW->
hasMetadata(
"amdgpu.no.fine.grained.memory"))
18775 ConstVal && ConstVal->isNullValue())
18813 if (Ty->isFloatTy()) {
18818 if (Ty->isDoubleTy()) {
18839 if (Ty->isFloatTy() &&
18840 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18853 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() &&
isV2F16(Ty))
18857 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() &&
isV2BF16(Ty))
18861 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() &&
isV2F16(Ty))
18866 if (Subtarget->hasAtomicBufferPkAddBF16Inst() &&
isV2BF16(Ty))
18871 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18875 if (Ty->isFloatTy()) {
18878 if (RMW->
use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18881 if (!RMW->
use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18886 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18894 if (Subtarget->hasFlatAtomicFaddF32Inst())
18903 if (Subtarget->hasLDSFPAtomicAddF32()) {
18904 if (RMW->
use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18906 if (!RMW->
use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18934 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18936 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18940 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18942 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18996 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18997 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18998 : &AMDGPU::SReg_32RegClass;
18999 if (!
TRI->isSGPRClass(RC) && !isDivergent)
19000 return TRI->getEquivalentSGPRClass(RC);
19001 if (
TRI->isSGPRClass(RC) && isDivergent) {
19002 if (Subtarget->hasGFX90AInsts())
19003 return TRI->getEquivalentAVClass(RC);
19004 return TRI->getEquivalentVGPRClass(RC);
19017 unsigned WaveSize) {
19022 if (!
IT ||
IT->getBitWidth() != WaveSize)
19027 if (!Visited.
insert(V).second)
19029 bool Result =
false;
19030 for (
const auto *U : V->users()) {
19032 if (V == U->getOperand(1)) {
19037 case Intrinsic::amdgcn_if_break:
19038 case Intrinsic::amdgcn_if:
19039 case Intrinsic::amdgcn_else:
19044 if (V == U->getOperand(0)) {
19049 case Intrinsic::amdgcn_end_cf:
19050 case Intrinsic::amdgcn_loop:
19056 Result =
hasCFUser(U, Visited, WaveSize);
19065 const Value *V)
const {
19067 if (CI->isInlineAsm()) {
19076 for (
auto &TC : TargetConstraints) {
19090 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
19118 return MRI.hasOneNonDBGUse(N0);
19125 if (
I.getMetadata(
"amdgpu.noclobber"))
19127 if (
I.getMetadata(
"amdgpu.last.use"))
19191 Alignment = RMW->getAlign();
19204 bool FullFlatEmulation =
19206 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
19207 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
19208 RMW->getType()->isDoubleTy()));
19211 bool ReturnValueIsUsed = !AI->
use_empty();
19220 if (FullFlatEmulation) {
19231 std::prev(BB->
end())->eraseFromParent();
19232 Builder.SetInsertPoint(BB);
19234 Value *LoadedShared =
nullptr;
19235 if (FullFlatEmulation) {
19236 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
19237 {Addr},
nullptr,
"is.shared");
19238 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
19239 Builder.SetInsertPoint(SharedBB);
19240 Value *CastToLocal = Builder.CreateAddrSpaceCast(
19246 LoadedShared = Clone;
19248 Builder.CreateBr(PhiBB);
19249 Builder.SetInsertPoint(CheckPrivateBB);
19252 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19253 {Addr},
nullptr,
"is.private");
19254 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19256 Builder.SetInsertPoint(PrivateBB);
19258 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19261 Value *LoadedPrivate;
19263 LoadedPrivate = Builder.CreateAlignedLoad(
19264 RMW->getType(), CastToPrivate, RMW->getAlign(),
"loaded.private");
19267 LoadedPrivate, RMW->getValOperand());
19269 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19271 auto [ResultLoad, Equal] =
19277 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19280 Builder.CreateBr(PhiBB);
19282 Builder.SetInsertPoint(GlobalBB);
19286 if (FullFlatEmulation) {
19287 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19296 if (!FullFlatEmulation) {
19301 MDNode *RangeNotPrivate =
19304 LoadedGlobal->
setMetadata(LLVMContext::MD_noalias_addrspace,
19308 Builder.CreateBr(PhiBB);
19310 Builder.SetInsertPoint(PhiBB);
19312 if (ReturnValueIsUsed) {
19315 if (FullFlatEmulation)
19316 Loaded->addIncoming(LoadedShared, SharedBB);
19317 Loaded->addIncoming(LoadedPrivate, PrivateBB);
19318 Loaded->addIncoming(LoadedGlobal, GlobalBB);
19319 Loaded->takeName(AI);
19322 Builder.CreateBr(ExitBB);
19326 unsigned PtrOpIdx) {
19327 Value *PtrOp =
I->getOperand(PtrOpIdx);
19334 I->setOperand(PtrOpIdx, ASCast);
19346 ConstVal && ConstVal->isNullValue()) {
19376 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19384 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19399 LoadInst *LI = Builder.CreateAlignedLoad(
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
iv Induction Variable Users
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Contains matchers for matching SSA Machine Instructions.
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static void getCoopAtomicOperandsInfo(const CallBase &CI, bool IsLoad, TargetLoweringBase::IntrinsicInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool isCopyFromRegOfInlineAsm(const SDNode *N)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isFloatingPointWaveReduceOperation(unsigned Opc)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static constexpr int Concat[]
AMDGPUArgumentUsageInfo & getArgUsageInfo()
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
const unsigned XorTermOpc
const unsigned AndSaveExecOpc
static constexpr roundingMode rmNearestTiesToEven
static const fltSemantics & IEEEhalf()
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
LLVM_READONLY int getExactLog2Abs() const
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
This class represents an incoming formal argument to a Function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
const Function * getParent() const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
Value * getNewValOperand()
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getCompareOperand()
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
const Function * getParent() const
Return the enclosing method, or null if none.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
static bool isFPPredicate(Predicate P)
static bool isIntPredicate(Predicate P)
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
A parsed version of the target data layout string in and methods for querying it.
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLoweringInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Argument * getArg(unsigned i) const
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
bool supportsWaveWideBPermute() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
int64_t getOffset() const
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
const MDOperand & getOperand(unsigned I) const
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
A Module instance is used to store all the information related to an LLVM module.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool isWholeWaveFunction() const
bool hasWorkGroupIDZ() const
AMDGPU::ClusterDimsAttr getClusterDims() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node N may be combined with an FMUL and FADD to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store using a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load using a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns whether Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallBase &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion using a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(const AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI bool SignBitIsZeroFP(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero, for a floating-point value.
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
MachineFunctionAnalysisManager * getMFAM()
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
constexpr bool empty() const
empty - Check if the string is empty.
constexpr size_t size() const
size - Get the string size.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a type is legal for a target, and if not, what action should be used to make it valid.
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
bool isIntegerTy() const
True if this is an instance of IntegerType.
LLVM_ABI const fltSemantics & getFltSemantics() const
bool isVoidTy() const
Return true if this is 'void'.
A Use represents the edge between a Value definition and its users.
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
LLVM_ABI void set(Value *Val)
User * getUser() const
Returns the User that contains this Use.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< user_iterator > users()
iterator_range< use_iterator > uses()
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
constexpr bool isZero() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
@ TowardZeroF32_TowardNegativeF64
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ PTRADD
PTRADD represents pointer arithmetic semantics, for targets that opt in using shouldPreservePtrArith(...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SET_ROUNDING
Set rounding mode.
@ CONVERGENCECTRL_GLUE
This does not correspond to any convergence control intrinsic.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readfixedcounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same for subtraction.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ SMULO
Same for multiplication.
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)