40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
49#define DEBUG_TYPE "si-lower"
54 "amdgpu-disable-loop-alignment",
55 cl::desc(
"Do not align and prefetch loops"),
59 "amdgpu-use-divergent-register-indexing",
61 cl::desc(
"Use indirect register addressing for divergent indexes"),
75 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
76 for (
unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
78 return AMDGPU::SGPR0 + Reg;
194 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
195 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
196 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
197 MVT::i1, MVT::v32i32},
201 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
202 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
203 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
204 MVT::i1, MVT::v32i32},
273 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1},
Expand);
280 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
281 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
282 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
285 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
286 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
287 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
291 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
292 MVT::v3i16, MVT::v4i16, MVT::Other},
297 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64},
Expand);
313 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
314 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
315 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
316 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
317 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
318 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
319 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
320 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
352 for (
MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
366 for (
MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
380 for (
MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
394 for (
MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
408 for (
MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
423 {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
432 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
433 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
438 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32},
Custom);
442 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
443 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
444 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
445 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
526 {MVT::f32, MVT::f64},
Legal);
619 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
620 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
621 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
756 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
760 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
764 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
765 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
787 {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
788 MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
791 for (
MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
799 for (
MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
815 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
835 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
836 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
837 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
838 MVT::v32f16, MVT::v32bf16},
854 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16},
Legal);
856 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
861 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
862 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
867 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
868 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
869 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
870 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
874 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
875 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
876 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
877 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
966 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
979 EVT DestVT,
EVT SrcVT)
const {
989 LLT DestTy,
LLT SrcTy)
const {
990 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->
hasMadMixInsts()) ||
991 (Opcode == TargetOpcode::G_FMA && Subtarget->
hasFmaMixInsts())) &&
1017 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1019 return VT.
isInteger() ? MVT::i32 : MVT::f32;
1046 return (NumElts + 1) / 2;
1052 return NumElts * ((
Size + 31) / 32);
1061 EVT VT,
EVT &IntermediateVT,
1062 unsigned &NumIntermediates,
MVT &RegisterVT)
const {
1071 if (ScalarVT == MVT::bf16) {
1072 RegisterVT = MVT::i32;
1073 IntermediateVT = MVT::v2bf16;
1075 RegisterVT = VT.
isInteger() ? MVT::v2i16 : MVT::v2f16;
1076 IntermediateVT = RegisterVT;
1078 NumIntermediates = (NumElts + 1) / 2;
1079 return NumIntermediates;
1084 IntermediateVT = RegisterVT;
1085 NumIntermediates = NumElts;
1086 return NumIntermediates;
1089 if (Size < 16 && Subtarget->has16BitInsts()) {
1091 RegisterVT = MVT::i16;
1092 IntermediateVT = ScalarVT;
1093 NumIntermediates = NumElts;
1094 return NumIntermediates;
1099 RegisterVT = MVT::i32;
1100 IntermediateVT = ScalarVT;
1101 NumIntermediates = NumElts;
1102 return NumIntermediates;
1106 RegisterVT = MVT::i32;
1107 IntermediateVT = RegisterVT;
1108 NumIntermediates = NumElts * ((
Size + 31) / 32);
1109 return NumIntermediates;
1114 Context,
CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1119 unsigned MaxNumLanes) {
1120 assert(MaxNumLanes != 0);
1123 if (
auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1124 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1135 unsigned MaxNumLanes) {
1136 auto *ST = dyn_cast<StructType>(Ty);
1141 assert(ST->getNumContainedTypes() == 2 &&
1142 ST->getContainedType(1)->isIntegerTy(32));
1157 DL.getPointerSizeInBits(AS) == 192)
1167 DL.getPointerSizeInBits(AS) == 160) ||
1169 DL.getPointerSizeInBits(AS) == 192))
1177 unsigned IntrID)
const {
1179 if (CI.
hasMetadata(LLVMContext::MD_invariant_load))
1194 if (RsrcIntr->IsImage) {
1202 if (
auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->
getType())) {
1209 Info.ptrVal = RsrcArg;
1217 if (RsrcIntr->IsImage) {
1218 unsigned MaxNumLanes = 4;
1233 std::numeric_limits<unsigned>::max());
1243 if (RsrcIntr->IsImage) {
1244 unsigned DMask = cast<ConstantInt>(CI.
getArgOperand(1))->getZExtValue();
1262 if (RsrcIntr->IsImage && BaseOpcode->
NoReturn) {
1264 Info.memVT = MVT::i32;
1271 case Intrinsic::amdgcn_raw_buffer_load_lds:
1272 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1273 case Intrinsic::amdgcn_struct_buffer_load_lds:
1274 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1275 unsigned Width = cast<ConstantInt>(CI.
getArgOperand(2))->getZExtValue();
1280 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1281 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1282 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1283 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1286 std::numeric_limits<unsigned>::max());
1296 case Intrinsic::amdgcn_ds_ordered_add:
1297 case Intrinsic::amdgcn_ds_ordered_swap: {
1310 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1311 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1314 Info.ptrVal =
nullptr;
1319 case Intrinsic::amdgcn_ds_append:
1320 case Intrinsic::amdgcn_ds_consume: {
1333 case Intrinsic::amdgcn_global_atomic_csub: {
1343 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1353 case Intrinsic::amdgcn_global_atomic_fadd:
1354 case Intrinsic::amdgcn_global_atomic_fmin:
1355 case Intrinsic::amdgcn_global_atomic_fmax:
1356 case Intrinsic::amdgcn_global_atomic_fmin_num:
1357 case Intrinsic::amdgcn_global_atomic_fmax_num:
1358 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1359 case Intrinsic::amdgcn_flat_atomic_fadd:
1360 case Intrinsic::amdgcn_flat_atomic_fmin:
1361 case Intrinsic::amdgcn_flat_atomic_fmax:
1362 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1363 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1364 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1365 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1366 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
1377 case Intrinsic::amdgcn_global_load_tr_b64:
1378 case Intrinsic::amdgcn_global_load_tr_b128: {
1386 case Intrinsic::amdgcn_ds_gws_init:
1387 case Intrinsic::amdgcn_ds_gws_barrier:
1388 case Intrinsic::amdgcn_ds_gws_sema_v:
1389 case Intrinsic::amdgcn_ds_gws_sema_br:
1390 case Intrinsic::amdgcn_ds_gws_sema_p:
1391 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1401 Info.memVT = MVT::i32;
1405 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1411 case Intrinsic::amdgcn_global_load_lds: {
1413 unsigned Width = cast<ConstantInt>(CI.
getArgOperand(2))->getZExtValue();
1419 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1429 Info.memVT = MVT::i32;
1444 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1447 unsigned SrcAS =
I.getOperand(0)->getType()->getPointerAddressSpace();
1448 unsigned DstAS =
I.getType()->getPointerAddressSpace();
1460 Type *&AccessTy)
const {
1462 switch (
II->getIntrinsicID()) {
1463 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1464 case Intrinsic::amdgcn_ds_append:
1465 case Intrinsic::amdgcn_ds_consume:
1466 case Intrinsic::amdgcn_ds_ordered_add:
1467 case Intrinsic::amdgcn_ds_ordered_swap:
1468 case Intrinsic::amdgcn_flat_atomic_fadd:
1469 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
1470 case Intrinsic::amdgcn_flat_atomic_fmax:
1471 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1472 case Intrinsic::amdgcn_flat_atomic_fmin:
1473 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1474 case Intrinsic::amdgcn_global_atomic_csub:
1475 case Intrinsic::amdgcn_global_atomic_fadd:
1476 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1477 case Intrinsic::amdgcn_global_atomic_fmax:
1478 case Intrinsic::amdgcn_global_atomic_fmax_num:
1479 case Intrinsic::amdgcn_global_atomic_fmin:
1480 case Intrinsic::amdgcn_global_atomic_fmin_num:
1481 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1482 case Intrinsic::amdgcn_global_load_tr_b64:
1483 case Intrinsic::amdgcn_global_load_tr_b128:
1484 Ptr =
II->getArgOperand(0);
1486 case Intrinsic::amdgcn_global_load_lds:
1487 Ptr =
II->getArgOperand(1);
1492 AccessTy =
II->getType();
1498 unsigned AddrSpace)
const {
1510 return AM.
Scale == 0 &&
1512 AM.
BaseOffs, AddrSpace, FlatVariant));
1532 return isLegalMUBUFAddressingMode(AM);
1535bool SITargetLowering::isLegalMUBUFAddressingMode(
const AddrMode &AM)
const {
1546 if (!
TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1558 if (AM.HasBaseReg) {
1589 return isLegalMUBUFAddressingMode(AM);
1596 if (Ty->
isSized() &&
DL.getTypeStoreSize(Ty) < 4)
1646 : isLegalMUBUFAddressingMode(AM);
1693 unsigned Size,
unsigned AddrSpace,
Align Alignment,
1707 Alignment < RequiredAlignment)
1728 RequiredAlignment =
Align(4);
1746 *IsFast = (Alignment >= RequiredAlignment) ? 64
1747 : (Alignment <
Align(4)) ? 32
1769 *IsFast = (Alignment >= RequiredAlignment) ? 96
1770 : (Alignment <
Align(4)) ? 32
1783 RequiredAlignment =
Align(8);
1794 *IsFast = (Alignment >= RequiredAlignment) ? 128
1795 : (Alignment <
Align(4)) ? 32
1812 *IsFast = (Alignment >= RequiredAlignment) ?
Size : 0;
1814 return Alignment >= RequiredAlignment ||
1819 bool AlignedBy4 = Alignment >=
Align(4);
1821 *IsFast = AlignedBy4;
1823 return AlignedBy4 ||
1833 bool AlignedBy4 = Alignment >=
Align(4);
1835 *IsFast = AlignedBy4;
1846 return Alignment >=
Align(4) ||
1860 return Size >= 32 && Alignment >=
Align(4);
1865 unsigned *IsFast)
const {
1867 Alignment, Flags, IsFast);
1877 if (
Op.size() >= 16 &&
1881 if (
Op.size() >= 8 &&
Op.isDstAligned(
Align(4)))
1889 const MemSDNode *MemNode = cast<MemSDNode>(
N);
1899 unsigned DestAS)
const {
1907 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1911 const MemSDNode *MemNode = cast<MemSDNode>(
N);
1931 unsigned Index)
const {
1978 std::tie(InputPtrReg, RC, ArgTy) =
1988 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1994 const SDLoc &SL)
const {
2001 const SDLoc &SL)
const {
2004 std::optional<uint32_t> KnownSize =
2006 if (KnownSize.has_value())
2033 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2042SDValue SITargetLowering::lowerKernargMemParameter(
2054 int64_t OffsetDiff =
Offset - AlignDownOffset;
2060 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2070 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal,
Signed, Arg);
2081 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load,
Signed, Arg);
2128 ExtType, SL, VA.
getLocVT(), Chain, FIN,
2157 Reg = &WorkGroupIDX;
2158 RC = &AMDGPU::SReg_32RegClass;
2162 Reg = &WorkGroupIDY;
2163 RC = &AMDGPU::SReg_32RegClass;
2167 Reg = &WorkGroupIDZ;
2168 RC = &AMDGPU::SReg_32RegClass;
2199 for (
unsigned I = 0, E = Ins.size(), PSInputNum = 0;
I != E; ++
I) {
2203 "vector type argument should have been split");
2208 bool SkipArg = !Arg->
Used && !
Info->isPSInputAllocated(PSInputNum);
2217 "unexpected vector split in ps argument type");
2231 Info->markPSInputAllocated(PSInputNum);
2233 Info->markPSInputEnabled(PSInputNum);
2250 if (
Info.hasWorkItemIDX()) {
2256 Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2260 if (
Info.hasWorkItemIDY()) {
2266 unsigned Reg = AMDGPU::VGPR1;
2274 if (
Info.hasWorkItemIDZ()) {
2280 unsigned Reg = AMDGPU::VGPR2;
2300 if (RegIdx == ArgVGPRs.
size()) {
2307 unsigned Reg = ArgVGPRs[RegIdx];
2309 assert(Reg != AMDGPU::NoRegister);
2319 unsigned NumArgRegs) {
2322 if (RegIdx == ArgSGPRs.
size())
2325 unsigned Reg = ArgSGPRs[RegIdx];
2327 assert(Reg != AMDGPU::NoRegister);
2341 assert(Reg != AMDGPU::NoRegister);
2367 const unsigned Mask = 0x3ff;
2370 if (
Info.hasWorkItemIDX()) {
2372 Info.setWorkItemIDX(Arg);
2375 if (
Info.hasWorkItemIDY()) {
2377 Info.setWorkItemIDY(Arg);
2380 if (
Info.hasWorkItemIDZ())
2392 const unsigned Mask = 0x3ff;
2417 if (
Info.hasImplicitArgPtr())
2425 if (
Info.hasWorkGroupIDX())
2428 if (
Info.hasWorkGroupIDY())
2431 if (
Info.hasWorkGroupIDZ())
2434 if (
Info.hasLDSKernelId())
2446 MF.
addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2453 MF.
addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2459 MF.
addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2467 MF.
addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2482 MF.
addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2488 MF.
addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2494 MF.
addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2509 unsigned LastExplicitArgOffset =
2512 bool InPreloadSequence =
true;
2514 for (
auto &Arg :
F.args()) {
2515 if (!InPreloadSequence || !Arg.hasInRegAttr())
2518 int ArgIdx = Arg.getArgNo();
2521 if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
2522 (
int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
2525 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2526 (
int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
2528 assert(ArgLocs[ArgIdx].isMemLoc());
2529 auto &ArgLoc = ArgLocs[InIdx];
2531 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2533 unsigned NumAllocSGPRs =
2534 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2537 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2538 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2539 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2543 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2544 unsigned PaddingSGPRs =
alignTo(Padding, 4) / 4;
2546 if (PaddingSGPRs + NumAllocSGPRs + 1 >
2548 InPreloadSequence =
false;
2554 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2556 Info.addPreloadedKernArg(
TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2558 if (PreloadRegs->
size() > 1)
2559 RC = &AMDGPU::SGPR_32RegClass;
2560 for (
auto &Reg : *PreloadRegs) {
2566 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2575 if (
Info.hasLDSKernelId()) {
2577 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2587 bool IsShader)
const {
2595 assert(!HasArchitectedSGPRs &&
"Unhandled feature for the subtarget");
2597 unsigned CurrentUserSGPRs =
Info.getNumUserSGPRs();
2601 unsigned NumRequiredSystemSGPRs =
Info.hasWorkGroupIDX() +
2602 Info.hasWorkGroupIDY() +
2603 Info.hasWorkGroupIDZ() +
2604 Info.hasWorkGroupInfo();
2605 for (
unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2607 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2612 if (!HasArchitectedSGPRs) {
2613 if (
Info.hasWorkGroupIDX()) {
2615 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2619 if (
Info.hasWorkGroupIDY()) {
2621 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2625 if (
Info.hasWorkGroupIDZ()) {
2627 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2632 if (
Info.hasWorkGroupInfo()) {
2634 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2638 if (
Info.hasPrivateSegmentWaveByteOffset()) {
2640 unsigned PrivateSegmentWaveByteOffsetReg;
2643 PrivateSegmentWaveByteOffsetReg =
2644 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2648 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2650 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2653 PrivateSegmentWaveByteOffsetReg =
Info.addPrivateSegmentWaveByteOffset();
2655 MF.
addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2656 CCInfo.
AllocateReg(PrivateSegmentWaveByteOffsetReg);
2660 Info.getNumPreloadedSGPRs() >= 16);
2675 if (HasStackObjects)
2676 Info.setHasNonSpillStackObjects(
true);
2681 HasStackObjects =
true;
2685 bool RequiresStackAccess = HasStackObjects || MFI.
hasCalls();
2687 if (!ST.enableFlatScratch()) {
2688 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.
getFunction())) {
2695 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2697 unsigned ReservedBufferReg =
TRI.reservedPrivateSegmentBufferReg(MF);
2707 Info.setScratchRSrcReg(ReservedBufferReg);
2726 if (!
MRI.isLiveIn(AMDGPU::SGPR32)) {
2727 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2734 for (
unsigned Reg : AMDGPU::SGPR_32RegClass) {
2735 if (!
MRI.isLiveIn(Reg)) {
2736 Info.setStackPtrOffsetReg(Reg);
2741 if (
Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2748 if (ST.getFrameLowering()->hasFP(MF)) {
2749 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2755 return !
Info->isEntryFunction();
2767 const MCPhysReg *IStart =
TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2776 if (AMDGPU::SReg_64RegClass.
contains(*
I))
2777 RC = &AMDGPU::SGPR_64RegClass;
2778 else if (AMDGPU::SReg_32RegClass.
contains(*
I))
2779 RC = &AMDGPU::SGPR_32RegClass;
2785 Entry->addLiveIn(*
I);
2790 for (
auto *Exit : Exits)
2792 TII->get(TargetOpcode::COPY), *
I)
2810 Fn,
"unsupported non-compute shaders with HSA",
DL.getDebugLoc());
2829 !
Info->hasLDSKernelId() && !
Info->hasWorkItemIDX() &&
2830 !
Info->hasWorkItemIDY() && !
Info->hasWorkItemIDZ());
2838 !
Info->hasWorkGroupIDZ());
2857 if ((
Info->getPSInputAddr() & 0x7F) == 0 ||
2858 ((
Info->getPSInputAddr() & 0xF) == 0 &&
Info->isPSInputAllocated(11))) {
2861 Info->markPSInputAllocated(0);
2862 Info->markPSInputEnabled(0);
2873 unsigned PsInputBits =
Info->getPSInputAddr() &
Info->getPSInputEnable();
2874 if ((PsInputBits & 0x7F) == 0 ||
2875 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2878 }
else if (IsKernel) {
2881 Splits.
append(Ins.begin(), Ins.end());
2894 }
else if (!IsGraphics) {
2919 for (
unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2929 if (IsEntryFunc && VA.
isMemLoc()) {
2952 if (Arg.
isOrigArg() &&
Info->getArgInfo().PreloadKernArgs.count(i)) {
2956 int64_t OffsetDiff =
Offset - AlignDownOffset;
2963 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2974 NewArg = convertArgType(DAG, VT, MemVT,
DL, ArgVal,
2975 Ins[i].Flags.isSExt(), &Ins[i]);
2983 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
2986 if (PreloadRegs.
size() == 1) {
2987 Register VReg =
MRI.getLiveInVirtReg(PreloadRegs[0]);
2992 TRI->getRegSizeInBits(*RC)));
3000 for (
auto Reg : PreloadRegs) {
3007 PreloadRegs.size()),
3024 NewArg = convertArgType(DAG, VT, MemVT,
DL, NewArg,
3025 Ins[i].Flags.isSExt(), &Ins[i]);
3030 lowerKernargMemParameter(DAG, VT, MemVT,
DL, Chain,
Offset,
3031 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3036 dyn_cast<PointerType>(FType->
getParamType(Ins[i].getOrigArgIndex()));
3050 if (!IsEntryFunc && VA.
isMemLoc()) {
3051 SDValue Val = lowerStackParameter(DAG, VA,
DL, Chain, Arg);
3062 if (AMDGPU::VGPR_32RegClass.
contains(Reg))
3063 RC = &AMDGPU::VGPR_32RegClass;
3064 else if (AMDGPU::SGPR_32RegClass.
contains(Reg))
3065 RC = &AMDGPU::SGPR_32RegClass;
3126 Info->setBytesInStackArgArea(StackArgSize);
3128 return Chains.
empty() ? Chain :
3146 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3152 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3153 for (
unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3154 if (CCInfo.
isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3177 bool IsWaveEnd =
Info->returnsVoid() && IsShader;
3195 for (
unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.
size();
I != E;
3196 ++
I, ++RealRVLocIdx) {
3200 SDValue Arg = OutVals[RealRVLocIdx];
3228 if (!
Info->isEntryFunction()) {
3234 if (AMDGPU::SReg_64RegClass.
contains(*
I))
3236 else if (AMDGPU::SReg_32RegClass.
contains(*
I))
3252 return DAG.
getNode(Opc,
DL, MVT::Other, RetOps);
3337 auto &ArgUsageInfo =
3339 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3369 std::tie(OutgoingArg, ArgRC, ArgTy) =
3377 std::tie(IncomingArg, IncomingArgRC, Ty) =
3379 assert(IncomingArgRC == ArgRC);
3382 EVT ArgVT =
TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3390 InputReg = getImplicitArgPtr(DAG,
DL);
3392 std::optional<uint32_t> Id =
3394 if (Id.has_value()) {
3406 RegsToPass.emplace_back(OutgoingArg->
getRegister(), InputReg);
3410 unsigned SpecialArgOffset =
3424 std::tie(OutgoingArg, ArgRC, Ty) =
3427 std::tie(OutgoingArg, ArgRC, Ty) =
3430 std::tie(OutgoingArg, ArgRC, Ty) =
3445 const bool NeedWorkItemIDX = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-x");
3446 const bool NeedWorkItemIDY = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-y");
3447 const bool NeedWorkItemIDZ = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-z");
3464 InputReg = InputReg.
getNode() ?
3473 InputReg = InputReg.
getNode() ?
3477 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3478 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3488 IncomingArgX ? *IncomingArgX :
3489 IncomingArgY ? *IncomingArgY :
3490 *IncomingArgZ, ~0u);
3497 RegsToPass.emplace_back(OutgoingArg->
getRegister(), InputReg);
3538 if (Callee->isDivergent())
3545 const uint32_t *CallerPreserved =
TRI->getCallPreservedMask(MF, CallerCC);
3549 if (!CallerPreserved)
3552 bool CCMatch = CallerCC == CalleeCC;
3565 if (Arg.hasByValAttr())
3579 const uint32_t *CalleePreserved =
TRI->getCallPreservedMask(MF, CalleeCC);
3580 if (!
TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3589 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3623 if (IsChainCallConv) {
3627 RequestedExec = CLI.
Args.back();
3628 assert(RequestedExec.
Node &&
"No node for EXEC");
3633 assert(CLI.
Outs.back().OrigArgIndex == 2 &&
"Unexpected last arg");
3634 CLI.
Outs.pop_back();
3638 assert(CLI.
Outs.back().OrigArgIndex == 2 &&
"Exec wasn't split up");
3639 CLI.
Outs.pop_back();
3644 "Haven't popped all the pieces of the EXEC mask");
3655 bool IsSibCall =
false;
3669 "unsupported call to variadic function ");
3677 "unsupported required tail call to function ");
3682 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3686 "site marked musttail or on llvm.amdgcn.cs.chain");
3693 if (!TailCallOpt && IsTailCall)
3738 if (!IsSibCall || IsChainCallConv) {
3745 RegsToPass.emplace_back(IsChainCallConv
3746 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3747 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3754 MVT PtrVT = MVT::i32;
3757 for (
unsigned i = 0, e = ArgLocs.
size(); i != e; ++i) {
3785 RegsToPass.push_back(std::pair(VA.
getLocReg(), Arg));
3793 int32_t
Offset = LocMemOffset;
3800 unsigned OpSize = Flags.isByVal() ?
3806 ? Flags.getNonZeroByValAlign()
3833 if (Outs[i].Flags.isByVal()) {
3835 DAG.
getConstant(Outs[i].Flags.getByValSize(),
DL, MVT::i32);
3838 Outs[i].Flags.getNonZeroByValAlign(),
3840 nullptr, std::nullopt, DstInfo,
3846 DAG.
getStore(Chain,
DL, Arg, DstAddr, DstInfo, Alignment);
3852 if (!MemOpChains.
empty())
3858 for (
auto &RegToPass : RegsToPass) {
3860 RegToPass.second, InGlue);
3869 if (IsTailCall && !IsSibCall) {
3874 std::vector<SDValue> Ops;
3875 Ops.push_back(Chain);
3876 Ops.push_back(Callee);
3893 if (IsChainCallConv)
3894 Ops.push_back(RequestedExec.
Node);
3898 for (
auto &RegToPass : RegsToPass) {
3900 RegToPass.second.getValueType()));
3905 const uint32_t *Mask =
TRI->getCallPreservedMask(MF, CallConv);
3906 assert(Mask &&
"Missing call preserved mask for calling convention");
3916 MVT::Glue, GlueOps),
3921 Ops.push_back(InGlue);
3940 return DAG.
getNode(OPC,
DL, NodeTys, Ops);
3945 Chain = Call.getValue(0);
3946 InGlue = Call.getValue(1);
3948 uint64_t CalleePopBytes = NumBytes;
3967 EVT VT =
Op.getValueType();
3982 MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
3993 Tmp1 = DAG.
getNode(Opc, dl, VT, SP, ScaledSize);
3994 if (Alignment && *Alignment > StackAlign) {
4015 if (isa<ConstantSDNode>(
Size))
4022 if (
Op.getValueType() != MVT::i32)
4041 assert(
Op.getValueType() == MVT::i32);
4050 Op.getOperand(0), IntrinID, GetRoundBothImm);
4084 SDValue RoundModeTimesNumBits =
4104 TableEntry, EnumOffset);
4118 if (
auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4120 static_cast<uint32_t>(ConstMode->getZExtValue()),
4132 if (UseReducedTable) {
4138 SDValue RoundModeTimesNumBits =
4158 SDValue RoundModeTimesNumBits =
4167 NewMode = TruncTable;
4176 ReadFirstLaneID, NewMode);
4189 IntrinID, RoundBothImm, NewMode);
4195 if (
Op->isDivergent())
4198 switch (cast<MemSDNode>(
Op)->getAddressSpace()) {
4214 SDValue Src =
Op.getOperand(IsStrict ? 1 : 0);
4215 EVT SrcVT = Src.getValueType();
4224 EVT DstVT =
Op.getValueType();
4233 if (
Op.getValueType() != MVT::i64)
4247 Op.getOperand(0), IntrinID, ModeHwRegImm);
4249 Op.getOperand(0), IntrinID, TrapHwRegImm);
4263 if (
Op.getOperand(1).getValueType() != MVT::i64)
4275 ReadFirstLaneID, NewModeReg);
4277 ReadFirstLaneID, NewTrapReg);
4279 unsigned ModeHwReg =
4282 unsigned TrapHwReg =
4290 IntrinID, ModeHwRegImm, NewModeReg);
4293 IntrinID, TrapHwRegImm, NewTrapReg);
4300 .
Case(
"m0", AMDGPU::M0)
4301 .
Case(
"exec", AMDGPU::EXEC)
4302 .
Case(
"exec_lo", AMDGPU::EXEC_LO)
4303 .
Case(
"exec_hi", AMDGPU::EXEC_HI)
4304 .
Case(
"flat_scratch", AMDGPU::FLAT_SCR)
4305 .
Case(
"flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4306 .
Case(
"flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4309 if (Reg == AMDGPU::NoRegister) {
4323 case AMDGPU::EXEC_LO:
4324 case AMDGPU::EXEC_HI:
4325 case AMDGPU::FLAT_SCR_LO:
4326 case AMDGPU::FLAT_SCR_HI:
4331 case AMDGPU::FLAT_SCR:
4350 MI.setDesc(
TII->getKillTerminatorFromPseudo(
MI.getOpcode()));
4359static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4381 auto Next = std::next(
I);
4394 return std::pair(LoopBB, RemainderBB);
4401 auto I =
MI.getIterator();
4402 auto E = std::next(
I);
4424 Src->setIsKill(
false);
4440 Register Reg =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4443 BuildMI(*LoopBB,
I,
DL,
TII->get(AMDGPU::S_GETREG_B32), Reg)
4465 unsigned InitReg,
unsigned ResultReg,
unsigned PhiReg,
4466 unsigned InitSaveExecReg,
int Offset,
bool UseGPRIdxMode,
4475 Register PhiExec =
MRI.createVirtualRegister(BoolRC);
4476 Register NewExec =
MRI.createVirtualRegister(BoolRC);
4477 Register CurrentIdxReg =
MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4478 Register CondReg =
MRI.createVirtualRegister(BoolRC);
4486 BuildMI(LoopBB,
I,
DL,
TII->get(TargetOpcode::PHI), PhiExec)
4493 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4497 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4502 BuildMI(LoopBB,
I,
DL,
TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4503 : AMDGPU::S_AND_SAVEEXEC_B64),
4507 MRI.setSimpleHint(NewExec, CondReg);
4509 if (UseGPRIdxMode) {
4511 SGPRIdxReg = CurrentIdxReg;
4513 SGPRIdxReg =
MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4514 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4521 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4524 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4531 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4533 BuildMI(LoopBB,
I,
DL,
TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4534 : AMDGPU::S_XOR_B64_term), Exec)
4555 unsigned InitResultReg,
unsigned PhiReg,
int Offset,
4556 bool UseGPRIdxMode,
Register &SGPRIdxReg) {
4564 const auto *BoolXExecRC =
TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4566 Register SaveExec =
MRI.createVirtualRegister(BoolXExecRC);
4567 Register TmpExec =
MRI.createVirtualRegister(BoolXExecRC);
4568 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4569 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4584 InitResultReg, DstReg, PhiReg, TmpExec,
4585 Offset, UseGPRIdxMode, SGPRIdxReg);
4602static std::pair<unsigned, int>
4607 int NumElts =
TRI.getRegSizeInBits(*SuperRC) / 32;
4612 return std::pair(AMDGPU::sub0,
Offset);
4626 assert(
Idx->getReg() != AMDGPU::NoRegister);
4647 return Idx->getReg();
4649 Register Tmp =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4666 Register SrcReg =
TII->getNamedOperand(
MI, AMDGPU::OpName::src)->getReg();
4667 int Offset =
TII->getNamedOperand(
MI, AMDGPU::OpName::offset)->getImm();
4676 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4679 if (
TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4683 if (UseGPRIdxMode) {
4690 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
true);
4703 MI.eraseFromParent();
4712 Register PhiReg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4713 Register InitReg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4719 UseGPRIdxMode, SGPRIdxReg);
4723 if (UseGPRIdxMode) {
4725 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
true);
4727 BuildMI(*LoopBB, InsPt,
DL, GPRIDXDesc, Dst)
4732 BuildMI(*LoopBB, InsPt,
DL,
TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4737 MI.eraseFromParent();
4754 int Offset =
TII->getNamedOperand(
MI, AMDGPU::OpName::offset)->getImm();
4765 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4767 if (
Idx->getReg() == AMDGPU::NoRegister) {
4778 MI.eraseFromParent();
4783 if (
TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4787 if (UseGPRIdxMode) {
4791 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
false);
4800 const MCInstrDesc &MovRelDesc =
TII->getIndirectRegWriteMovRelPseudo(
4801 TRI.getRegSizeInBits(*VecRC), 32,
false);
4807 MI.eraseFromParent();
4817 Register PhiReg =
MRI.createVirtualRegister(VecRC);
4821 UseGPRIdxMode, SGPRIdxReg);
4824 if (UseGPRIdxMode) {
4826 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
false);
4828 BuildMI(*LoopBB, InsPt,
DL, GPRIDXDesc, Dst)
4834 const MCInstrDesc &MovRelDesc =
TII->getIndirectRegWriteMovRelPseudo(
4835 TRI.getRegSizeInBits(*VecRC), 32,
false);
4836 BuildMI(*LoopBB, InsPt,
DL, MovRelDesc, Dst)
4842 MI.eraseFromParent();
4857 bool isSGPR =
TRI->isSGPRClass(
MRI.getRegClass(SrcReg));
4885 Register LoopIterator =
MRI.createVirtualRegister(WaveMaskRegClass);
4886 Register InitalValReg =
MRI.createVirtualRegister(DstRegClass);
4888 Register AccumulatorReg =
MRI.createVirtualRegister(DstRegClass);
4889 Register ActiveBitsReg =
MRI.createVirtualRegister(WaveMaskRegClass);
4890 Register NewActiveBitsReg =
MRI.createVirtualRegister(WaveMaskRegClass);
4892 Register FF1Reg =
MRI.createVirtualRegister(DstRegClass);
4893 Register LaneValueReg =
MRI.createVirtualRegister(DstRegClass);
4895 bool IsWave32 = ST.isWave32();
4896 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4897 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4902 (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
4905 BuildMI(BB,
I,
DL,
TII->get(AMDGPU::S_MOV_B32), InitalValReg)
4910 I = ComputeLoop->end();
4912 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::PHI), AccumulatorReg)
4916 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::PHI), ActiveBitsReg)
4917 .
addReg(TmpSReg->getOperand(0).getReg())
4921 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
4922 auto FF1 =
BuildMI(*ComputeLoop,
I,
DL,
TII->get(SFFOpc), FF1Reg)
4923 .
addReg(ActiveBits->getOperand(0).getReg());
4924 auto LaneValue =
BuildMI(*ComputeLoop,
I,
DL,
4925 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
4927 .
addReg(FF1->getOperand(0).getReg());
4928 auto NewAccumulator =
BuildMI(*ComputeLoop,
I,
DL,
TII->get(Opc), DstReg)
4930 .
addReg(LaneValue->getOperand(0).getReg());
4933 unsigned BITSETOpc =
4934 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
4935 auto NewActiveBits =
4936 BuildMI(*ComputeLoop,
I,
DL,
TII->get(BITSETOpc), NewActiveBitsReg)
4937 .
addReg(FF1->getOperand(0).getReg())
4938 .
addReg(ActiveBits->getOperand(0).getReg());
4941 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
4942 .addMBB(ComputeLoop);
4943 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
4944 .addMBB(ComputeLoop);
4947 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
4949 .
addReg(NewActiveBits->getOperand(0).getReg())
4951 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::S_CBRANCH_SCC1))
4956 MI.eraseFromParent();
4967 switch (
MI.getOpcode()) {
4968 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
4970 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
4972 case AMDGPU::S_UADDO_PSEUDO:
4973 case AMDGPU::S_USUBO_PSEUDO: {
4980 unsigned Opc = (
MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
4982 : AMDGPU::S_SUB_I32;
4989 MI.eraseFromParent();
4992 case AMDGPU::S_ADD_U64_PSEUDO:
4993 case AMDGPU::S_SUB_U64_PSEUDO: {
5002 bool IsAdd = (
MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5004 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5012 Register DestSub0 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5013 Register DestSub1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5016 MI,
MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5018 MI,
MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5021 MI,
MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5023 MI,
MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5025 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5026 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5039 MI.eraseFromParent();
5042 case AMDGPU::V_ADD_U64_PSEUDO:
5043 case AMDGPU::V_SUB_U64_PSEUDO: {
5049 bool IsAdd = (
MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5055 if (IsAdd && ST.hasLshlAddB64()) {
5061 TII->legalizeOperands(*
Add);
5062 MI.eraseFromParent();
5066 const auto *CarryRC =
TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5068 Register DestSub0 =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5069 Register DestSub1 =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5071 Register CarryReg =
MRI.createVirtualRegister(CarryRC);
5072 Register DeadCarryReg =
MRI.createVirtualRegister(CarryRC);
5076 : &AMDGPU::VReg_64RegClass;
5079 : &AMDGPU::VReg_64RegClass;
5082 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5084 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5087 MI,
MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5089 MI,
MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5092 MI,
MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5094 MI,
MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5096 unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5103 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5117 TII->legalizeOperands(*LoHalf);
5118 TII->legalizeOperands(*HiHalf);
5119 MI.eraseFromParent();
5122 case AMDGPU::S_ADD_CO_PSEUDO:
5123 case AMDGPU::S_SUB_CO_PSEUDO: {
5137 unsigned Opc = (
MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5138 ? AMDGPU::S_ADDC_U32
5139 : AMDGPU::S_SUBB_U32;
5141 Register RegOp0 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5142 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5147 Register RegOp1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5148 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5152 Register RegOp2 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5154 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5160 unsigned WaveSize =
TRI->getRegSizeInBits(*Src2RC);
5161 assert(WaveSize == 64 || WaveSize == 32);
5163 if (WaveSize == 64) {
5164 if (ST.hasScalarCompareEq64()) {
5170 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5172 MII,
MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5174 MII,
MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5175 Register Src2_32 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5177 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::S_OR_B32), Src2_32)
5194 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5200 MI.eraseFromParent();
5203 case AMDGPU::SI_INIT_M0: {
5205 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5206 .
add(
MI.getOperand(0));
5207 MI.eraseFromParent();
5210 case AMDGPU::GET_GROUPSTATICSIZE: {
5215 .
add(
MI.getOperand(0))
5217 MI.eraseFromParent();
5220 case AMDGPU::GET_SHADERCYCLESHILO: {
5234 using namespace AMDGPU::Hwreg;
5235 Register RegHi1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5237 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5238 Register RegLo1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5240 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
5241 Register RegHi2 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5243 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5247 Register RegLo =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5252 .
add(
MI.getOperand(0))
5257 MI.eraseFromParent();
5260 case AMDGPU::SI_INDIRECT_SRC_V1:
5261 case AMDGPU::SI_INDIRECT_SRC_V2:
5262 case AMDGPU::SI_INDIRECT_SRC_V4:
5263 case AMDGPU::SI_INDIRECT_SRC_V8:
5264 case AMDGPU::SI_INDIRECT_SRC_V9:
5265 case AMDGPU::SI_INDIRECT_SRC_V10:
5266 case AMDGPU::SI_INDIRECT_SRC_V11:
5267 case AMDGPU::SI_INDIRECT_SRC_V12:
5268 case AMDGPU::SI_INDIRECT_SRC_V16:
5269 case AMDGPU::SI_INDIRECT_SRC_V32:
5271 case AMDGPU::SI_INDIRECT_DST_V1:
5272 case AMDGPU::SI_INDIRECT_DST_V2:
5273 case AMDGPU::SI_INDIRECT_DST_V4:
5274 case AMDGPU::SI_INDIRECT_DST_V8:
5275 case AMDGPU::SI_INDIRECT_DST_V9:
5276 case AMDGPU::SI_INDIRECT_DST_V10:
5277 case AMDGPU::SI_INDIRECT_DST_V11:
5278 case AMDGPU::SI_INDIRECT_DST_V12:
5279 case AMDGPU::SI_INDIRECT_DST_V16:
5280 case AMDGPU::SI_INDIRECT_DST_V32:
5282 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5283 case AMDGPU::SI_KILL_I1_PSEUDO:
5285 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5294 Register SrcCond =
MI.getOperand(3).getReg();
5296 Register DstLo =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5297 Register DstHi =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5298 const auto *CondRC =
TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5299 Register SrcCondCopy =
MRI.createVirtualRegister(CondRC);
5303 : &AMDGPU::VReg_64RegClass;
5306 : &AMDGPU::VReg_64RegClass;
5309 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5311 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5314 MI,
MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5316 MI,
MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5319 MI,
MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5321 MI,
MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5343 MI.eraseFromParent();
5346 case AMDGPU::SI_BR_UNDEF: {
5350 .
add(
MI.getOperand(0));
5352 MI.eraseFromParent();
5355 case AMDGPU::ADJCALLSTACKUP:
5356 case AMDGPU::ADJCALLSTACKDOWN: {
5363 case AMDGPU::SI_CALL_ISEL: {
5367 unsigned ReturnAddrReg =
TII->getRegisterInfo().getReturnAddressReg(*MF);
5370 MIB =
BuildMI(*BB,
MI,
DL,
TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5376 MI.eraseFromParent();
5379 case AMDGPU::V_ADD_CO_U32_e32:
5380 case AMDGPU::V_SUB_CO_U32_e32:
5381 case AMDGPU::V_SUBREV_CO_U32_e32: {
5384 unsigned Opc =
MI.getOpcode();
5386 bool NeedClampOperand =
false;
5387 if (
TII->pseudoToMCOpcode(Opc) == -1) {
5389 NeedClampOperand =
true;
5393 if (
TII->isVOP3(*
I)) {
5398 I.add(
MI.getOperand(1))
5399 .add(
MI.getOperand(2));
5400 if (NeedClampOperand)
5403 TII->legalizeOperands(*
I);
5405 MI.eraseFromParent();
5408 case AMDGPU::V_ADDC_U32_e32:
5409 case AMDGPU::V_SUBB_U32_e32:
5410 case AMDGPU::V_SUBBREV_U32_e32:
5413 TII->legalizeOperands(
MI);
5415 case AMDGPU::DS_GWS_INIT:
5416 case AMDGPU::DS_GWS_SEMA_BR:
5417 case AMDGPU::DS_GWS_BARRIER:
5418 TII->enforceOperandRCAlignment(
MI, AMDGPU::OpName::data0);
5420 case AMDGPU::DS_GWS_SEMA_V:
5421 case AMDGPU::DS_GWS_SEMA_P:
5422 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5430 case AMDGPU::S_SETREG_B32: {
5445 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5446 const unsigned SetMask = WidthMask <<
Offset;
5449 unsigned SetDenormOp = 0;
5450 unsigned SetRoundOp = 0;
5458 SetRoundOp = AMDGPU::S_ROUND_MODE;
5459 SetDenormOp = AMDGPU::S_DENORM_MODE;
5461 SetRoundOp = AMDGPU::S_ROUND_MODE;
5463 SetDenormOp = AMDGPU::S_DENORM_MODE;
5466 if (SetRoundOp || SetDenormOp) {
5469 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5470 unsigned ImmVal = Def->getOperand(1).getImm();
5484 MI.eraseFromParent();
5493 MI.setDesc(
TII->get(AMDGPU::S_SETREG_B32_mode));
5497 case AMDGPU::S_INVERSE_BALLOT_U32:
5498 case AMDGPU::S_INVERSE_BALLOT_U64:
5501 MI.setDesc(
TII->get(AMDGPU::COPY));
5503 case AMDGPU::ENDPGM_TRAP: {
5506 MI.setDesc(
TII->get(AMDGPU::S_ENDPGM));
5524 MI.eraseFromParent();
5527 case AMDGPU::SIMULATED_TRAP: {
5531 TII->insertSimulatedTrap(
MRI, *BB,
MI,
MI.getDebugLoc());
5532 MI.eraseFromParent();
5569 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5656 EVT VT =
N->getValueType(0);
5660 if (VT == MVT::f16) {
5676 unsigned Opc =
Op.getOpcode();
5677 EVT VT =
Op.getValueType();
5678 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5679 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5680 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5681 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5699 unsigned Opc =
Op.getOpcode();
5700 EVT VT =
Op.getValueType();
5701 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5702 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5703 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5704 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5723 unsigned Opc =
Op.getOpcode();
5724 EVT VT =
Op.getValueType();
5725 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
5726 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
5727 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5728 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
5729 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
5730 VT == MVT::v32bf16);
5736 : std::pair(Op0, Op0);
5755 switch (
Op.getOpcode()) {
5761 assert((!Result.getNode() ||
5762 Result.getNode()->getNumValues() == 2) &&
5763 "Load should return a value and a chain");
5767 EVT VT =
Op.getValueType();
5769 return lowerFSQRTF32(
Op, DAG);
5771 return lowerFSQRTF64(
Op, DAG);
5776 return LowerTrig(
Op, DAG);
5785 return LowerGlobalAddress(MFI,
Op, DAG);
5792 return lowerINSERT_SUBVECTOR(
Op, DAG);
5794 return lowerINSERT_VECTOR_ELT(
Op, DAG);
5796 return lowerEXTRACT_VECTOR_ELT(
Op, DAG);
5798 return lowerVECTOR_SHUFFLE(
Op, DAG);
5800 return lowerSCALAR_TO_VECTOR(
Op, DAG);
5802 return lowerBUILD_VECTOR(
Op, DAG);
5805 return lowerFP_ROUND(
Op, DAG);
5810 if (
Op.getOperand(0)->getValueType(0) != MVT::f32)
5814 int RoundMode =
Op.getConstantOperandVal(1);
5822 return DAG.
getNode(Opc,
DL,
Op.getNode()->getVTList(),
Op->getOperand(0));
5825 return lowerTRAP(
Op, DAG);
5827 return lowerDEBUGTRAP(
Op, DAG);
5836 return lowerFMINNUM_FMAXNUM(
Op, DAG);
5839 return lowerFLDEXP(
Op, DAG);
5866 return lowerMUL(
Op, DAG);
5869 return lowerXMULO(
Op, DAG);
5872 return lowerXMUL_LOHI(
Op, DAG);
5905 EVT FittingLoadVT = LoadVT;
5937SDValue SITargetLowering::adjustLoadValueType(
unsigned Opcode,
5941 bool IsIntrinsic)
const {
5945 EVT LoadVT =
M->getValueType(0);
5947 EVT EquivLoadVT = LoadVT;
5966 VTList, Ops,
M->getMemoryVT(),
5967 M->getMemOperand());
5978 EVT LoadVT =
M->getValueType(0);
5984 assert(
M->getNumValues() == 2 ||
M->getNumValues() == 3);
5985 bool IsTFE =
M->getNumValues() == 3;
5998 return handleByteShortBufferLoads(DAG, LoadVT,
DL, Ops,
M->getMemOperand(),
6002 return getMemIntrinsicNode(Opc,
DL,
M->getVTList(), Ops, IntVT,
6003 M->getMemOperand(), DAG);
6008 SDValue MemNode = getMemIntrinsicNode(Opc,
DL, VTList, Ops, CastVT,
6009 M->getMemOperand(), DAG);
6017 EVT VT =
N->getValueType(0);
6018 unsigned CondCode =
N->getConstantOperandVal(3);
6029 EVT CmpVT =
LHS.getValueType();
6030 if (CmpVT == MVT::i16 && !TLI.
isTypeLegal(MVT::i16)) {
6051 EVT VT =
N->getValueType(0);
6053 unsigned CondCode =
N->getConstantOperandVal(3);
6062 if (CmpVT == MVT::f16 && !TLI.
isTypeLegal(CmpVT)) {
6080 EVT VT =
N->getValueType(0);
6087 Src.getOperand(1), Src.getOperand(2));
6098 Exec = AMDGPU::EXEC_LO;
6100 Exec = AMDGPU::EXEC;
6117 EVT VT =
N->getValueType(0);
6119 unsigned IID =
N->getConstantOperandVal(0);
6120 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6121 IID == Intrinsic::amdgcn_permlanex16;
6129 case Intrinsic::amdgcn_permlane16:
6130 case Intrinsic::amdgcn_permlanex16:
6135 case Intrinsic::amdgcn_writelane:
6138 case Intrinsic::amdgcn_readlane:
6141 case Intrinsic::amdgcn_readfirstlane:
6142 case Intrinsic::amdgcn_permlane64:
6152 if (
SDNode *GL =
N->getGluedNode()) {
6154 GL = GL->getOperand(0).getNode();
6164 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6166 Src1 =
N->getOperand(2);
6167 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
6168 Src2 =
N->getOperand(3);
6171 if (ValSize == 32) {
6186 if (IID == Intrinsic::amdgcn_writelane) {
6191 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
6193 return IsFloat ? DAG.
getBitcast(VT, Trunc) : Trunc;
6196 if (ValSize % 32 != 0)
6200 EVT VT =
N->getValueType(0);
6204 unsigned NumOperands =
N->getNumOperands();
6206 SDNode *GL =
N->getGluedNode();
6211 for (
unsigned i = 0; i != NE; ++i) {
6212 for (
unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
6214 SDValue Operand =
N->getOperand(j);
6245 return unrollLaneOp(LaneOp.
getNode());
6252 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
6253 for (
unsigned i = 0, EltIdx = 0; i < ValSize / 32; i++) {
6261 if (IID == Intrinsic::amdgcn_writelane)
6267 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
6268 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
6285 if (IID == Intrinsic::amdgcn_writelane)
6288 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
6296 switch (
N->getOpcode()) {
6308 unsigned IID =
N->getConstantOperandVal(0);
6310 case Intrinsic::amdgcn_make_buffer_rsrc:
6311 Results.push_back(lowerPointerAsRsrcIntrin(
N, DAG));
6313 case Intrinsic::amdgcn_cvt_pkrtz: {
6322 case Intrinsic::amdgcn_cvt_pknorm_i16:
6323 case Intrinsic::amdgcn_cvt_pknorm_u16:
6324 case Intrinsic::amdgcn_cvt_pk_i16:
6325 case Intrinsic::amdgcn_cvt_pk_u16: {
6331 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6333 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6335 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6340 EVT VT =
N->getValueType(0);
6349 case Intrinsic::amdgcn_s_buffer_load: {
6361 EVT VT =
Op.getValueType();
6362 assert(VT == MVT::i8 &&
"Expected 8-bit s_buffer_load intrinsics.\n");
6374 if (!
Offset->isDivergent()) {
6393 LoadVal = handleByteShortBufferLoads(DAG, VT,
DL, Ops, MMO);
6405 for (
unsigned I = 0;
I < Res.getNumOperands();
I++) {
6406 Results.push_back(Res.getOperand(
I));
6410 Results.push_back(Res.getValue(1));
6419 EVT VT =
N->getValueType(0);
6424 EVT SelectVT = NewVT;
6425 if (NewVT.
bitsLT(MVT::i32)) {
6428 SelectVT = MVT::i32;
6434 if (NewVT != SelectVT)
6440 if (
N->getValueType(0) != MVT::v2f16)
6453 if (
N->getValueType(0) != MVT::v2f16)
6466 if (
N->getValueType(0) != MVT::f16)
6484 if (
I.getUse().get() !=
Value)
6487 if (
I->getOpcode() == Opcode)
6493unsigned SITargetLowering::isCFIntrinsic(
const SDNode *
Intr)
const {
6495 switch (
Intr->getConstantOperandVal(1)) {
6496 case Intrinsic::amdgcn_if:
6498 case Intrinsic::amdgcn_else:
6500 case Intrinsic::amdgcn_loop:
6502 case Intrinsic::amdgcn_end_cf:
6550 SDNode *
Intr = BRCOND.getOperand(1).getNode();
6563 assert(BR &&
"brcond missing unconditional branch user");
6564 Target = BR->getOperand(1);
6567 unsigned CFNode = isCFIntrinsic(
Intr);
6586 Ops.
append(
Intr->op_begin() + (HaveChain ? 2 : 1),
Intr->op_end());
6616 for (
unsigned i = 1, e =
Intr->getNumValues() - 1; i != e; ++i) {
6633 Intr->getOperand(0));
6640 MVT VT =
Op.getSimpleValueType();
6643 if (
Op.getConstantOperandVal(0) != 0)
6649 if (
Info->isEntryFunction())
6667 return Op.getValueType().bitsLE(VT) ?
6674 assert(
Op.getValueType() == MVT::f16 &&
6675 "Do not know how to custom lower FP_ROUND for non-f16 type");
6678 EVT SrcVT = Src.getValueType();
6679 if (SrcVT != MVT::f64)
6695 EVT VT =
Op.getValueType();
6698 bool IsIEEEMode =
Info->getMode().IEEE;
6707 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
6715 EVT VT =
Op.getValueType();
6719 EVT ExpVT =
Exp.getValueType();
6720 if (ExpVT == MVT::i16)
6741 {
Op.getOperand(0),
Op.getOperand(1), TruncExp});
6749 EVT VT =
Op.getValueType();
6755 assert(VT == MVT::i64 &&
"The following code is a special for s_mul_u64");
6782 if (
Op->isDivergent())
6795 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
6797 DAG.
getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
6800 if (Op0SignBits >= 33 && Op1SignBits >= 33)
6802 DAG.
getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
6808 EVT VT =
Op.getValueType();
6815 const APInt &
C = RHSC->getAPIntValue();
6817 if (
C.isPowerOf2()) {
6819 bool UseArithShift =
isSigned && !
C.isMinSignedValue();
6824 SL, VT, Result, ShiftAmt),
6844 if (
Op->isDivergent()) {
6861 return lowerTrapEndpgm(
Op, DAG);
6864 lowerTrapHsaQueuePtr(
Op, DAG);
6867SDValue SITargetLowering::lowerTrapEndpgm(
6875 const SDLoc &
DL,
Align Alignment, ImplicitParameter Param)
const {
6885SDValue SITargetLowering::lowerTrapHsaQueuePtr(
6895 loadImplicitKernelArgument(DAG, MVT::i64, SL,
Align(8),
QUEUE_PTR);
6901 if (UserSGPR == AMDGPU::NoRegister) {
6926SDValue SITargetLowering::lowerTrapHsa(
6952 "debugtrap handler not supported",
6968SDValue SITargetLowering::getSegmentAperture(
unsigned AS,
const SDLoc &
DL,
6972 ? AMDGPU::SRC_SHARED_BASE
6973 : AMDGPU::SRC_PRIVATE_BASE;
6996 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
7005 return loadImplicitKernelArgument(DAG, MVT::i32,
DL,
Align(4), Param);
7011 if (UserSGPR == AMDGPU::NoRegister) {
7018 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
7041 if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
7042 isa<BasicBlockSDNode>(Val))
7045 if (
auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
7046 return ConstVal->getSExtValue() !=
TM.getNullPointerValue(AddrSpace);
7060 unsigned DestAS, SrcAS;
7062 bool IsNonNull =
false;
7063 if (
const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(
Op)) {
7064 SrcAS = ASC->getSrcAddressSpace();
7065 Src = ASC->getOperand(0);
7066 DestAS = ASC->getDestAddressSpace();
7069 Op.getConstantOperandVal(0) ==
7070 Intrinsic::amdgcn_addrspacecast_nonnull);
7071 Src =
Op->getOperand(1);
7072 SrcAS =
Op->getConstantOperandVal(2);
7073 DestAS =
Op->getConstantOperandVal(3);
7088 unsigned NullVal =
TM.getNullPointerValue(DestAS);
7102 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
7110 unsigned NullVal =
TM.getNullPointerValue(SrcAS);
7122 Op.getValueType() == MVT::i64) {
7131 Src.getValueType() == MVT::i64)
7155 EVT InsVT =
Ins.getValueType();
7158 unsigned IdxVal =
Idx->getAsZExtVal();
7163 assert(InsNumElts % 2 == 0 &&
"expect legal vector types");
7168 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
7170 MVT::i32, InsNumElts / 2);
7175 for (
unsigned I = 0;
I != InsNumElts / 2; ++
I) {
7177 if (InsNumElts == 2) {
7190 for (
unsigned I = 0;
I != InsNumElts; ++
I) {
7212 auto KIdx = dyn_cast<ConstantSDNode>(
Idx);
7213 if (NumElts == 4 && EltSize == 16 && KIdx) {
7224 unsigned Idx = KIdx->getZExtValue();
7225 bool InsertLo =
Idx < 2;
7227 InsertLo ? LoVec : HiVec,
7242 if (isa<ConstantSDNode>(
Idx))
7248 assert(VecSize <= 64 &&
"Expected target vector size to be <= 64 bits");
7254 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
7270 DAG.
getNOT(SL, BFM, IntVT), BCVec);
7282 EVT ResultVT =
Op.getValueType();
7295 if (
SDValue Combined = performExtractVectorEltCombine(
Op.getNode(), DCI))
7298 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
7303 if (VecSize == 128) {
7311 }
else if (VecSize == 256) {
7314 for (
unsigned P = 0;
P < 4; ++
P) {
7320 Parts[0], Parts[1]));
7322 Parts[2], Parts[3]));
7328 for (
unsigned P = 0;
P < 8; ++
P) {
7335 Parts[0], Parts[1], Parts[2], Parts[3]));
7338 Parts[4], Parts[5],Parts[6], Parts[7]));
7341 EVT IdxVT =
Idx.getValueType();
7358 Src = DAG.
getBitcast(Src.getValueType().changeTypeToInteger(), Src);
7373 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
7383 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7389 EVT ResultVT =
Op.getValueType();
7392 EVT PackVT = ResultVT.
isInteger() ? MVT::v2i16 : MVT::v2f16;
7394 int SrcNumElts =
Op.getOperand(0).getValueType().getVectorNumElements();
7410 int VecIdx =
Idx < SrcNumElts ? 0 : 1;
7411 int EltIdx =
Idx < SrcNumElts ?
Idx :
Idx - SrcNumElts;
7419 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
7420 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
7421 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
7422 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
7441 EVT ResultVT =
Op.getValueType();
7457 EVT VT =
Op.getValueType();
7459 if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
7460 VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
7479 { CastLo, CastHi });
7483 if (VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16) {
7490 for (
unsigned P = 0;
P < 4; ++
P)
7491 Parts[
P].push_back(
Op.getOperand(
I +
P * E));
7494 for (
unsigned P = 0;
P < 4; ++
P) {
7504 if (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v32bf16) {
7511 for (
unsigned P = 0;
P < 8; ++
P)
7512 Parts[
P].push_back(
Op.getOperand(
I +
P * E));
7515 for (
unsigned P = 0;
P < 8; ++
P) {
7525 assert(VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16);
7577 assert(isInt<32>(
Offset + 4) &&
"32-bit offset is expected!");
7615 EVT PtrVT =
Op.getValueType();
7631 assert(PtrVT == MVT::i32 &&
"32-bit pointer is expected.");
7704 SDValue Param = lowerKernargMemParameter(
7714 "non-hsa intrinsic with hsa target",
7723 "intrinsic not supported on subtarget",
7733 unsigned NumElts = Elts.
size();
7735 if (NumElts <= 12) {
7744 for (
unsigned i = 0; i < Elts.
size(); ++i) {
7750 for (
unsigned i = Elts.
size(); i < NumElts; ++i)
7751 VecElts[i] = DAG.
getUNDEF(MVT::f32);
7760 EVT SrcVT = Src.getValueType();
7781 bool Unpacked,
bool IsD16,
int DMaskPop,
7782 int NumVDataDwords,
bool IsAtomicPacked16Bit,
7785 EVT ReqRetVT = ResultTypes[0];
7787 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
7788 ? (ReqRetNumElts + 1) / 2
7791 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
7793 MVT DataDwordVT = NumDataDwords == 1 ?
7796 MVT MaskPopVT = MaskPopDwords == 1 ?
7802 if (DMaskPop > 0 &&
Data.getValueType() != MaskPopVT) {
7813 if (DataDwordVT.
isVector() && !IsAtomicPacked16Bit)
7815 NumDataDwords - MaskPopDwords);
7820 EVT LegalReqRetVT = ReqRetVT;
7822 if (!
Data.getValueType().isInteger())
7824 Data.getValueType().changeTypeToInteger(),
Data);
7845 if (Result->getNumValues() == 1)
7852 SDValue *LWE,
bool &IsTexFail) {
7853 auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.
getNode());
7872 unsigned DimIdx,
unsigned EndIdx,
7873 unsigned NumGradients) {
7875 for (
unsigned I = DimIdx;
I < EndIdx;
I++) {
7883 if (((
I + 1) >= EndIdx) ||
7884 ((NumGradients / 2) % 2 == 1 && (
I == DimIdx + (NumGradients / 2) - 1 ||
7885 I == DimIdx + NumGradients - 1))) {
7886 if (
Addr.getValueType() != MVT::i16)
unsigned IntrOpcode = Intr->BaseOpcode;
int NumVDataDwords = 0;
bool AdjustRetType = false;
bool IsAtomicPacked16Bit = false;
const unsigned ArgOffset = WithChain ? 2 : 1;
unsigned DMaskLanes = 0;
if (BaseOpcode->Atomic) {
  VData = Op.getOperand(2);
  IsAtomicPacked16Bit =
      (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
       Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
  if (BaseOpcode->AtomicX2) {
    ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
    DMask = Is64Bit ? 0xf : 0x3;
    NumVDataDwords = Is64Bit ? 4 : 2;
    DMask = Is64Bit ? 0x3 : 0x1;
    NumVDataDwords = Is64Bit ? 2 : 1;
  DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
if (BaseOpcode->Store) {
  VData = Op.getOperand(2);
  VData = handleD16VData(VData, DAG, true);
} else if (!BaseOpcode->NoReturn) {
  (!LoadVT.isVector() && DMaskLanes > 1))
  NumVDataDwords = (DMaskLanes + 1) / 2;
  NumVDataDwords = DMaskLanes;
  AdjustRetType = true;
unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
  if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
    assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
    {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
    "Bias needs to be converted to 16 bit in A16 mode");
if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
  LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
                       "require 16 bit args for both gradients and addresses");
if (!ST->hasA16()) {
  LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
                       "support 16 bit addresses\n");
if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
  IntrOpcode = G16MappingInfo->G16;
ArgOffset + Intr->GradientStart,
ArgOffset + Intr->CoordStart, Intr->NumGradients);
for (unsigned I = ArgOffset + Intr->GradientStart;
     I < ArgOffset + Intr->CoordStart; I++)
ArgOffset + Intr->CoordStart, VAddrEnd,
for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
const bool UseNSA = ST->hasNSAEncoding() &&
                    VAddrs.size() >= ST->getNSAThreshold(MF) &&
                    (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
const bool UsePartialNSA =
    UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
if (UsePartialNSA) {
  ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
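// Commentary (inferred, not in the original source): NSA (non-sequential
// address) encodings let each image address component live in its own VGPR
// instead of one contiguous register tuple. NSA is chosen once the address
// count reaches the subtarget threshold and either fits NSAMaxSize or the
// subtarget has the partial-NSA encoding; with partial NSA the first
// NSAMaxSize - 1 components stay separate and the remainder (the
// drop_front above) is packed into one contiguous tuple.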
if (!BaseOpcode->Sampler) {
  Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
  Unorm = UnormConst ? True : False;
SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
bool IsTexFail = false;
if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
NumVDataDwords += 1;
AdjustRetType = true;
if (AdjustRetType) {
  if (DMaskLanes == 0 && !BaseOpcode->Store) {
    if (isa<MemSDNode>(Op))
  EVT NewVT = NumVDataDwords > 1 ?
  ResultTypes[0] = NewVT;
  if (ResultTypes.size() == 3) {
    ResultTypes.erase(&ResultTypes[1]);
unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
if (BaseOpcode->Atomic)
if (BaseOpcode->Store || BaseOpcode->Atomic)
if (UsePartialNSA) {
if (BaseOpcode->Sampler)
if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
if (BaseOpcode->HasD16)
if (isa<MemSDNode>(Op))
int NumVAddrDwords =
NumVDataDwords, NumVAddrDwords);
} else if (IsGFX11Plus) {
  UseNSA ? AMDGPU::MIMGEncGfx11NSA : AMDGPU::MIMGEncGfx11Default,
  NumVDataDwords, NumVAddrDwords);
} else if (IsGFX10Plus) {
  UseNSA ? AMDGPU::MIMGEncGfx10NSA : AMDGPU::MIMGEncGfx10Default,
  NumVDataDwords, NumVAddrDwords);
NumVDataDwords, NumVAddrDwords);
"requested image instruction is not supported on this GPU");
NumVDataDwords, NumVAddrDwords);
NumVDataDwords, NumVAddrDwords);
if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
if (BaseOpcode->AtomicX2) {
if (BaseOpcode->NoReturn)
NumVDataDwords, IsAtomicPacked16Bit, DL);
if (!Offset->isDivergent()) {
return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
unsigned NumLoads = 1;
if (NumElts == 8 || NumElts == 16) {
  NumLoads = NumElts / 4;
setBufferOffsets(Offset, DAG, &Ops[3],
                 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
for (unsigned i = 0; i < NumLoads; ++i) {
if (NumElts == 8 || NumElts == 16)
EVT VT = Op.getValueType();
unsigned IntrinsicID = Op.getConstantOperandVal(0);
switch (IntrinsicID) {
case Intrinsic::amdgcn_implicit_buffer_ptr: {
  return getPreloadedValue(DAG, *MFI, VT,
case Intrinsic::amdgcn_dispatch_ptr:
case Intrinsic::amdgcn_queue_ptr: {
  MF.getFunction(), "unsupported hsa intrinsic without hsa target",
  auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
  return getPreloadedValue(DAG, *MFI, VT, RegID);
case Intrinsic::amdgcn_implicitarg_ptr: {
  return getImplicitArgPtr(DAG, DL);
  return getPreloadedValue(DAG, *MFI, VT,
case Intrinsic::amdgcn_kernarg_segment_ptr: {
  return getPreloadedValue(DAG, *MFI, VT,
case Intrinsic::amdgcn_dispatch_id: {
case Intrinsic::amdgcn_rcp:
case Intrinsic::amdgcn_rsq:
case Intrinsic::amdgcn_rsq_legacy:
case Intrinsic::amdgcn_rcp_legacy:
case Intrinsic::amdgcn_rsq_clamp: {
case Intrinsic::r600_read_ngroups_x:
  return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
case Intrinsic::r600_read_ngroups_y:
  return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
case Intrinsic::r600_read_ngroups_z:
  return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
case Intrinsic::r600_read_global_size_x:
  return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
case Intrinsic::r600_read_global_size_y:
  return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
case Intrinsic::r600_read_global_size_z:
  return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
case Intrinsic::r600_read_local_size_x:
  return lowerImplicitZextParam(DAG, Op, MVT::i16,
case Intrinsic::r600_read_local_size_y:
  return lowerImplicitZextParam(DAG, Op, MVT::i16,
case Intrinsic::r600_read_local_size_z:
  return lowerImplicitZextParam(DAG, Op, MVT::i16,
case Intrinsic::amdgcn_workgroup_id_x:
  return getPreloadedValue(DAG, *MFI, VT,
case Intrinsic::amdgcn_workgroup_id_y:
  return getPreloadedValue(DAG, *MFI, VT,
case Intrinsic::amdgcn_workgroup_id_z:
  return getPreloadedValue(DAG, *MFI, VT,
case Intrinsic::amdgcn_wave_id:
  return lowerWaveID(DAG, Op);
case Intrinsic::amdgcn_lds_kernel_id: {
  return getLDSKernelId(DAG, DL);
  return getPreloadedValue(DAG, *MFI, VT,
case Intrinsic::amdgcn_workitem_id_x:
  return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
case Intrinsic::amdgcn_workitem_id_y:
  return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
case Intrinsic::amdgcn_workitem_id_z:
  return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
case Intrinsic::amdgcn_wavefrontsize:
case Intrinsic::amdgcn_s_buffer_load: {
  unsigned CPol = Op.getConstantOperandVal(3);
  return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
                      Op.getOperand(3),
case Intrinsic::amdgcn_fdiv_fast:
  return lowerFDIV_FAST(Op, DAG);
case Intrinsic::amdgcn_sin:
case Intrinsic::amdgcn_cos:
case Intrinsic::amdgcn_mul_u24:
case Intrinsic::amdgcn_mul_i24:
case Intrinsic::amdgcn_log_clamp: {
case Intrinsic::amdgcn_fract:
case Intrinsic::amdgcn_class:
  Op.getOperand(1), Op.getOperand(2));
case Intrinsic::amdgcn_div_fmas:
  Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
case Intrinsic::amdgcn_div_fixup:
  Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_div_scale: {
  SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
  Denominator, Numerator);
case Intrinsic::amdgcn_icmp: {
  if (Op.getOperand(1).getValueType() == MVT::i1 &&
      Op.getConstantOperandVal(2) == 0 &&
case Intrinsic::amdgcn_fcmp: {
case Intrinsic::amdgcn_ballot:
case Intrinsic::amdgcn_fmed3:
  Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_fdot2:
  Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
case Intrinsic::amdgcn_fmul_legacy:
  Op.getOperand(1), Op.getOperand(2));
case Intrinsic::amdgcn_sffbh:
case Intrinsic::amdgcn_sbfe:
  Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_ubfe:
  Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_cvt_pkrtz:
case Intrinsic::amdgcn_cvt_pknorm_i16:
case Intrinsic::amdgcn_cvt_pknorm_u16:
case Intrinsic::amdgcn_cvt_pk_i16:
case Intrinsic::amdgcn_cvt_pk_u16: {
  EVT VT = Op.getValueType();
  if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
  else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
  else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
  else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
  return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
  Op.getOperand(1), Op.getOperand(2));
case Intrinsic::amdgcn_fmad_ftz:
  Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_if_break:
  Op->getOperand(1), Op->getOperand(2)), 0);
case Intrinsic::amdgcn_groupstaticsize: {
case Intrinsic::amdgcn_is_shared:
case Intrinsic::amdgcn_is_private: {
  unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ?
  SDValue Aperture = getSegmentAperture(AS, SL, DAG);
case Intrinsic::amdgcn_perm:
  Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_reloc_constant: {
  auto RelocSymbol = cast<GlobalVariable>(
case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
  if (Op.getOperand(4).getValueType() == MVT::i32)
  Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
  Op.getOperand(3), IndexKeyi32);
case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
  if (Op.getOperand(6).getValueType() == MVT::i32)
  {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
   Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
   IndexKeyi32, Op.getOperand(7)});
case Intrinsic::amdgcn_addrspacecast_nonnull:
  return lowerADDRSPACECAST(Op, DAG);
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_writelane:
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16:
case Intrinsic::amdgcn_permlane64:
  return lowerImage(Op, ImageDimIntr, DAG, false);
return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
unsigned NewOpcode) const {
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
auto *M = cast<MemSDNode>(Op);
M->getMemOperand());
unsigned NewOpcode) const {
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
auto *M = cast<MemSDNode>(Op);
M->getMemOperand());
unsigned IntrID = Op.getConstantOperandVal(1);
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap: {
  unsigned IndexOperand = M->getConstantOperandVal(7);
  unsigned WaveRelease = M->getConstantOperandVal(8);
  unsigned WaveDone = M->getConstantOperandVal(9);
  unsigned OrderedCountIndex = IndexOperand & 0x3f;
  IndexOperand &= ~0x3f;
  unsigned CountDw = 0;
  CountDw = (IndexOperand >> 24) & 0xf;
  IndexOperand &= ~(0xf << 24);
  if (CountDw < 1 || CountDw > 4) {
    "ds_ordered_count: dword count must be between 1 and 4");
  if (WaveDone && !WaveRelease)
  unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
  unsigned ShaderType =
  unsigned Offset0 = OrderedCountIndex << 2;
  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
  Offset1 |= (CountDw - 1) << 6;
  Offset1 |= ShaderType << 2;
  unsigned Offset = Offset0 | (Offset1 << 8);
  M->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand());
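// Commentary (inferred, not in the original source): the two 8-bit DS
// offset fields are packed as Offset = Offset0 | (Offset1 << 8). For
// example, ds_ordered_add with OrderedCountIndex = 1, WaveRelease = 1,
// WaveDone = 0, CountDw = 1 and ShaderType = 0 gives
// Offset0 = 1 << 2 = 4 and Offset1 = 1, so Offset = 4 | (1 << 8) = 0x104.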
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_ptr_buffer_load:
case Intrinsic::amdgcn_raw_atomic_buffer_load:
case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
case Intrinsic::amdgcn_raw_buffer_load_format:
case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
  const bool IsFormat =
      IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
      IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
  auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
  auto *M = cast<MemSDNode>(Op);
  return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
case Intrinsic::amdgcn_struct_buffer_load:
case Intrinsic::amdgcn_struct_ptr_buffer_load:
case Intrinsic::amdgcn_struct_buffer_load_format:
case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
case Intrinsic::amdgcn_struct_atomic_buffer_load:
case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
  const bool IsFormat =
      IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
      IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
  auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
  return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
case Intrinsic::amdgcn_raw_tbuffer_load:
case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
  EVT LoadVT = Op.getValueType();
  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
  auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
  Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
case Intrinsic::amdgcn_struct_tbuffer_load:
case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
  EVT LoadVT = Op.getValueType();
  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
  auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
  Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
case Intrinsic::amdgcn_raw_buffer_atomic_swap:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
case Intrinsic::amdgcn_raw_buffer_atomic_add:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
case Intrinsic::amdgcn_raw_buffer_atomic_sub:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
case Intrinsic::amdgcn_raw_buffer_atomic_smin:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
case Intrinsic::amdgcn_raw_buffer_atomic_umin:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
case Intrinsic::amdgcn_raw_buffer_atomic_smax:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
case Intrinsic::amdgcn_raw_buffer_atomic_umax:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
case Intrinsic::amdgcn_raw_buffer_atomic_and:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
case Intrinsic::amdgcn_raw_buffer_atomic_or:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
case Intrinsic::amdgcn_raw_buffer_atomic_xor:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
case Intrinsic::amdgcn_raw_buffer_atomic_inc:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
case Intrinsic::amdgcn_raw_buffer_atomic_dec:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
  return lowerRawBufferAtomicIntrin(Op, DAG,
case Intrinsic::amdgcn_struct_buffer_atomic_swap:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
  return lowerStructBufferAtomicIntrin(Op, DAG,
case Intrinsic::amdgcn_struct_buffer_atomic_add:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
case Intrinsic::amdgcn_struct_buffer_atomic_sub:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
case Intrinsic::amdgcn_struct_buffer_atomic_smin:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  return lowerStructBufferAtomicIntrin(Op, DAG,
case Intrinsic::amdgcn_struct_buffer_atomic_umin:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  return lowerStructBufferAtomicIntrin(Op, DAG,
case Intrinsic::amdgcn_struct_buffer_atomic_smax:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  return lowerStructBufferAtomicIntrin(Op, DAG,
case Intrinsic::amdgcn_struct_buffer_atomic_umax:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  return lowerStructBufferAtomicIntrin(Op, DAG,
case Intrinsic::amdgcn_struct_buffer_atomic_and:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
case Intrinsic::amdgcn_struct_buffer_atomic_or:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
case Intrinsic::amdgcn_struct_buffer_atomic_xor:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
case Intrinsic::amdgcn_struct_buffer_atomic_inc:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
case Intrinsic::amdgcn_struct_buffer_atomic_dec:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
  return lowerStructBufferAtomicIntrin(Op, DAG,
case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
  auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
  EVT VT = Op.getValueType();
  auto *M = cast<MemSDNode>(Op);
  Op->getVTList(), Ops, VT, M->getMemOperand());
case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
  SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
  auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
  EVT VT = Op.getValueType();
  auto *M = cast<MemSDNode>(Op);
  Op->getVTList(), Ops, VT, M->getMemOperand());
case Intrinsic::amdgcn_image_bvh_intersect_ray: {
  SDValue NodePtr = M->getOperand(2);
  SDValue RayExtent = M->getOperand(3);
  SDValue RayOrigin = M->getOperand(4);
  SDValue RayInvDir = M->getOperand(6);
  const unsigned NumVDataDwords = 4;
  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
  const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
  const unsigned BaseOpcodes[2][2] = {
      {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
      {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
       AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
  IsGFX12Plus ? AMDGPU::MIMGEncGfx12
  : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
              : AMDGPU::MIMGEncGfx10NSA,
  NumVDataDwords, NumVAddrDwords);
  IsGFX11 ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default,
  NumVDataDwords, NumVAddrDwords);
  auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
    if (Lanes[0].getValueSizeInBits() == 32) {
      for (unsigned I = 0; I < 3; ++I)
      {Lanes[0], Lanes[1]})));
      {Elt0, Lanes[0]})));
      {Lanes[1], Lanes[2]})));
  if (UseNSA && IsGFX11Plus) {
    for (unsigned I = 0; I < 3; ++I) {
      {DirLanes[I], InvDirLanes[I]})));
  packLanes(RayOrigin, true);
  packLanes(RayDir, true);
  packLanes(RayInvDir, false);
  if (NumVAddrDwords > 12) {
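// Commentary (inferred, not in the original source): with 16-bit addresses
// (a16) the three f16 lanes of each ray vector pack two per dword, which
// is why NumVAddrDwords drops from 12/11 to 9/8 above. packLanes keeps
// 32-bit lanes as-is; for f16 it packs pairs such as {Lanes[0], Lanes[1]}
// when the vector starts dword-aligned, and straddles dwords as
// {Elt0, Lanes[0]} plus {Lanes[1], Lanes[2]} when it does not.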
case Intrinsic::amdgcn_global_atomic_fmin:
case Intrinsic::amdgcn_global_atomic_fmax:
case Intrinsic::amdgcn_global_atomic_fmin_num:
case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmax:
case Intrinsic::amdgcn_flat_atomic_fmin_num:
case Intrinsic::amdgcn_flat_atomic_fmax_num: {
  unsigned Opcode = 0;
  case Intrinsic::amdgcn_global_atomic_fmin:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmin_num: {
  case Intrinsic::amdgcn_global_atomic_fmax:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fmax_num: {
  Ops, M->getMemOperand());
case Intrinsic::amdgcn_s_get_barrier_state: {
  bool IsInlinableBarID = false;
  if (isa<ConstantSDNode>(Op->getOperand(2))) {
    BarID = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue();
  if (IsInlinableBarID) {
    Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
    Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
return lowerImage(Op, ImageDimIntr, DAG, true);
SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode,
                                              const SDLoc &DL,
bool IsTFE = VTList.NumVTs == 3;
unsigned NumOpDWords = NumValueDWords + 1;
SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
                                 OpDWordsVT, OpDWordsMMO, DAG);
(VT == MVT::v3i32 || VT == MVT::v3f32)) {
WidenedMemVT, WidenedMMO);
bool ImageStore) const {
for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
if ((NumElements % 2) == 1) {
  unsigned I = Elts.size() / 2;
if (NumElements == 3) {
unsigned IntrinsicID = Op.getConstantOperandVal(1);
switch (IntrinsicID) {
case Intrinsic::amdgcn_exp_compr: {
  "intrinsic not supported on subtarget", DL.getDebugLoc());
  unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
case Intrinsic::amdgcn_s_barrier: {
  unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
  if (WGSize <= ST.getWavefrontSize())
    Op.getOperand(0)), 0);
  if (ST.hasSplitBarriers()) {
    MVT::Other, K, Op.getOperand(0)),
case Intrinsic::amdgcn_struct_tbuffer_store:
case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
  VData = handleD16VData(VData, DAG);
  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
  auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
  M->getMemoryVT(), M->getMemOperand());
case Intrinsic::amdgcn_raw_tbuffer_store:
case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
  VData = handleD16VData(VData, DAG);
  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
  auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
  M->getMemoryVT(), M->getMemOperand());
case Intrinsic::amdgcn_raw_buffer_store:
case Intrinsic::amdgcn_raw_ptr_buffer_store:
case Intrinsic::amdgcn_raw_buffer_store_format:
case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
  const bool IsFormat =
      IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
      IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
  VData = handleD16VData(VData, DAG);
  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
  auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
  return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
  M->getMemoryVT(), M->getMemOperand());
case Intrinsic::amdgcn_struct_buffer_store:
case Intrinsic::amdgcn_struct_ptr_buffer_store:
case Intrinsic::amdgcn_struct_buffer_store_format:
case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
  const bool IsFormat =
      IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
      IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
  VData = handleD16VData(VData, DAG);
  auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
  auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
  return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
  M->getMemoryVT(), M->getMemOperand());
case Intrinsic::amdgcn_raw_buffer_load_lds:
case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
case Intrinsic::amdgcn_struct_buffer_load_lds:
case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
  IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
  IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
  unsigned OpOffset = HasVIndex ? 1 : 0;
  SDValue VOffset = Op.getOperand(5 + OpOffset);
  unsigned Size = Op->getConstantOperandVal(4);
  Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
                               : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
                  : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
                               : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
  Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
                               : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
                  : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
                               : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
  Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
                               : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
                  : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
                               : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
  if (HasVIndex && HasVOffset)
  else if (HasVOffset)
  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
  unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
  auto *M = cast<MemSDNode>(Op);
case Intrinsic::amdgcn_global_load_lds: {
  unsigned Size = Op->getConstantOperandVal(4);
  Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
  Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
  Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
  auto *M = cast<MemSDNode>(Op);
  if (LHS->isDivergent())
  RHS.getOperand(0).getValueType() == MVT::i32) {
  VOffset = RHS.getOperand(0);
  if (!Addr->isDivergent()) {
  LoadPtrI.Offset = Op->getConstantOperandVal(5);
case Intrinsic::amdgcn_end_cf:
  Op->getOperand(2), Chain), 0);
case Intrinsic::amdgcn_s_barrier_init:
case Intrinsic::amdgcn_s_barrier_join:
case Intrinsic::amdgcn_s_wakeup_barrier: {
  bool IsInlinableBarID = false;
  if (isa<ConstantSDNode>(BarOp)) {
    BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue();
  if (IsInlinableBarID) {
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_s_barrier_init:
      Opc = AMDGPU::S_BARRIER_INIT_IMM;
    case Intrinsic::amdgcn_s_barrier_join:
      Opc = AMDGPU::S_BARRIER_JOIN_IMM;
    case Intrinsic::amdgcn_s_wakeup_barrier:
      Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_s_barrier_init:
      Opc = AMDGPU::S_BARRIER_INIT_M0;
    case Intrinsic::amdgcn_s_barrier_join:
      Opc = AMDGPU::S_BARRIER_JOIN_M0;
    case Intrinsic::amdgcn_s_wakeup_barrier:
      Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
  if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) {
    if (!IsInlinableBarID) {
      Op.getOperand(2), M0Val),
  } else if (!IsInlinableBarID) {
return lowerImage(Op, ImageDimIntr, DAG, true);
std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
if ((C1 = dyn_cast<ConstantSDNode>(N0)))
unsigned Overflow = ImmOffset & ~MaxImm;
ImmOffset -= Overflow;
if ((int32_t)Overflow < 0) {
  Overflow += ImmOffset;
SDValue Ops[] = {N0, OverflowVal};
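// Commentary (inferred, not in the original source): splitBufferOffsets
// divides a combined byte offset into the part that fits the MUBUF
// immediate field and an overflow carried in a register. Assuming
// MaxImm = 4095 (a 12-bit immediate), a combined offset of 5000 gives
// Overflow = 5000 & ~4095 = 4096 and ImmOffset = 904; the
// (int32_t)Overflow < 0 branch re-folds the immediate back into the
// register part when the masked overflow is negative as a signed value.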
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                        Align Alignment) const {
if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
  if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
  return MaybePointer;
SDValue NumRecords = Op->getOperand(3);
auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
std::optional<uint32_t> ConstStride = std::nullopt;
if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
  ConstStride = ConstNode->getZExtValue();
if (!ConstStride || *ConstStride != 0) {
  ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
NewHighHalf, NumRecords, Flags);
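// Commentary (inferred, not in the original source): a buffer resource
// descriptor is built here from the 64-bit base pointer split into two
// dwords; the stride field starts at bit 16 of the second dword, hence the
// *ConstStride << 16 above (a non-constant stride is shifted at runtime
// instead), with NumRecords and Flags filling the remaining words.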
bool IsTFE) const {
SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
if (VDataType == MVT::f16 || VDataType == MVT::bf16)
Ops[1] = BufferStoreExt;
M->getMemOperand());
SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
                                    DAGCombinerInfo &DCI) const {
if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
    "unexpected vector extload");
    "unexpected fp extload");
DCI.AddToWorklist(Cvt.getNode());
DCI.AddToWorklist(Cvt.getNode());
if (Info.isEntryFunction())
  return Info.getUserSGPRInfo().hasFlatScratchInit();
EVT MemVT = Load->getMemoryVT();
EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
BasePtr, RealMemVT, MMO);
assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
       "Custom lowering for non-i32 vectors hasn't been implemented.");
unsigned AS = Load->getAddressSpace();
if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
Alignment >= Align(4) && NumElements < 32) {
if (NumElements > 4)
if (NumElements > 2)
if (NumElements > 4)
auto Flags = Load->getMemOperand()->getFlags();
Load->getAlign(), Flags, &Fast) &&
MemVT, *Load->getMemOperand())) {
EVT VT = Op.getValueType();
EVT VT = Op.getValueType();
bool AllowInaccurateRcp = Flags.hasApproximateFuncs() ||
if (!AllowInaccurateRcp && VT != MVT::f16)
if (CLHS->isExactlyValue(1.0)) {
if (CLHS->isExactlyValue(-1.0)) {
if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
EVT VT = Op.getValueType();
bool AllowInaccurateDiv = Flags.hasApproximateFuncs() ||
if (!AllowInaccurateDiv)
return DAG.getNode(Opcode, SL, VT, A, B, Flags);
return DAG.getNode(Opcode, SL, VTList,
return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
return DAG.getNode(Opcode, SL, VTList,
if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
  return FastLowered;
const APFloat K0Val(0x1p+96f);
const APFloat K1Val(0x1p-32f);
assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
  return FastLowered;
Flags.setNoFPExcept(true);
DenominatorScaled, Flags);
DenominatorScaled, Flags);
using namespace AMDGPU::Hwreg;
const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
const bool HasDynamicDenormals =
if (!PreservesDenormals) {
  if (HasDynamicDenormals) {
    SavedDenormMode = SDValue(GetReg, 0);
  const SDValue EnableDenormValue =
  EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
                                    {EnableDenormValue, BitField, Glue});
ApproxRcp, One, NegDivScale0, Flags);
ApproxRcp, Fma0, Flags);
Fma1, Fma1, Flags);
NumeratorScaled, Mul, Flags);
Fma2, Fma1, Mul, Fma2, Flags);
NumeratorScaled, Fma3, Flags);
if (!PreservesDenormals) {
  Fma4.getValue(1), DisableDenormValue,
  assert(HasDynamicDenormals == (bool)SavedDenormMode);
  const SDValue DisableDenormValue =
      HasDynamicDenormals
  AMDGPU::S_SETREG_B32, SL, MVT::Other,
{Fma4, Fma1, Fma3, Scale}, Flags);
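// Commentary (inferred, not in the original source): this is the usual f32
// division sequence built on V_DIV_SCALE/V_RCP. With scaled numerator n
// and denominator d, Fma0 = 1 - d*r refines the raw reciprocal r into
// Fma1 = r + r*Fma0; Mul = n*Fma1 is the first quotient estimate,
// Fma2 = n - d*Mul its residual, Fma3 = Mul + Fma1*Fma2 the refined
// quotient, and Fma4 = n - d*Fma3 the final residual handed to
// V_DIV_FMAS/V_DIV_FIXUP together with Fma1, Fma3, and the scale.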
if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
  return FastLowered;
NegDivScale0, Mul, DivScale1);
Fma4, Fma3, Mul, Scale);
EVT VT = Op.getValueType();
if (VT == MVT::f32)
  return LowerFDIV32(Op, DAG);
if (VT == MVT::f64)
  return LowerFDIV64(Op, DAG);
if (VT == MVT::f16)
  return LowerFDIV16(Op, DAG);
EVT ResultExpVT = Op->getValueType(1);
EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
if (VT == MVT::i1) {
  Store->getBasePtr(), MVT::i1, Store->getMemOperand());
Store->getValue().getValueType().getScalarType() == MVT::i32);
unsigned AS = Store->getAddressSpace();
if (NumElements > 4)
VT, *Store->getMemOperand()))
if (NumElements > 2)
if (NumElements > 4 ||
auto Flags = Store->getMemOperand()->getFlags();
MVT VT = Op.getValueType().getSimpleVT();
EVT VT = Op.getValueType();
switch (Op.getOpcode()) {
EVT VT = Op.getValueType();
DAGCombinerInfo &DCI) const {
EVT VT = N->getValueType(0);
if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
EVT SrcVT = Src.getValueType();
if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
DCI.AddToWorklist(Cvt.getNode());
if (ScalarVT != MVT::f32) {
DAGCombinerInfo &DCI) const {
SDValue MagnitudeOp = N->getOperand(0);
SDValue SignOp = N->getOperand(1);
unsigned AddrSpace,
DAGCombinerInfo &DCI) const {
AM.HasBaseReg = true;
AM.BaseOffs = Offset.getSExtValue();
EVT VT = N->getValueType(0);
Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
switch (N->getOpcode()) {
DAGCombinerInfo &DCI) const {
SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
                                      N->getMemoryVT(), DCI);
NewOps[PtrIdx] = NewPtr;
return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
       (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
SDValue SITargetLowering::splitBinaryBitConstantOp(
    DAGCombinerInfo &DCI,
if (V.getValueType() != MVT::i1)
switch (V.getOpcode()) {
if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
uint32_t NonZeroByteMask = ~ZeroByteMask;
if ((NonZeroByteMask & C) != NonZeroByteMask)
assert(V.getValueSizeInBits() == 32);
if (V.getNumOperands() != 2)
switch (V.getOpcode()) {
  return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
  return (0x03020100 & ~ConstMask) | ConstMask;
  return uint32_t((0x030201000c0c0c0cull << C) >> 32);
  return uint32_t(0x0c0c0c0c03020100ull >> C);
DAGCombinerInfo &DCI) const {
if (DCI.isBeforeLegalize())
EVT VT = N->getValueType(0);
if (VT == MVT::i64 && CRHS) {
if (CRHS && VT == MVT::i32) {
if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
  unsigned Shift = CShift->getZExtValue();
  unsigned Offset = NB + Shift;
  if ((Offset & (Bits - 1)) == 0) {
    LHS->getOperand(0),
isa<ConstantSDNode>(LHS.getOperand(2))) {
Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
if (X != LHS.getOperand(1))
(RHS.getOperand(0) == LHS.getOperand(0) &&
 LHS.getOperand(0) == LHS.getOperand(1))) {
Mask->getZExtValue() & ~OrdMask :
Mask->getZExtValue() & OrdMask;
if (VT == MVT::i32 &&
N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
if (LHSMask != ~0u && RHSMask != ~0u) {
  if (LHSMask > RHSMask) {
  uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
  uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
  if (!(LHSUsedLanes & RHSUsedLanes) &&
      !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
    for (unsigned I = 0; I < 32; I += 8) {
      if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
        Mask &= (0x0c << I) & 0xffffffff;
    LHS.getOperand(0), RHS.getOperand(0),
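// Commentary (inferred, not in the original source): V_PERM_B32 selects
// each byte of the result dword from two source dwords; a byte-select
// value of 0x0c produces a constant zero byte. The expression
// ~(Mask & 0x0c0c0c0c) & 0x0c0c0c0c therefore marks which result bytes
// actually read a source lane, and two perms can be merged into one when
// those used-lane sets do not intersect.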
static const std::optional<ByteProvider<SDValue>>
    unsigned Depth = 0) {
  return std::nullopt;
if (Op.getValueSizeInBits() < 8)
  return std::nullopt;
if (Op.getValueType().isVector())
switch (Op->getOpcode()) {
  auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
  NarrowVT = VTSign->getVT();
  return std::nullopt;
  if (SrcIndex >= NarrowByteWidth)
    return std::nullopt;
  auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    return std::nullopt;
  uint64_t BitShift = ShiftOp->getZExtValue();
  if (BitShift % 8 != 0)
    return std::nullopt;
  SrcIndex += BitShift / 8;
static const std::optional<ByteProvider<SDValue>>
    unsigned StartingIndex = 0) {
  return std::nullopt;
unsigned BitWidth = Op.getScalarValueSizeInBits();
  return std::nullopt;
  return std::nullopt;
bool IsVec = Op.getValueType().isVector();
switch (Op.getOpcode()) {
  return std::nullopt;
  return std::nullopt;
  return std::nullopt;
  if (!LHS->isConstantZero() && !RHS->isConstantZero())
    return std::nullopt;
  if (!LHS || LHS->isConstantZero())
  if (!RHS || RHS->isConstantZero())
  return std::nullopt;
  return std::nullopt;
  auto BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    return std::nullopt;
  uint32_t BitMask = BitMaskOp->getZExtValue();
  if ((IndexMask & BitMask) != IndexMask) {
    if (IndexMask & BitMask)
      return std::nullopt;
  return std::nullopt;
  auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
  if (!ShiftOp || Op.getValueType().isVector())
    return std::nullopt;
  uint64_t BitsProvided = Op.getValueSizeInBits();
  if (BitsProvided % 8 != 0)
    return std::nullopt;
  uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
    return std::nullopt;
  uint64_t ConcatSizeInBytes = BitsProvided / 4;
  uint64_t ByteShift = BitShift / 8;
  uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
  uint64_t BytesProvided = BitsProvided / 8;
  SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
  NewIndex %= BytesProvided;
  return std::nullopt;
  auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    return std::nullopt;
  uint64_t BitShift = ShiftOp->getZExtValue();
    return std::nullopt;
  auto BitsProvided = Op.getScalarValueSizeInBits();
  if (BitsProvided % 8 != 0)
    return std::nullopt;
  uint64_t BytesProvided = BitsProvided / 8;
  uint64_t ByteShift = BitShift / 8;
  return BytesProvided - ByteShift > Index
  return std::nullopt;
  auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    return std::nullopt;
  uint64_t BitShift = ShiftOp->getZExtValue();
  if (BitShift % 8 != 0)
    return std::nullopt;
  uint64_t ByteShift = BitShift / 8;
  return Index < ByteShift
         Depth + 1, StartingIndex);
  return std::nullopt;
  auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
  NarrowBitWidth = VTSign->getVT().getSizeInBits();
  if (NarrowBitWidth % 8 != 0)
    return std::nullopt;
  uint64_t NarrowByteWidth = NarrowBitWidth / 8;
  if (Index >= NarrowByteWidth)
    ? std::optional<ByteProvider<SDValue>>(
  return std::nullopt;
  if (NarrowByteWidth >= Index) {
  return std::nullopt;
  return std::nullopt;
  auto L = cast<LoadSDNode>(Op.getNode());
  unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
  if (NarrowBitWidth % 8 != 0)
    return std::nullopt;
  uint64_t NarrowByteWidth = NarrowBitWidth / 8;
  if (Index >= NarrowByteWidth) {
    ? std::optional<ByteProvider<SDValue>>(
  if (NarrowByteWidth > Index) {
  return std::nullopt;
  return std::nullopt;
  Depth + 1, StartingIndex);
  auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    return std::nullopt;
  auto VecIdx = IdxOp->getZExtValue();
  auto ScalarSize = Op.getScalarValueSizeInBits();
  if (ScalarSize < 32)
    Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
  StartingIndex, Index);
  return std::nullopt;
  auto PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
    return std::nullopt;
  (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
  if (IdxMask > 0x07 && IdxMask != 0x0c)
    return std::nullopt;
  auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
  auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
  return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
  return std::nullopt;
return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
auto MemVT = L->getMemoryVT();
return L->getMemoryVT().getSizeInBits() == 16;
int Low8 = Mask & 0xff;
int Hi8 = (Mask & 0xff00) >> 8;
assert(Low8 < 8 && Hi8 < 8);
bool IsConsecutive = (Hi8 - Low8 == 1);
bool Is16Aligned = !(Low8 % 2);
return IsConsecutive && Is16Aligned;
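// Commentary (inferred, not in the original source): a byte pair of a perm
// mask can be folded into a 16-bit operation only if it addresses an
// existing half-word: the two selects must be in range (< 8), consecutive
// (Hi8 - Low8 == 1), and start on a 2-byte boundary (Low8 even).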
int Low16 = PermMask & 0xffff;
int Hi16 = (PermMask & 0xffff0000) >> 16;
auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
if (!OtherOpIs16Bit)
unsigned DWordOffset) {
auto TypeSize = Src.getValueSizeInBits().getFixedValue();
assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
if (Src.getValueType().isVector()) {
  auto ScalarTySize = Src.getScalarValueSizeInBits();
  auto ScalarTy = Src.getValueType().getScalarType();
  if (ScalarTySize == 32) {
  if (ScalarTySize > 32) {
    DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
    auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
  assert(ScalarTySize < 32);
  auto NumElements = TypeSize / ScalarTySize;
  auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
  auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
  auto NumElementsIn32 = 32 / ScalarTySize;
  auto NumAvailElements = DWordOffset < Trunc32Elements
                              : NumElements - NormalizedTrunc;
auto ShiftVal = 32 * DWordOffset;
[[maybe_unused]] EVT VT = N->getValueType(0);
for (int i = 0; i < 4; i++) {
  std::optional<ByteProvider<SDValue>> P =
  if (!P || P->isConstantZero())
if (PermNodes.size() != 4)
std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
std::optional<std::pair<unsigned, unsigned>> SecondSrc;
for (size_t i = 0; i < PermNodes.size(); i++) {
  auto PermOp = PermNodes[i];
  int SrcByteAdjust = 4;
  if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
      ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
    if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
        ((PermOp.SrcOffset / 4) != SecondSrc->second))
    SecondSrc = {i, PermNodes[i].SrcOffset / 4};
    assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
  assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
  PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
SDValue Op = *PermNodes[FirstSrc.first].Src;
assert(Op.getValueSizeInBits() == 32);
int Low16 = PermMask & 0xffff;
int Hi16 = (PermMask & 0xffff0000) >> 16;
bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
if (WellFormedLow && WellFormedHi)
SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
assert(Op.getValueType().isByteSized() &&
DAGCombinerInfo &DCI) const {
EVT VT = N->getValueType(0);
if (VT == MVT::i1) {
if (Src != RHS.getOperand(0))
if (!CLHS || !CRHS)
static const uint32_t MaxMask = 0x3ff;
isa<ConstantSDNode>(LHS.getOperand(2))) {
Sel |= LHS.getConstantOperandVal(2);
N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
auto usesCombinedOperand = [](SDNode *OrUse) {
  !OrUse->getValueType(0).isVector())
  for (auto VUse : OrUse->uses()) {
    if (!VUse->getValueType(0).isVector())
    if (VUse->getOpcode() == VectorwiseOp)
if (!any_of(N->uses(), usesCombinedOperand))
if (LHSMask != ~0u && RHSMask != ~0u) {
  if (LHSMask > RHSMask) {
  uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
  uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
  if (!(LHSUsedLanes & RHSUsedLanes) &&
      !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
    LHSMask &= ~RHSUsedLanes;
    RHSMask &= ~LHSUsedLanes;
    LHSMask |= LHSUsedLanes & 0x04040404;
    LHS.getOperand(0), RHS.getOperand(0),
if (LHSMask == ~0u || RHSMask == ~0u) {
if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
if (SrcVT == MVT::i32) {
  DCI.AddToWorklist(LowOr.getNode());
  DCI.AddToWorklist(HiBits.getNode());
const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
    N->getOperand(0), CRHS))
DAGCombinerInfo &DCI) const {
if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
EVT VT = N->getValueType(0);
if (CRHS && VT == MVT::i64) {
LHS->getOperand(0), FNegLHS, FNegRHS);
DAGCombinerInfo &DCI) const {
EVT VT = N->getValueType(0);
if (VT != MVT::i32)
if (Src.getValueType() != MVT::i16)
SDValue
SITargetLowering::performSignExtendInRegCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
auto *VTSign = cast<VTSDNode>(N->getOperand(1));
VTSign->getVT() == MVT::i8) ||
VTSign->getVT() == MVT::i16))) {
"s_buffer_load_{u8, i8} are supported "
"in GFX12 (or newer) architectures.");
EVT VT = Src.getValueType();
SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
auto *M = cast<MemSDNode>(Src);
SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
    Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
VTSign->getVT() == MVT::i8) ||
VTSign->getVT() == MVT::i16)) &&
auto *M = cast<MemSDNode>(Src);
SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
                                     Src.getOperand(0).getValueType());
SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
    Opc, SDLoc(N), Ops, M->getMemoryVT(), M->getMemOperand());
return DCI.DAG.getMergeValues({BufferLoadSignExt,
DAGCombinerInfo &DCI) const {
if (N->getOperand(0).isUndef())
DAGCombinerInfo &DCI) const {
EVT VT = N->getValueType(0);
return DCI.DAG.getConstantFP(
unsigned Opcode = Op.getOpcode();
if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
  const auto &F = CFP->getValueAPF();
  if (F.isNaN() && F.isSignaling())
  if (!F.isDenormal())
if (Op.getValueType() == MVT::i32) {
  if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
    if (RHS->getZExtValue() == 0xffff0000) {
return Op.getValueType().getScalarType() != MVT::f16;
if (Op.getValueType() == MVT::i16) {
unsigned IntrinsicID = Op.getConstantOperandVal(0);
switch (IntrinsicID) {
case Intrinsic::amdgcn_cvt_pkrtz:
case Intrinsic::amdgcn_cubeid:
case Intrinsic::amdgcn_frexp_mant:
case Intrinsic::amdgcn_fdot2:
case Intrinsic::amdgcn_rcp:
case Intrinsic::amdgcn_rsq:
case Intrinsic::amdgcn_rsq_clamp:
case Intrinsic::amdgcn_rcp_legacy:
case Intrinsic::amdgcn_rsq_legacy:
case Intrinsic::amdgcn_trig_preop:
case Intrinsic::amdgcn_log:
case Intrinsic::amdgcn_exp2:
case Intrinsic::amdgcn_sqrt:
unsigned Opcode = MI->getOpcode();
if (Opcode == AMDGPU::G_FCANONICALIZE)
std::optional<FPValueAndVReg> FCR;
if (FCR->Value.isSignaling())
if (!FCR->Value.isDenormal())
case AMDGPU::G_FADD:
case AMDGPU::G_FSUB:
case AMDGPU::G_FMUL:
case AMDGPU::G_FCEIL:
case AMDGPU::G_FFLOOR:
case AMDGPU::G_FRINT:
case AMDGPU::G_FNEARBYINT:
case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
case AMDGPU::G_INTRINSIC_TRUNC:
case AMDGPU::G_INTRINSIC_ROUNDEVEN:
case AMDGPU::G_FMA:
case AMDGPU::G_FMAD:
case AMDGPU::G_FSQRT:
case AMDGPU::G_FDIV:
case AMDGPU::G_FREM:
case AMDGPU::G_FPOW:
case AMDGPU::G_FPEXT:
case AMDGPU::G_FLOG:
case AMDGPU::G_FLOG2:
case AMDGPU::G_FLOG10:
case AMDGPU::G_FPTRUNC:
case AMDGPU::G_AMDGPU_RCP_IFLAG:
case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
case AMDGPU::G_FNEG:
case AMDGPU::G_FABS:
case AMDGPU::G_FCOPYSIGN:
case AMDGPU::G_FMINNUM:
case AMDGPU::G_FMAXNUM:
case AMDGPU::G_FMINNUM_IEEE:
case AMDGPU::G_FMAXNUM_IEEE:
case AMDGPU::G_FMINIMUM:
case AMDGPU::G_FMAXIMUM: {
case AMDGPU::G_BUILD_VECTOR:
case AMDGPU::G_INTRINSIC:
case AMDGPU::G_INTRINSIC_CONVERGENT:
  case Intrinsic::amdgcn_fmul_legacy:
  case Intrinsic::amdgcn_fmad_ftz:
  case Intrinsic::amdgcn_sqrt:
  case Intrinsic::amdgcn_fmed3:
  case Intrinsic::amdgcn_sin:
  case Intrinsic::amdgcn_cos:
  case Intrinsic::amdgcn_log:
  case Intrinsic::amdgcn_exp2:
  case Intrinsic::amdgcn_log_clamp:
  case Intrinsic::amdgcn_rcp:
  case Intrinsic::amdgcn_rcp_legacy:
  case Intrinsic::amdgcn_rsq:
  case Intrinsic::amdgcn_rsq_clamp:
  case Intrinsic::amdgcn_rsq_legacy:
  case Intrinsic::amdgcn_div_scale:
  case Intrinsic::amdgcn_div_fmas:
  case Intrinsic::amdgcn_div_fixup:
  case Intrinsic::amdgcn_fract:
  case Intrinsic::amdgcn_cvt_pkrtz:
  case Intrinsic::amdgcn_cubeid:
  case Intrinsic::amdgcn_cubema:
  case Intrinsic::amdgcn_cubesc:
  case Intrinsic::amdgcn_cubetc:
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_fdot2:
  case Intrinsic::amdgcn_trig_preop:
SDValue SITargetLowering::getCanonicalConstantFP(
if (C.isDenormal()) {
if (C.isSignaling()) {
return Op.isUndef() || isa<ConstantFPSDNode>(Op);
SDValue SITargetLowering::performFCanonicalizeCombine(
    DAGCombinerInfo &DCI) const {
EVT VT = N->getValueType(0);
EVT VT = N->getValueType(0);
return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
EVT EltVT = Lo.getValueType();
for (unsigned I = 0; I != 2; ++I) {
  NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
} else if (Op.isUndef()) {
if (isa<ConstantFPSDNode>(NewElts[1]))
NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
if (!MinK || !MaxK)
if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
  return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
if (Info->getMode().DX10Clamp) {
if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
return (VT == MVT::f32 || VT == MVT::f16) && Subtarget.hasIEEEMinMax3();
return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
DAGCombinerInfo &DCI) const {
EVT VT = N->getValueType(0);
unsigned Opc = N->getOpcode();
N->getValueType(0),
N->getValueType(0),
if (SDValue Med3 = performIntMed3ImmCombine(
if (SDValue Med3 = performIntMed3ImmCombine(
if (SDValue Med3 = performIntMed3ImmCombine(
if (SDValue Med3 = performIntMed3ImmCombine(
(VT == MVT::f32 || VT == MVT::f64 ||
if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
       (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
DAGCombinerInfo &DCI) const {
EVT VT = N->getValueType(0);
if (Info->getMode().DX10Clamp) {
  if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
  if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
  if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
DAGCombinerInfo &DCI) const {
return DCI.DAG.getUNDEF(N->getValueType(0));
bool IsDivergentIdx,
unsigned VecSize = EltSize * NumElem;
if (VecSize <= 64 && EltSize < 32)
if (IsDivergentIdx)
unsigned NumInsts = NumElem + ((EltSize + 31) / 32) * NumElem;
return NumInsts <= 16;
return NumInsts <= 15;
if (isa<ConstantSDNode>(Idx))
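// Commentary (inferred, not in the original source): expanding a
// dynamically indexed extract/insert costs roughly one compare plus one
// conditional move per 32-bit chunk of every element, which is the
// NumInsts = NumElem + ((EltSize + 31) / 32) * NumElem estimate above;
// the 16/15 thresholds bound when that inline expansion is still
// considered profitable.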
SDValue SITargetLowering::performExtractVectorEltCombine(
    SDNode *N, DAGCombinerInfo &DCI) const {
EVT ResVT = N->getValueType(0);
if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
DCI.AddToWorklist(Elt0.getNode());
DCI.AddToWorklist(Elt1.getNode());
if (!DCI.isBeforeLegalize())
auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
    VecSize > 32 && VecSize % 32 == 0 && Idx) {
  unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
  unsigned EltIdx = BitIndex / 32;
  unsigned LeftoverBitIdx = BitIndex % 32;
  DCI.AddToWorklist(Cast.getNode());
  DCI.AddToWorklist(Elt.getNode());
  DCI.AddToWorklist(Srl.getNode());
  DCI.AddToWorklist(Trunc.getNode());
  if (VecEltVT == ResVT) {
SDValue
SITargetLowering::performInsertVectorEltCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
EVT IdxVT = Idx.getValueType();
Src.getOperand(0).getValueType() == MVT::f16) {
  return Src.getOperand(0);
if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
  APFloat Val = CFP->getValueAPF();
  bool LosesInfo = true;
DAGCombinerInfo &DCI) const {
"combine only useful on gfx8");
SDValue TruncSrc = N->getOperand(0);
EVT VT = N->getValueType(0);
if (VT != MVT::f16)
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
                                          const SDNode *N1) const {
if (((VT == MVT::f32 &&
     (VT == MVT::f16 && Subtarget->hasMadF16() &&
EVT VT = N->getValueType(0);
if (VT != MVT::i32 && VT != MVT::i64)
unsigned Opc = N->getOpcode();
return DAG.getNode(Opc, SL, VT, Add1, Op2);
DAGCombinerInfo &DCI) const {
EVT VT = N->getValueType(0);
if (!N->isDivergent() && Subtarget->hasSMulHi())
if (NumBits <= 32 || NumBits > 64)
unsigned NumUsers = 0;
bool MulSignedLo = false;
if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
if (VT != MVT::i64) {
getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
  std::tie(AccumLo, AccumHi) = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
  if (!MulLHSUnsigned32) {
  if (!MulRHSUnsigned32) {
if (VT != MVT::i64)
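// Commentary (inferred, not in the original source): this fold rewrites
// add(mul(x, y), z) whose product needs more than 32 bits into the
// MAD_64_32 node: V_MAD_U64_U32/V_MAD_I64_I32 multiply 32-bit sources into
// a 64-bit accumulator, with MulSignedLo selecting the signed variant.
// The follow-up fix-ups above add the cross products into the high half
// (AccumHi) when an operand is not known to fit in 32 unsigned bits.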
13787static std::optional<ByteProvider<SDValue>>
13790 if (!Byte0 || Byte0->isConstantZero()) {
13791 return std::nullopt;
13794 if (Byte1 && !Byte1->isConstantZero()) {
13795 return std::nullopt;
  unsigned FirstCs = First & 0x0c0c0c0c;
  unsigned SecondCs = Second & 0x0c0c0c0c;
  unsigned FirstNoCs = First & ~0x0c0c0c0c;
  unsigned SecondNoCs = Second & ~0x0c0c0c0c;

  assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
  assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
  assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
  assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));

  return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
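// A v_perm_b32 selector byte of 0-7 picks a source byte while 0x0c yields a
// constant 0, so 0x0c lanes act as "unset" here. Worked example:
// addPermMasks(0x0c0c0400, 0x04000c0c) == 0x04000400 -- each result lane
// keeps whichever selector is not 0x0c, and for the 0-3 selectors produced
// in this combine the asserts check that at least one of the two masks
// leaves every lane unset.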
  for (int BPI = 0; BPI < 2; BPI++) {
      BPP = {Src1, Src0};
    unsigned ZeroMask = 0x0c0c0c0c;
    unsigned FMask = 0xFF << (8 * (3 - Step));

    unsigned FirstMask =
        (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
    unsigned SecondMask =
        (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
    int FirstGroup = -1;
    for (int I = 0; I < 2; I++) {
      auto MatchesFirst = [&BPP](DotSrc &IterElt) {
        return IterElt.SrcOp == *BPP.first.Src &&
               (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
    if (FirstGroup != -1) {
      auto MatchesSecond = [&BPP](DotSrc &IterElt) {
        return IterElt.SrcOp == *BPP.second.Src &&
               (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
      Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
  unsigned ZeroMask = 0x0c0c0c0c;
  unsigned FMask = 0xFF << (8 * (3 - Step));
      ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
      ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
  if (Srcs.size() == 1) {
    auto Elt = Srcs.begin();
    if (Elt->PermMask == 0x3020100)
  auto FirstElt = Srcs.begin();
  auto SecondElt = std::next(FirstElt);
    auto FirstMask = FirstElt->PermMask;
    auto SecondMask = SecondElt->PermMask;

    unsigned FirstCs = FirstMask & 0x0c0c0c0c;
    unsigned FirstPlusFour = FirstMask | 0x04040404;
    FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
    FirstElt = std::next(SecondElt);
    if (FirstElt == Srcs.end())
    SecondElt = std::next(FirstElt);
    if (SecondElt == Srcs.end()) {
          DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
  return Perms.size() == 2
  for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
    EntryMask = EntryMask >> ((4 - ChainLength) * 8);
    auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
    EntryMask += ZeroMask;
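// fixMasks example: after a chain of length 2 only the two high selector
// bytes of each PermMask are populated, so a merged mask of 0x01000c0c is
// shifted right to 0x00000100 and then topped up with 0x0c0c0000, giving
// 0x0c0c0100 -- source bytes 1 and 0 in the low lanes, zeros above them.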
  auto Opcode = Op.getOpcode();
static std::optional<bool>
  bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
  bool S1IsSigned = Known1.countMinLeadingOnes() > 0;

  assert(!(S0IsUnsigned && S0IsSigned));
  assert(!(S1IsUnsigned && S1IsSigned));

  if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
  if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
    return std::nullopt;
  if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
      ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
  if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
  if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
      ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
    return std::nullopt;
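// Net effect: when both operands' known bits agree on a signedness the dot4
// combine picks the matching signed/unsigned intrinsic, while conflicting
// evidence -- one operand known signed and the other known unsigned, or an
// unsigned operand paired with an unknown one -- gives up with std::nullopt
// rather than guess.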
                                            DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);

  if (SDValue Folded = tryFoldToMad64_32(N, DCI))
  if (SDValue V = reassociateScalarOps(N, DAG)) {
  std::optional<bool> IsSigned;
  int ChainLength = 0;
  for (int I = 0; I < 4; I++) {
    auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
    auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
    auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
        TempNode->getOperand(MulIdx), *Src0, *Src1,
        TempNode->getOperand(MulIdx)->getOperand(0),
        TempNode->getOperand(MulIdx)->getOperand(1), DAG);
      IsSigned = *IterIsSigned;
    if (*IterIsSigned != *IsSigned)
    auto AddIdx = 1 - MulIdx;
    if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
      Src2s.push_back(TempNode->getOperand(AddIdx));
          TempNode->getOperand(AddIdx), *Src0, *Src1,
          TempNode->getOperand(AddIdx)->getOperand(0),
          TempNode->getOperand(AddIdx)->getOperand(1), DAG);
      if (*IterIsSigned != *IsSigned)
      ChainLength = I + 2;
    TempNode = TempNode->getOperand(AddIdx);
    ChainLength = I + 1;
    if (TempNode->getNumOperands() < 2)
    LHS = TempNode->getOperand(0);
    RHS = TempNode->getOperand(1);
  if (ChainLength < 2)
  if (ChainLength < 4) {
  bool UseOriginalSrc = false;
  if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
      Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
      Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
      Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
    auto Src0Mask = Src0s.begin()->PermMask;
    SrcBytes.push_back(Src0Mask & 0xFF000000);
    bool UniqueEntries = true;
    for (auto I = 1; I < 4; I++) {
      auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
        UniqueEntries = false;
    if (UniqueEntries) {
      UseOriginalSrc = true;
      auto FirstElt = Src0s.begin();
      auto SecondElt = Src1s.begin();
                        SecondElt->DWordOffset);
  if (!UseOriginalSrc) {
      DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
                                 : Intrinsic::amdgcn_udot4,
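// End-to-end example of this combine: for i32 values A and B,
//   (sext i8 A[0])*(sext i8 B[0]) + ... + (sext i8 A[3])*(sext i8 B[3]) + C
// collapses into a single llvm.amdgcn.sdot4(A, B, C) (v_dot4_i32_i8); the
// unsigned variant selects llvm.amdgcn.udot4 instead.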
  if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
  unsigned Opc = LHS.getOpcode();
    Opc = RHS.getOpcode();
    auto Cond = RHS.getOperand(0);
    return DAG.getNode(Opc, SL, VTList, Args);
                                            DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);

  if (VT != MVT::i32)
  unsigned Opc = RHS.getOpcode();
    auto Cond = RHS.getOperand(0);
    return DAG.getNode(Opc, SL, VTList, Args);
SDValue SITargetLowering::performAddCarrySubCarryCombine(
    SDNode *N, DAGCombinerInfo &DCI) const {
  if (N->getValueType(0) != MVT::i32)
  unsigned LHSOpc = LHS.getOpcode();
  unsigned Opc = N->getOpcode();
                                             DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);

    if (A == LHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
      if (FusedOp != 0) {
        return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
    if (A == RHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
      if (FusedOp != 0) {
        return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
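    // i.e. fadd (fadd x, x), y --> fma/fmad(x, 2.0, y) whenever
    // getFusedOpcode decides a fused multiply-add is safe and profitable,
    // turning the doubled operand into a multiply by the constant 2.0.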
                                             DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);

    if (A == LHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
        return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
    if (A == RHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
        return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
                                             DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);

  bool IsNegative = false;
  if (CLHS->isExactlyValue(1.0) ||
      (IsNegative = CLHS->isExactlyValue(-1.0))) {
                                            DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);

      (N->getFlags().hasAllowContract() &&
       FMA->getFlags().hasAllowContract())) {
    if (Vec1 == Vec2 || Vec3 == Vec4)
    if ((Vec1 == Vec3 && Vec2 == Vec4) ||
        (Vec1 == Vec4 && Vec2 == Vec3)) {
                                              DAGCombinerInfo &DCI) const {
  EVT VT = LHS.getValueType();

  auto CRHS = dyn_cast<ConstantSDNode>(RHS);
    CRHS = dyn_cast<ConstantSDNode>(LHS);
      return LHS.getOperand(0);
      isa<ConstantSDNode>(LHS.getOperand(1)) &&
      isa<ConstantSDNode>(LHS.getOperand(2)) &&
      LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
    const APInt &CT = LHS.getConstantOperandAPInt(1);
    const APInt &CF = LHS.getConstantOperandAPInt(2);
      return LHS.getOperand(0);
  if (VT != MVT::f32 && VT != MVT::f64 &&
                                                     DAGCombinerInfo &DCI) const {
  if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
    unsigned ShiftOffset = 8 * Offset;
      ShiftOffset -= C->getZExtValue();
      ShiftOffset += C->getZExtValue();
    if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
                         MVT::f32, Shifted);
    DCI.AddToWorklist(N);
    return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
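// Shift-folding example: cvt_f32_ubyte0 (srl x, 16) selects bits 23:16 of x,
// which is exactly cvt_f32_ubyte2 x, so the shift disappears; shifts that are
// not a whole number of bytes (ShiftOffset % 8 != 0) are left alone.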
                                              DAGCombinerInfo &DCI) const {
      return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
    APFloat One(F.getSemantics(), "1.0");
      return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
  switch (N->getOpcode()) {
    return performAddCombine(N, DCI);
    return performSubCombine(N, DCI);
    return performAddCarrySubCarryCombine(N, DCI);
    return performFAddCombine(N, DCI);
    return performFSubCombine(N, DCI);
    return performFDivCombine(N, DCI);
    return performSetCCCombine(N, DCI);
    return performMinMaxCombine(N, DCI);
    return performFMACombine(N, DCI);
    return performAndCombine(N, DCI);
    return performOrCombine(N, DCI);
    if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
        TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
    return performXorCombine(N, DCI);
    return performZeroExtendCombine(N, DCI);
    return performSignExtendInRegCombine(N, DCI);
    return performClassCombine(N, DCI);
    return performFCanonicalizeCombine(N, DCI);
    return performRcpCombine(N, DCI);
    return performUCharToFloatCombine(N, DCI);
    return performFCopySignCombine(N, DCI);
    return performCvtF32UByteNCombine(N, DCI);
    return performFMed3Combine(N, DCI);
    return performCvtPkRTZCombine(N, DCI);
    return performClampCombine(N, DCI);
    EVT VT = N->getValueType(0);
    if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
      EVT EltVT = Src.getValueType();
      if (EltVT != MVT::i16)
    return performExtractVectorEltCombine(N, DCI);
    return performInsertVectorEltCombine(N, DCI);
    return performFPRoundCombine(N, DCI);
    if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
    if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
      return performMemSDNodeCombine(MemNode, DCI);
  default:
    return ~0u;
  case AMDGPU::sub0:
    return 0;
  case AMDGPU::sub1:
    return 1;
  case AMDGPU::sub2:
    return 2;
  case AMDGPU::sub3:
    return 3;
  case AMDGPU::sub4:
    return 4;
  unsigned Opcode = Node->getMachineOpcode();

  if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
  unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
  unsigned NewDmask = 0;
  bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
                  (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
  unsigned TFCLane = 0;
  bool HasChain = Node->getNumValues() > 1;

  if (OldDmask == 0) {
    TFCLane = OldBitsSet;
    if (I.getUse().getResNo() != 0)
    if (!I->isMachineOpcode() ||
        I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
    if (UsesTFC && Lane == TFCLane) {
      for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
        Dmask &= ~(1 << Comp);
      NewDmask |= 1 << Comp;
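      // Dmask example: an image sample with OldDmask == 0b1111 whose users
      // only extract sub0 (x) and sub2 (z) yields NewDmask == 0b0101, so the
      // rewritten MIMG writes two registers and the z user is remapped from
      // lane 2 of the old result to lane 1 of the new one.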
  bool NoChannels = !NewDmask;
    if (OldBitsSet == 1)
  if (NewDmask == OldDmask)
  unsigned NewChannels = BitsSet + UsesTFC;

  assert(NewOpcode != -1 &&
         NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
         "failed to find equivalent MIMG op");
  MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
  MVT ResultVT = NewChannels == 1 ?
                                          NewChannels == 5 ? 8 : NewChannels);
  if (NewChannels == 1) {
  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
    if (i || !NoChannels)
    if (NewUser != User) {
    case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
    case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
    case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
    case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
    Op = Op.getOperand(0);
  return isa<FrameIndexSDNode>(Op);
  RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
  SDValue SrcVal = Node->getOperand(2);
      MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
  SDNode *Glued = Node->getGluedNode();
      = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
  return ToResultReg.getNode();
  for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
                               Node->getOperand(i).getValueType(),
                               Node->getOperand(i)), 0));
  unsigned Opcode = Node->getMachineOpcode();

  if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
      !TII->isGather4(Opcode) &&
    return adjustWritemask(Node, DAG);

  if (Opcode == AMDGPU::INSERT_SUBREG ||
      Opcode == AMDGPU::REG_SEQUENCE) {
  case AMDGPU::V_DIV_SCALE_F32_e64:
  case AMDGPU::V_DIV_SCALE_F64_e64: {
    SDValue Src0 = Node->getOperand(1);
    SDValue Src1 = Node->getOperand(3);
    SDValue Src2 = Node->getOperand(5);
        (Src0 == Src1 || Src0 == Src2))
  unsigned InitIdx = 0;

  if (TII->isImage(MI)) {
    unsigned TFEVal = TFE ? TFE->getImm() : 0;
    unsigned LWEVal = LWE ? LWE->getImm() : 0;
    unsigned D16Val = D16 ? D16->getImm() : 0;

    if (!TFEVal && !LWEVal)
    assert(MO_Dmask && "Expected dmask operand in instruction");
    unsigned dmask = MO_Dmask->getImm();
    InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
        TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
    if (DstSize < InitIdx)
    InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
  Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
  unsigned NewDst = 0;
  for (; SizeLeft; SizeLeft--, CurrIdx++) {
    NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
  MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
  if (TII->isVOP3(MI.getOpcode())) {
    TII->legalizeOperandsVOP3(MRI, MI);
  if (!MI.getDesc().operands().empty()) {
    unsigned Opc = MI.getOpcode();
    bool HasAGPRs = Info->mayNeedAGPRs();
      if ((I == Src2Idx) && (HasAGPRs))
      if (!Op.isReg() || !Op.getReg().isVirtual())
      auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
      if (!TRI->hasAGPRs(RC))
      auto *Src = MRI.getUniqueVRegDef(Op.getReg());
      if (!Src || !Src->isCopy() ||
          !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
      auto *NewRC = TRI->getEquivalentVGPRClass(RC);
      MRI.setRegClass(Op.getReg(), NewRC);
    if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
      if (Src2->isReg() && Src2->getReg().isVirtual()) {
        auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
        if (TRI->isVectorSuperClass(RC)) {
          auto *NewRC = TRI->getEquivalentAGPRClass(RC);
          MRI.setRegClass(Src2->getReg(), NewRC);
          if (Src2->isTied())
            MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
  if (TII->isImage(MI))
    TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
                                         MVT::v2i32, Ops0), 0);
                  RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
std::pair<unsigned, const TargetRegisterClass *>
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
      RC = &AMDGPU::SReg_32RegClass;
      RC = &AMDGPU::SGPR_64RegClass;
        return std::pair(0U, nullptr);
      RC = &AMDGPU::VGPR_32RegClass;
        return std::pair(0U, nullptr);
      RC = &AMDGPU::AGPR_32RegClass;
        return std::pair(0U, nullptr);
    return std::pair(0U, RC);
  if (RegName.consume_front("v")) {
    RC = &AMDGPU::VGPR_32RegClass;
  } else if (RegName.consume_front("s")) {
    RC = &AMDGPU::SGPR_32RegClass;
  } else if (RegName.consume_front("a")) {
    RC = &AMDGPU::AGPR_32RegClass;
  if (RegName.consume_front("[")) {
    RC = TRI->getVGPRClassForBitWidth(Width);
    RC = TRI->getSGPRClassForBitWidth(Width);
    RC = TRI->getAGPRClassForBitWidth(Width);
    Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
    return std::pair(Reg, RC);
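// Constraint parsing example: "{v[8:9]}" strips the leading "v", sees "[",
// reads the 8-9 range (two 32-bit lanes), asks for the 64-bit VGPR class,
// and returns the v[8:9] super-register paired with that class.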
  if (!Failed && Idx < RC->getNumRegs())
  Ret.second = TRI->getPhysRegBaseClass(Ret.first);
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
  } else if (Constraint == "DA" ||
             Constraint == "DB") {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
  Val = Val & maskTrailingOnes<uint64_t>(Size);
                                                    std::vector<SDValue> &Ops,
  unsigned Size = Op.getScalarValueSizeInBits();
    Val = C->getSExtValue();
    Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
    if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
      Val = C->getSExtValue();
      Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
      return isInt<16>(Val);
      return isInt<32>(Val);
  } else if (Constraint.size() == 2) {
    if (Constraint == "DA") {
      int64_t HiBits = static_cast<int32_t>(Val >> 32);
      int64_t LoBits = static_cast<int32_t>(Val);
    if (Constraint == "DB") {
                                              unsigned MaxSize) const {
  unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
  MVT VT = Op.getSimpleValueType();
  switch (UnalignedClassID) {
  case AMDGPU::VReg_64RegClassID:
    return AMDGPU::VReg_64_Align2RegClassID;
  case AMDGPU::VReg_96RegClassID:
    return AMDGPU::VReg_96_Align2RegClassID;
  case AMDGPU::VReg_128RegClassID:
    return AMDGPU::VReg_128_Align2RegClassID;
  case AMDGPU::VReg_160RegClassID:
    return AMDGPU::VReg_160_Align2RegClassID;
  case AMDGPU::VReg_192RegClassID:
    return AMDGPU::VReg_192_Align2RegClassID;
  case AMDGPU::VReg_224RegClassID:
    return AMDGPU::VReg_224_Align2RegClassID;
  case AMDGPU::VReg_256RegClassID:
    return AMDGPU::VReg_256_Align2RegClassID;
  case AMDGPU::VReg_288RegClassID:
    return AMDGPU::VReg_288_Align2RegClassID;
  case AMDGPU::VReg_320RegClassID:
    return AMDGPU::VReg_320_Align2RegClassID;
  case AMDGPU::VReg_352RegClassID:
    return AMDGPU::VReg_352_Align2RegClassID;
  case AMDGPU::VReg_384RegClassID:
    return AMDGPU::VReg_384_Align2RegClassID;
  case AMDGPU::VReg_512RegClassID:
    return AMDGPU::VReg_512_Align2RegClassID;
  case AMDGPU::VReg_1024RegClassID:
    return AMDGPU::VReg_1024_Align2RegClassID;
  case AMDGPU::AReg_64RegClassID:
    return AMDGPU::AReg_64_Align2RegClassID;
  case AMDGPU::AReg_96RegClassID:
    return AMDGPU::AReg_96_Align2RegClassID;
  case AMDGPU::AReg_128RegClassID:
    return AMDGPU::AReg_128_Align2RegClassID;
  case AMDGPU::AReg_160RegClassID:
    return AMDGPU::AReg_160_Align2RegClassID;
  case AMDGPU::AReg_192RegClassID:
    return AMDGPU::AReg_192_Align2RegClassID;
  case AMDGPU::AReg_256RegClassID:
    return AMDGPU::AReg_256_Align2RegClassID;
  case AMDGPU::AReg_512RegClassID:
    return AMDGPU::AReg_512_Align2RegClassID;
  case AMDGPU::AReg_1024RegClassID:
    return AMDGPU::AReg_1024_Align2RegClassID;
  if (Info->isEntryFunction()) {
    unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
                        ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
                        : TRI->getAlignedHighSGPRForRC(MF, 2,
                                                       &AMDGPU::SGPR_64RegClass);
    Info->setSGPRForEXECCopy(SReg);
                                     Info->getStackPtrOffsetReg()));
    if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
      MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
    if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
      MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
    if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
      MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());

  Info->limitOccupancy(MF);

  if (ST.isWave32() && !MF.empty()) {
    for (auto &MBB : MF) {
      for (auto &MI : MBB) {
        TII->fixImplicitOperands(MI);

  if (ST.needsAlignedVGPRs()) {
    for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
      if (NewClassID != -1)
        MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
                                                    const APInt &DemandedElts,
                                                    unsigned Depth) const {
  unsigned Opc = Op.getOpcode();
    unsigned IID = Op.getConstantOperandVal(0);
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi: {
      unsigned MaxActiveBits = std::max(Src1ValBits, ST.getWavefrontSizeLog2());
      MaxActiveBits += Src1ValBits ? 1 : 0;
      unsigned Size = Op.getValueType().getSizeInBits();
      if (MaxActiveBits < Size)
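      // e.g. on wave64 an mbcnt over a zero base can only produce 0-63, so
      // every bit of the i32 result from bit 6 upward becomes known zero; a
      // nonzero base widens the bound by at most one extra carry bit.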
      Op, Known, DemandedElts, DAG, Depth);
  unsigned MaxValue =
  switch (MI->getOpcode()) {
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    case Intrinsic::amdgcn_workitem_id_x:
    case Intrinsic::amdgcn_workitem_id_y:
    case Intrinsic::amdgcn_workitem_id_z:
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi: {
      unsigned Size = MRI.getType(R).getSizeInBits();
    case Intrinsic::amdgcn_groupstaticsize: {
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
  case AMDGPU::G_AMDGPU_SMED3:
  case AMDGPU::G_AMDGPU_UMED3: {
    auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
                                                unsigned Depth) const {
  if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
    if (MaybeAlign RetAlign = Attrs.getRetAlignment())
  if (Header->getAlignment() != PrefAlign)
    return Header->getAlignment();

  unsigned LoopSize = 0;
      LoopSize += TII->getInstSizeInBytes(MI);
      if (LoopSize > 192)
  if (LoopSize <= 64)
  if (LoopSize <= 128)
    return CacheLineAlign;
    auto I = Exit->getFirstNonDebugInstr();
    if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
      return CacheLineAlign;
    if (PreTerm == Pre->begin() ||
        std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
  auto ExitHead = Exit->getFirstNonDebugInstr();
  if (ExitHead == Exit->end() ||
      ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
  return CacheLineAlign;
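// Size buckets, roughly: loops at or under 64 bytes keep the preferred
// alignment, loops up to 128 bytes get cache-line (64-byte) alignment, and
// loops up to 192 bytes additionally get S_INST_PREFETCH instructions placed
// around them before being cache-line aligned; anything larger is not worth
// the padding.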
    N = N->getOperand(0).getNode();
  switch (N->getOpcode()) {
    if (Reg.isPhysical() || MRI.isLiveIn(Reg))
      return !TRI->isSGPRReg(MRI, Reg);
    return !TRI->isSGPRReg(MRI, Reg);
    unsigned AS = L->getAddressSpace();
  if (auto *A = dyn_cast<AtomicSDNode>(N)) {
    return A->readMem() && A->writeMem();
                                      unsigned Depth) const {
  if (Info->getMode().DX10Clamp)
static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
  auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
  return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
         << "Hardware instruction generated for atomic "
         << " operation at memory scope " << MemScope;
  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
    Type *EltTy = VT->getElementType();
    return VT->getNumElements() == 2 &&
  bool HasSystemScope =
  if (HasSystemScope)
  if (HasSystemScope)
  if (HasSystemScope)
  if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
                                       : &AMDGPU::SReg_32RegClass;
  if (!TRI->isSGPRClass(RC) && !isDivergent)
    return TRI->getEquivalentSGPRClass(RC);
  if (TRI->isSGPRClass(RC) && isDivergent)
    return TRI->getEquivalentVGPRClass(RC);
                      unsigned WaveSize) {
  if (!IT || IT->getBitWidth() != WaveSize)
  if (!isa<Instruction>(V))
  if (!Visited.insert(V).second)
  bool Result = false;
  for (const auto *U : V->users()) {
    if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
      if (V == U->getOperand(1)) {
        switch (Intrinsic->getIntrinsicID()) {
        case Intrinsic::amdgcn_if_break:
        case Intrinsic::amdgcn_if:
        case Intrinsic::amdgcn_else:
      if (V == U->getOperand(0)) {
        switch (Intrinsic->getIntrinsicID()) {
        case Intrinsic::amdgcn_end_cf:
        case Intrinsic::amdgcn_loop:
      Result = hasCFUser(U, Visited, WaveSize);
                                               const Value *V) const {
  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
    if (CI->isInlineAsm()) {
      for (auto &TC : TargetConstraints) {
            SIRI, TC.ConstraintCode, TC.ConstraintVT).second;
  for (; I != E; ++I) {
    if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) {
  return MRI.hasOneNonDBGUse(N0);
  if (I.getMetadata("amdgpu.noclobber"))
  if (I.getMetadata("amdgpu.last.use"))
  if (!Def->isMachineOpcode())
  if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
    PhysReg = AMDGPU::SCC;
        TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
16514 "this cannot be replaced with add");
16520 "target should have atomic fadd instructions");
16523 "generic atomicrmw expansion only supports FP32 operand in flat "
  for (auto &P : MDs)
      {Addr}, nullptr, "is.shared");
  Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
  Value *LoadedShared = CreateNewAtomicRMW(Builder, CastToLocal, Val);
      Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
  Value *LoadedPrivate =
      Builder.CreateLoad(ValTy, CastToPrivate, "loaded.private");
  Value *LoadedGlobal = CreateNewAtomicRMW(Builder, CastToGlobal, Val);
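// The emitted control flow is roughly (in IR terms, names abbreviated):
//   %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
//   br i1 %is.shared, label %shared, label %check.private
// shared:        atomicrmw on the LDS addrspacecast of %addr
// check.private: %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
//                br i1 %is.private, label %private, label %global
// private:       plain load/op/store on the scratch cast of %addr
// global:        atomicrmw on the global cast of %addr
// with the three loaded results merged afterwards.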
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
#define LLVM_ATTRIBUTE_UNUSED
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
iv Induction Variable Users
static const unsigned MaxDepth
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
unsigned const TargetRegisterInfo * TRI
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
const char LLVMTargetMachineRef TM
const SmallVectorImpl< MachineOperand > & Cond
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isHalf2OrBFloat2(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static bool isHalf2(Type *Ty)
bool unsafeFPAtomicsDisabled(Function *F)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool isBFloat2(Type *Ty)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
static constexpr int Concat[]
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
bool hasMadMacF32Insts() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
This class represents an incoming formal argument to a Function.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Min
*p = old <signed v ? old : v
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
LLVM Basic Block Representation.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
bool isFPPredicate() const
bool isIntPredicate() const
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowerinInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
iterator_range< arg_iterator > args()
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
bool hasD16Images() const
bool hasAtomicDsPkAdd16Insts() const
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
bool hasAtomicFMinFMaxF64FlatInsts() const
bool hasDot7Insts() const
bool hasApertureRegs() const
bool hasFlatInstOffsets() const
bool hasAtomicFMinFMaxF32FlatInsts() const
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasBCNT(unsigned Size) const
bool hasMultiDwordFlatScratchAddressing() const
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
bool hasPrivEnabledTrap2NopBug() const
bool hasUnalignedDSAccessEnabled() const
const SIInstrInfo * getInstrInfo() const override
bool hasDot1Insts() const
bool hasAtomicFaddRtnInsts() const
Align getStackAlignment() const
bool hasScalarSubwordLoads() const
bool enableFlatScratch() const
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
bool supportsGetDoorbellID() const
bool hasFlatAtomicFaddF32Inst() const
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
const SIFrameLowering * getFrameLowering() const override
bool hasUnalignedScratchAccess() const
bool hasAtomicFMinFMaxF32GlobalInsts() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
bool hasGFX940Insts() const
bool hasFullRate64Ops() const
bool isTrapHandlerEnabled() const
bool hasLDSFPAtomicAddF64() const
bool hasFlatGlobalInsts() const
bool getScalarizeGlobalBehavior() const
bool hasScalarSMulU64() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
bool hasNSAEncoding() const
bool hasSMemRealTime() const
bool usePRTStrictNull() const
bool hasAtomicFMinFMaxF64GlobalInsts() const
bool hasAtomicFlatPkAdd16Insts() const
bool hasUnalignedBufferAccessEnabled() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasImageGather4D16Bug() const
bool supportsMinMaxDenormModes() const
bool hasAtomicFaddInsts() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const
bool hasAtomicBufferPkAddBF16Inst() const
bool hasAtomicFaddNoRtnInsts() const
bool hasFlatBufferGlobalAtomicFaddF64Inst() const
bool hasScalarDwordx3Loads() const
bool hasLDSFPAtomicAddF32() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
bool hasDot8Insts() const
bool hasDS96AndDS128() const
bool useFlatForGlobal() const
Generation getGeneration() const
bool hasAtomicBufferGlobalPkAddF16Insts() const
bool hasScalarAddSub64() const
bool hasIEEEMinMax3() const
bool hasUnpackedD16VMem() const
bool hasAtomicGlobalPkAddBF16Inst() const
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
bool hasPackedTID() const
bool hasAddNoCarry() const
bool hasGWSAutoReplay() const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
int64_t getOffset() const
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Value * CreateFAdd(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
BasicBlock::iterator GetInsertPoint() const
BasicBlock * GetInsertBlock() const
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
LLVMContext & getContext() const
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
void getAllMetadata(SmallVectorImpl< std::pair< unsigned, MDNode * > > &MDs) const
Get all metadata attached to this Instruction.
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
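A short sketch of how these LLT factory and query methods compose (values illustrative):
LLT S32 = LLT::scalar(32);           // 32-bit scalar
LLT P0 = LLT::pointer(0, 64);        // 64-bit pointer in address space 0
assert(S32.isScalar() && S32.getScalarSizeInBits() == 32);
LLT S16 = S32.changeElementSize(16); // same shape, 16-bit element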
This is an important class for using LLVM in a threaded context.
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
void getSyncScopeNames(SmallVectorImpl< StringRef > &SSNs) const
getSyncScopeNames - Populates client supplied SmallVector with synchronization scope names registered...
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
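getOrInsertSyncScopeID and LoadInst::setAtomic are the ingredients of the idempotent-RMW-to-fenced-load rewrite declared later in this listing. A hedged sketch, assuming AI is the AtomicRMWInst being replaced (the ordering and scope name are illustrative):
IRBuilder<> Builder(AI);
LoadInst *LI = Builder.CreateAlignedLoad(AI->getType(),
                                         AI->getPointerOperand(),
                                         AI->getAlign(), "atomic.load");
SyncScope::ID SSID =
    AI->getContext().getOrInsertSyncScopeID("agent"); // illustrative scope
LI->setAtomic(AtomicOrdering::Monotonic, SSID);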
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Align getAlignment() const
Return alignment of the basic block.
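These MachineBasicBlock operations are the building blocks of the custom-inserter loops this file emits. A sketch of the usual split-and-rewire shape, assuming MI and its parent block BB come from the inserter:
MachineFunction *MF = BB->getParent();
MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
MF->insert(std::next(BB->getIterator()), LoopBB);
MF->insert(std::next(LoopBB->getIterator()), RemainderBB);
// Move everything after MI into RemainderBB and rewire the CFG.
RemainderBB->transferSuccessorsAndUpdatePHIs(BB);
RemainderBB->splice(RemainderBB->begin(), BB,
                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
BB->addSuccessor(LoopBB);
LoopBB->addSuccessor(LoopBB);       // loop backedge
LoopBB->addSuccessor(RemainderBB);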
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
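A small sketch of the BuildMI/MachineInstrBuilder chaining these entries describe; TII, BB, MI, DL, MRI, and the RemainderBB block from the split sketch above are assumed context:
Register Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), Tmp)
    .addImm(0);                 // immediate source operand
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_BRANCH))
    .addMBB(RemainderBB);       // basic-block operand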
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
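A sketch of allocating a memory operand with the flags above, e.g. for a uniform constant load; MF is an assumed MachineFunction and the parameters are illustrative:
MachineMemOperand *MMO = MF.getMachineMemOperand(
    MachinePointerInfo(AMDGPUAS::CONSTANT_ADDRESS),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
        MachineMemOperand::MOInvariant,
    LLT::scalar(32), Align(4));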
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
A Module instance is used to store all the information related to an LLVM module.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
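A short sketch of the SDValue/SDNode inspection idiom these accessors support; the predicate itself is illustrative, not from this file:
// True for a single-use (add x, C) whose constant fits a small field.
static bool isAddWithSmallImm(SDValue Op) {
  if (Op.getOpcode() != ISD::ADD || !Op.hasOneUse())
    return false;
  SDValue RHS = Op.getOperand(1);
  return isa<ConstantSDNode>(RHS) && RHS->getAsZExtVal() < 64;
}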
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns true if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool hasWorkGroupIDZ() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle a physreg-carried dependency in a target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if N can be combined with other nodes to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isMemOpUniform(const SDNode *N) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns true if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
bool getAddrModeArguments(IntrinsicInst *, SmallVectorImpl< Value * > &, Type *&) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
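Most of the SITargetLowering lowerXXX entries above plug into one dispatch. A hedged sketch of the LowerOperation shape; the real switch in this file covers far more opcodes:
SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  case ISD::GET_ROUNDING:
    return lowerGET_ROUNDING(Op, DAG);
  case ISD::PREFETCH:
    return lowerPREFETCH(Op, DAG);
  default:
    // Anything not handled here is deferred to the superclass.
    return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  }
}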
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const Pass * getPass() const
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
SDValue getRegister(unsigned Reg, EVT VT)
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
const TargetMachine & getTarget() const
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
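A sketch of chaining these SelectionDAG helpers, assuming DAG, an SDLoc DL, a Chain, and a frame index FI from the surrounding lowering:
SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
SDValue Ld = DAG.getLoad(
    MVT::i32, DL, Chain, Ptr,
    MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
SDValue Amt = DAG.getShiftAmountConstant(2, MVT::i32, DL);
SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Ld, Amt);
// Return both the computed value and the load's output chain.
SDValue Merged = DAG.getMergeValues({Shl, Ld.getValue(1)}, DL);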
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
iterator insert(iterator I, T &&Elt)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
StringRef - Represent a constant reference to a string, i.e.
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
constexpr size_t size() const
size - Get the string size.
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
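A sketch of the StringSwitch idiom, of the kind used to map register names in getRegisterByName; the set of cases here is illustrative:
Register Reg = StringSwitch<Register>(RegName)
                   .Case("m0", AMDGPU::M0)
                   .Case("exec_lo", AMDGPU::EXEC_LO)
                   .Case("exec_hi", AMDGPU::EXEC_HI)
                   .Default(Register()); // invalid register on no match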
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op; at this point, we know that only the DemandedBits bits of the result of Op are ever used downstream.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
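A hedged sketch of how a target typically pairs the alignment query with the expansion helpers above when lowering a store; the function name is hypothetical and only APIs listed here are used:
SDValue lowerStoreMaybeExpand(const SITargetLowering &TLI, SDValue Op,
                              SelectionDAG &DAG) { // hypothetical helper
  StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
  unsigned Fast = 0;
  if (!TLI.allowsMemoryAccessForAlignment(
          *DAG.getContext(), DAG.getDataLayout(), ST->getMemoryVT(),
          ST->getAddressSpace(), ST->getAlign(),
          ST->getMemOperand()->getFlags(), &Fast) ||
      !Fast)
    return TLI.expandUnalignedStore(ST, DAG);
  return SDValue(); // aligned enough: keep the original store
}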
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
A Use represents the edge between a Value definition and its users.
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< use_iterator > uses()
void takeName(Value *V)
Transfer the name from V to this value.
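These Value operations compose into the standard replace-and-erase idiom; a minimal sketch with illustrative names (OldInst an Instruction*, NewVal a Value*):
// Keep the old name, redirect all uses, then delete the dead instruction.
NewVal->takeName(OldInst);
OldInst->replaceAllUsesWith(NewVal);
OldInst->eraseFromParent();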
Type * getElementType() const
constexpr bool isZero() const
const ParentTy * getParent() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
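A sketch of dispatching on these address-space enumerators (the predicate name is hypothetical; compare isNonGlobalAddrSpace above):
static bool isLDSOrScratch(unsigned AS) {
  switch (AS) {
  case AMDGPUAS::LOCAL_ADDRESS:   // LDS
  case AMDGPUAS::REGION_ADDRESS:  // GDS
  case AMDGPUAS::PRIVATE_ADDRESS: // scratch
    return true;
  default:
    return false;
  }
}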
@ BUFFER_ATOMIC_COND_SUB_U32
@ TBUFFER_LOAD_FORMAT_D16
@ TBUFFER_STORE_FORMAT_D16
@ BUFFER_STORE_FORMAT_D16
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
@ TowardZeroF32_TowardNegativeF64
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
@ C
The default llvm calling convention, compatible with C.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to the "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store instruction.
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic function with side effects that does not return a result.
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b. this is still a strong cmpxchg operation, so Success == (Val == cmp).
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter) to floating point.
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length and element type, this produces a concatenated vector result value, with length equal to the sum of the lengths of the input vectors.
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-precision (16-bit) floating-point values.
@ FPTRUNC_ROUND
FPTRUNC_ROUND - This corresponds to the fptrunc_round intrinsic.
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to memory with one type and loaded from the same address with the other type.
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SET_ROUNDING
Set rounding mode.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the resultant vector type; the top elements 1 to N-1 of the N-element vector are undefined.
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width (W).
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to the "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant, which is required to be operand #1) half of the integer or float value specified as operand #0.
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defined outside of the scope of this SelectionDAG.
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined; 0 Round to 0; 1 Round to nearest, ties to even; 2 Round to +inf; 3 Round to -inf; 4 Round to nearest, ties away from zero.
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of type iN that is the high N bits of the full 2N-bit product.
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values, following the IEEE-754 2008 definition.
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially variable) element number IDX.
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value, and a value.
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) based on the boolean result of comparing the LHS and RHS (ops #0 and #1) with the condition code in op #4.
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
@ SMULO
Same as [SU]ADDO, but for multiplication.
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in a large integer register (e.g. sign extending the low 8 bits of a 32-bit register).
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the result VT.
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0.0.
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic function with no side effects.
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the destination VT.
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero or sign extended from a narrower type.
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W).
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target intrinsic function with side effects that returns a result.
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified, possibly variable, elements.
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
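For example, since (X < Y) is the same predicate as (Y > X), swapping operands maps SETLT to SETGT. A minimal sketch (CondCode and this helper are declared in llvm/CodeGen/ISDOpcodes.h):

#include "llvm/CodeGen/ISDOpcodes.h"
#include <cassert>

void swappedCondCodeExamples() {
  using namespace llvm;
  assert(ISD::getSetCCSwappedOperands(ISD::SETLT) == ISD::SETGT);
  assert(ISD::getSetCCSwappedOperands(ISD::SETULE) == ISD::SETUGE);
  assert(ISD::getSetCCSwappedOperands(ISD::SETEQ) == ISD::SETEQ); // symmetric
}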
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
StringRef getName(ID id)
Return the LLVM name for an intrinsic, such as "llvm.ppc.altivec.lvx".
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ System
Synchronized with respect to all concurrently executing threads.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition code.
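A minimal sketch of the IR-predicate-to-ISD mapping (getICmpCondCode, and the getFCmpCondCode helper further down this list, are declared in llvm/CodeGen/Analysis.h):

#include "llvm/CodeGen/Analysis.h"
#include "llvm/IR/Instructions.h"
#include <cassert>

void condCodeMappingExamples() {
  using namespace llvm;
  assert(getICmpCondCode(ICmpInst::ICMP_SLT) == ISD::SETLT);
  assert(getICmpCondCode(ICmpInst::ICMP_UGE) == ISD::SETUGE);
  assert(getFCmpCondCode(FCmpInst::FCMP_OLT) == ISD::SETOLT);
}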
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions starting from FirstMI to LastMI.
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
int popcount(T Value) noexcept
Count the number of set bits in a value.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
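A minimal sketch of the builder idiom; the opcode, destination register, and insertion point are assumed to come from surrounding context, so this is illustrative rather than code from this file:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/DebugLoc.h"

// Insert "DstReg = <MovOpc> 0" before I in MBB.
static void emitMoveZero(llvm::MachineBasicBlock &MBB,
                         llvm::MachineBasicBlock::iterator I,
                         const llvm::TargetInstrInfo &TII, unsigned MovOpc,
                         llvm::Register DstReg, const llvm::DebugLoc &DL) {
  llvm::BuildMI(MBB, I, DL, TII.get(MovOpc), DstReg).addImm(0);
}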
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64-bit version).
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the start of the function to the load.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant, stopping at the first 1.
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point condition code.
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
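Many of the bit and integer helpers in this list are small pure functions, so their behavior is easy to pin down with a self-contained tour (headers: llvm/ADT/bit.h and llvm/Support/MathExtras.h):

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

void mathHelperExamples() {
  using namespace llvm;
  assert(popcount(0xF0u) == 4);           // four set bits
  assert(countr_zero(0x8u) == 3);         // 0b1000: three trailing zeros
  assert(countl_zero(uint32_t(1)) == 31); // 31 leading zeros in a 32-bit 1
  assert(Log2_32(32) == 5);
  assert(isPowerOf2_32(64) && !isPowerOf2_32(48));
  assert(PowerOf2Ceil(33) == 64);
  assert(Hi_32(0x123456789ABCDEF0ULL) == 0x12345678u);
  assert(Lo_32(0x123456789ABCDEF0ULL) == 0x9ABCDEF0u);
  assert(alignDown(37, 8) == 32);   // largest multiple of 8 <= 37
  assert(divideCeil(10, 4) == 3);   // ceil(10 / 4)
  assert(isShiftedMask_64(0x0FF0)); // one contiguous run of ones
  assert(maxIntN(8) == 127 && minIntN(8) == -128);
}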
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
DWARFExpression::Operation Op
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
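The range helpers scattered through this list (any_of, find_if, is_contained, append_range, drop_begin) compose as in this small sketch:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>

void rangeHelperExamples() {
  using namespace llvm;
  SmallVector<int, 8> V = {1, 2, 3, 4};
  assert(is_contained(V, 3));
  assert(any_of(V, [](int X) { return X > 3; }));
  assert(*find_if(V, [](int X) { return X % 2 == 0; }) == 2);
  SmallVector<int, 8> W;
  append_range(W, drop_begin(V)); // copies {2, 3, 4}
  assert(W.size() == 3 && W.front() == 2);
}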
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment known to hold at Offset bytes from an A-aligned address.
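A small sketch of the two alignment helpers (llvm/Support/Alignment.h):

#include "llvm/Support/Alignment.h"
#include <cassert>

void alignmentExamples() {
  using namespace llvm;
  assert(alignTo(10, Align(8)) == 16); // round a size up to the alignment
  // An 8-byte offset from a 16-byte-aligned address is only 8-byte aligned:
  assert(commonAlignment(Align(16), 8) == Align(8));
  // A 16-byte offset does not weaken 4-byte alignment:
  assert(commonAlignment(Align(4), 16) == Align(4));
}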
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static const fltSemantics & IEEEsingle() LLVM_READNONE
static constexpr roundingMode rmNearestTiesToEven
static const fltSemantics & IEEEhalf() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
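A minimal sketch of the ArgDescriptor factories; the struct lives in the backend's AMDGPUArgumentUsageInfo.h, and the SGPR and stack offset chosen here are illustrative:

#include "AMDGPUArgumentUsageInfo.h"

void argDescriptorExamples() {
  using namespace llvm;
  // An argument preloaded into a scalar register...
  ArgDescriptor InReg = ArgDescriptor::createRegister(AMDGPU::SGPR4);
  MCRegister R = InReg.getRegister(); // AMDGPU::SGPR4
  // ...or passed on the stack at a fixed offset.
  ArgDescriptor OnStack = ArgDescriptor::createStack(/*Offset=*/16);
  (void)R;
  (void)OnStack;
}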
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
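A self-contained sketch of the EVT queries above (llvm/CodeGen/ValueTypes.h):

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

void evtExamples() {
  using namespace llvm;
  LLVMContext Ctx;
  EVT V4F32 = EVT::getVectorVT(Ctx, MVT::f32, 4);
  assert(V4F32.isVector() && V4F32.getVectorNumElements() == 4);
  assert(V4F32.getScalarType() == MVT::f32);
  assert(V4F32.getFixedSizeInBits() == 128);
  EVT IntVT = V4F32.changeTypeToInteger(); // v4i32
  assert(IntVT.getVectorElementType() == MVT::i32);
  // An odd width has no MVT, so the EVT is "extended" and not byte-sized.
  EVT I17 = EVT::getIntegerVT(Ctx, 17);
  assert(!I17.isSimple() && !I17.isByteSized());
}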
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
bool isUnknown() const
Returns true if we don't know any bits.
void resetAll()
Resets the known state of all bits.
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known bits.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
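A small sketch of the KnownBits queries above (llvm/Support/KnownBits.h):

#include "llvm/Support/KnownBits.h"
#include <cassert>

void knownBitsExamples() {
  llvm::KnownBits Known(32);
  assert(Known.isUnknown());  // nothing known yet
  Known.Zero.setHighBits(24); // top 24 bits now known to be zero
  assert(Known.countMinLeadingZeros() == 24);
  assert(Known.countMaxActiveBits() == 8); // any such value fits in 8 bits
  Known.resetAll();
  assert(Known.isUnknown());
}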
This class contains a discriminated union of information about pointers in memory operands, relating them back to LLVM IR or to virtual locations (such as frame indices) that are exposed during codegen.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise, pass NaN through.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const