#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "si-lower"
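// Command-line options that tune SI lowering: loop alignment/prefetching and
// the use of indirect register addressing for divergent indexes.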
55 "amdgpu-disable-loop-alignment",
56 cl::desc(
"Do not align and prefetch loops"),
60 "amdgpu-use-divergent-register-indexing",
62 cl::desc(
"Use indirect register addressing for divergent indexes"),
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
      return AMDGPU::SGPR0 + Reg;
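// Value type lists passed to setOperationAction in the SITargetLowering
// constructor; the trailing Expand/Custom/Legal token names the action
// registered for each group.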
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},

                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},

                     {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);

                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},

                     {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
                      MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
                      MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},

                     {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
                      MVT::v3i16, MVT::v4i16, MVT::Other},

                     {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);

       {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
        MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
        MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
        MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
        MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
        MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
        MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
        MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
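// Per-element-count loops over the 64-bit element vector types, from
// v2i64/v2f64 up through v16i64/v16f64.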
  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {

  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {

  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {

  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {

  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {

                     {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},

                     {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},

                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);

                     {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},

                     {MVT::f32, MVT::f64}, Legal);

                {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
                 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
                 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {

                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

       {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
        MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {

                       {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
                        MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},

    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})

    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})

                       {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},

                       {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
                        MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                        MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
                        MVT::v32f16, MVT::v32bf16},

                     {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);

                     {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

                     {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
                      MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,

                     {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
                      MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
                      MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
                      MVT::i16, MVT::bf16, MVT::i8, MVT::i128},

                     {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
                      MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
                      MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},

  static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
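// Combiner queries: an FP extension can fold into a mixed-precision
// multiply-add only when the subtarget has the mad_mix/fma_mix instructions.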
                                        EVT DestVT, EVT SrcVT) const {

                                        LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&

      return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
    return VT.isInteger() ? MVT::i32 : MVT::f32;

      return (NumElts + 1) / 2;
    return NumElts * ((Size + 31) / 32);
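// Calling-convention type breakdown: pairs of 16-bit elements are packed into
// one 32-bit register (i32/v2i16/v2f16/v2bf16), and wider scalars are split
// into 32-bit pieces.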
    EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {

    if (ScalarVT == MVT::bf16) {
      RegisterVT = MVT::i32;
      IntermediateVT = MVT::v2bf16;

      RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
      IntermediateVT = RegisterVT;

    NumIntermediates = (NumElts + 1) / 2;
    return NumIntermediates;

    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts;
    return NumIntermediates;

  if (Size < 16 && Subtarget->has16BitInsts()) {
    RegisterVT = MVT::i16;
    IntermediateVT = ScalarVT;
    NumIntermediates = NumElts;
    return NumIntermediates;

      RegisterVT = MVT::i32;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

      RegisterVT = MVT::i32;
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts * ((Size + 31) / 32);
      return NumIntermediates;

      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);

                                          unsigned MaxNumLanes) {
  assert(MaxNumLanes != 0);

  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());

                                          unsigned MaxNumLanes) {
  auto *ST = dyn_cast<StructType>(Ty);

  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));

         DL.getPointerSizeInBits(AS) == 192)

        DL.getPointerSizeInBits(AS) == 160) ||
       DL.getPointerSizeInBits(AS) == 192))
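// getTgtMemIntrinsic: describe the memory behavior of AMDGPU memory
// intrinsics (buffer, image, DS, GWS, and flat/global atomics) so that
// MachineMemOperands can be attached to them.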
                                          unsigned IntrID) const {
  if (CI.hasMetadata(LLVMContext::MD_invariant_load))

  if (RsrcIntr->IsImage) {

    if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
      Info.ptrVal = RsrcArg;

    if (RsrcIntr->IsImage) {
      unsigned MaxNumLanes = 4;

          std::numeric_limits<unsigned>::max());

      if (RsrcIntr->IsImage) {
        unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();

    if (RsrcIntr->IsImage && BaseOpcode->NoReturn) {
      Info.memVT = MVT::i32;

  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();

  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {

        std::numeric_limits<unsigned>::max());

  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {

  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.ptrVal = nullptr;

  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {

  case Intrinsic::amdgcn_global_atomic_csub: {

  case Intrinsic::amdgcn_image_bvh_intersect_ray: {

  case Intrinsic::amdgcn_global_atomic_fadd:
  case Intrinsic::amdgcn_global_atomic_fmin:
  case Intrinsic::amdgcn_global_atomic_fmax:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {

  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128: {

  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
    Info.memVT = MVT::i32;
    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)

  case Intrinsic::amdgcn_global_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();

  case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
    Info.memVT = MVT::i32;

  case Intrinsic::amdgcn_addrspacecast_nonnull: {
    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();
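// getAddrModeArguments: report the pointer operand and access type of
// address-taking intrinsics so addressing-mode matching can look through them.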
                                            Type *&AccessTy) const {
  switch (II->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_csub:
  case Intrinsic::amdgcn_global_atomic_fadd:
  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_global_atomic_fmax:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fmin:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
    Ptr = II->getArgOperand(0);
  case Intrinsic::amdgcn_global_load_lds:
    Ptr = II->getArgOperand(1);

  AccessTy = II->getType();
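// Addressing-mode legality: FLAT addressing allows only an immediate offset
// that the instruction can encode, while MUBUF has its own offset rules.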
                                                   unsigned AddrSpace) const {

  return AM.Scale == 0 &&
             AM.BaseOffs, AddrSpace, FlatVariant));

    return isLegalMUBUFAddressingMode(AM);

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {

  if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))

  if (AM.HasBaseReg) {

    return isLegalMUBUFAddressingMode(AM);

  if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)

               : isLegalMUBUFAddressingMode(AM);
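// Misaligned access handling: the required alignment depends on the access
// size, and the reported "fast" value scales with how aligned the access is.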
    unsigned Size, unsigned AddrSpace, Align Alignment,

        Alignment < RequiredAlignment)

      RequiredAlignment = Align(4);

        *IsFast = (Alignment >= RequiredAlignment) ? 64
                  : (Alignment < Align(4))         ? 32

        *IsFast = (Alignment >= RequiredAlignment) ? 96
                  : (Alignment < Align(4))         ? 32

      RequiredAlignment = Align(8);

        *IsFast = (Alignment >= RequiredAlignment) ? 128
                  : (Alignment < Align(4))         ? 32

      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;

    return Alignment >= RequiredAlignment ||

    bool AlignedBy4 = Alignment >= Align(4);
      *IsFast = AlignedBy4;

    return AlignedBy4 ||

    bool AlignedBy4 = Alignment >= Align(4);
      *IsFast = AlignedBy4;

    return Alignment >= Align(4) ||

  return Size >= 32 && Alignment >= Align(4);

    unsigned *IsFast) const {
                                                  Alignment, Flags, IsFast);

  if (Op.size() >= 16 &&

  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))

  const MemSDNode *MemNode = cast<MemSDNode>(N);

                                           unsigned DestAS) const {
  return TM.isNoopAddrSpaceCast(SrcAS, DestAS);

  const MemSDNode *MemNode = cast<MemSDNode>(N);

                                               unsigned Index) const {

  std::tie(InputPtrReg, RC, ArgTy) =
      MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

                                          const SDLoc &SL) const {

                                          const SDLoc &SL) const {
  std::optional<uint32_t> KnownSize =
  if (KnownSize.has_value())

      Val = getFPExtOrFPRound(DAG, Val, SL, VT);

SDValue SITargetLowering::lowerKernargMemParameter(
    int64_t OffsetDiff = Offset - AlignDownOffset;
    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);

  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);

      ExtType, SL, VA.getLocVT(), Chain, FIN,
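// Map the workgroup ID inputs (X/Y/Z) onto 32-bit scalar registers.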
    Reg = &WorkGroupIDX;
    RC = &AMDGPU::SReg_32RegClass;

    Reg = &WorkGroupIDY;
    RC = &AMDGPU::SReg_32RegClass;

    Reg = &WorkGroupIDZ;
    RC = &AMDGPU::SReg_32RegClass;

  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
           "vector type argument should have been split");

      bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
             "unexpected vector split in ps argument type");

      Info->markPSInputAllocated(PSInputNum);
        Info->markPSInputEnabled(PSInputNum);

  if (Info.hasWorkItemIDX()) {
                     Info.hasWorkItemIDY()) ? 0x3ff : ~0u;

  if (Info.hasWorkItemIDY()) {
      unsigned Reg = AMDGPU::VGPR1;

  if (Info.hasWorkItemIDZ()) {
      unsigned Reg = AMDGPU::VGPR2;

  if (RegIdx == ArgVGPRs.size()) {
  unsigned Reg = ArgVGPRs[RegIdx];
  assert(Reg != AMDGPU::NoRegister);

                               unsigned NumArgRegs) {
  if (RegIdx == ArgSGPRs.size())
  unsigned Reg = ArgSGPRs[RegIdx];
  assert(Reg != AMDGPU::NoRegister);

  assert(Reg != AMDGPU::NoRegister);

  const unsigned Mask = 0x3ff;
  if (Info.hasWorkItemIDX()) {
    Info.setWorkItemIDX(Arg);
  if (Info.hasWorkItemIDY()) {
    Info.setWorkItemIDY(Arg);
  if (Info.hasWorkItemIDZ())

  const unsigned Mask = 0x3ff;

  if (Info.hasImplicitArgPtr())
  if (Info.hasWorkGroupIDX())
  if (Info.hasWorkGroupIDY())
  if (Info.hasWorkGroupIDZ())
  if (Info.hasLDSKernelId())
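// Register the user SGPR inputs (implicit buffer pointer, private segment
// buffer, dispatch pointer, queue pointer, dispatch ID, flat scratch init,
// private segment size) as machine-function live-ins.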
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);

    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
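// Kernel-argument preloading: contiguous 'inreg' kernel arguments are copied
// into user SGPRs, stopping once padding plus allocated SGPRs would exceed
// the registers available for preloading.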
  unsigned LastExplicitArgOffset =

  bool InPreloadSequence = true;

  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())

    int ArgIdx = Arg.getArgNo();

    if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
                               (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))

    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];

      unsigned ArgOffset = ArgLoc.getLocMemOffset();
      unsigned NumAllocSGPRs =
          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;

      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);

      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;

      if (PaddingSGPRs + NumAllocSGPRs + 1 >
        InPreloadSequence = false;

          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);

          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);

      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {

      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;

  if (Info.hasLDSKernelId()) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
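// System SGPR allocation: workgroup IDs X/Y/Z, workgroup info, and the
// private segment wave byte offset.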
                                           bool IsShader) const {

    assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");

    unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();

    unsigned NumRequiredSystemSGPRs =
        Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
        Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
    for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDY()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDZ()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupInfo()) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    unsigned PrivateSegmentWaveByteOffsetReg;

      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();

      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);

      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);

         Info.getNumPreloadedSGPRs() >= 16);
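// Decide which registers hold the scratch resource descriptor, the stack
// pointer, and the frame pointer, based on whether the function needs stack
// access.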
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);

    HasStackObjects = true;

  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
      Info.setScratchRSrcReg(PrivateSegmentBufferReg);

      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
      Info.setScratchRSrcReg(ReservedBufferReg);

    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);

      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
        if (!MRI.isLiveIn(Reg)) {
          Info.setStackPtrOffsetReg(Reg);

      if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)

  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);

  return !Info->isEntryFunction();

  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());

    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;

    Entry->addLiveIn(*I);

    for (auto *Exit : Exits)
              TII->get(TargetOpcode::COPY), *I)

        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());

           !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
           !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());

           !Info->hasWorkGroupIDZ());

    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);

    unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
    if ((PsInputBits & 0x7F) == 0 ||
        ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))

  } else if (IsKernel) {
    Splits.append(Ins.begin(), Ins.end());
  } else if (!IsGraphics) {
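// Lower each incoming formal argument: kernel arguments are loaded from the
// kernarg segment (or taken from preloaded SGPRs), other memory arguments
// come from the stack, and register arguments from live-in VGPRs/SGPRs.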
  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {

    if (IsEntryFunc && VA.isMemLoc()) {

      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {

        int64_t OffsetDiff = Offset - AlignDownOffset;

            Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];

          NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

            Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;

        if (PreloadRegs.size() == 1) {
          Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);

              TRI->getRegSizeInBits(*RC)));

          for (auto Reg : PreloadRegs) {

                                    PreloadRegs.size()),

        NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
                                Ins[i].Flags.isSExt(), &Ins[i]);

            lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
                                     Alignment, Ins[i].Flags.isSExt(), &Ins[i]);

          dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));

    if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);

    if (AMDGPU::VGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::VGPR_32RegClass;
    else if (AMDGPU::SGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::SGPR_32RegClass;

  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain :

  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);

    unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
    for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
      if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))

  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {
    SDValue Arg = OutVals[RealRVLocIdx];

  if (!Info->isEntryFunction()) {

      if (AMDGPU::SReg_64RegClass.contains(*I))
      else if (AMDGPU::SReg_32RegClass.contains(*I))

  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
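// passSpecialInputs: forward the implicit ABI inputs (implicit argument
// pointer, workgroup IDs, workitem IDs) from the caller to the callee,
// packing the workitem IDs into a single register when possible.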
    auto &ArgUsageInfo =
    CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);

    std::tie(OutgoingArg, ArgRC, ArgTy) =
    std::tie(IncomingArg, IncomingArgRC, Ty) =
    assert(IncomingArgRC == ArgRC);

    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;

      InputReg = getImplicitArgPtr(DAG, DL);

      std::optional<uint32_t> Id =
      if (Id.has_value()) {

      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);

      unsigned SpecialArgOffset =

  std::tie(OutgoingArg, ArgRC, Ty) =
    std::tie(OutgoingArg, ArgRC, Ty) =
    std::tie(OutgoingArg, ArgRC, Ty) =

  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");

      InputReg = InputReg.getNode() ?
      InputReg = InputReg.getNode() ?

  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {

          IncomingArgX ? *IncomingArgX :
          IncomingArgY ? *IncomingArgY :
          *IncomingArgZ, ~0u);

    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
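// Tail-call eligibility: divergent callees and incompatible preserved-register
// masks between caller and callee rule out a tail call; byval arguments need
// additional checks.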
  if (Callee->isDivergent())

  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (!CallerPreserved)

  bool CCMatch = CallerCC == CalleeCC;

    if (Arg.hasByValAttr())

    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))

  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

  if (IsChainCallConv) {
    RequestedExec = CLI.Args.back();
    assert(RequestedExec.Node && "No node for EXEC");

      assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
      CLI.Outs.pop_back();

      assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
      CLI.Outs.pop_back();

           "Haven't popped all the pieces of the EXEC mask");
  bool IsSibCall = false;

                              "unsupported call to variadic function ");

                                    "unsupported required tail call to function ");

        Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);

           "site marked musttail or on llvm.amdgcn.cs.chain");

  if (!TailCallOpt && IsTailCall)

  if (!IsSibCall || IsChainCallConv) {

    RegsToPass.emplace_back(IsChainCallConv
                                ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                                : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,

  MVT PtrVT = MVT::i32;

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {

      RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));

      int32_t Offset = LocMemOffset;

      unsigned OpSize = Flags.isByVal() ?

              ? Flags.getNonZeroByValAlign()

      if (Outs[i].Flags.isByVal()) {
            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);

                              Outs[i].Flags.getNonZeroByValAlign(),
                              nullptr, std::nullopt, DstInfo,

            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);

  if (!MemOpChains.empty())

  for (auto &RegToPass : RegsToPass) {
                             RegToPass.second, InGlue);

  if (IsTailCall && !IsSibCall) {

  std::vector<SDValue> Ops;
  Ops.push_back(Chain);
  Ops.push_back(Callee);

  if (IsChainCallConv)
    Ops.push_back(RequestedExec.Node);

  for (auto &RegToPass : RegsToPass) {
                                  RegToPass.second.getValueType()));

  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");

                                  MVT::Glue, GlueOps),

    Ops.push_back(InGlue);

    return DAG.getNode(OPC, DL, NodeTys, Ops);

  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);

  uint64_t CalleePopBytes = NumBytes;
  EVT VT = Op.getValueType();

  MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();

  Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize);
  if (Alignment && *Alignment > StackAlign) {

  if (isa<ConstantSDNode>(Size))

  if (Op.getValueType() != MVT::i32)

  assert(Op.getValueType() == MVT::i32);

                              Op.getOperand(0), IntrinID, GetRoundBothImm);

  SDValue RoundModeTimesNumBits =

                                  TableEntry, EnumOffset);

  if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
        static_cast<uint32_t>(ConstMode->getZExtValue()),

    if (UseReducedTable) {
      SDValue RoundModeTimesNumBits =

      SDValue RoundModeTimesNumBits =

      NewMode = TruncTable;

                          ReadFirstLaneID, NewMode);

                     IntrinID, RoundBothImm, NewMode);

  if (Op->isDivergent())

  switch (cast<MemSDNode>(Op)->getAddressSpace()) {

  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = Src.getValueType();

  EVT DstVT = Op.getValueType();

  if (Op.getValueType() != MVT::i64)

                              Op.getOperand(0), IntrinID, ModeHwRegImm);
                              Op.getOperand(0), IntrinID, TrapHwRegImm);

  if (Op.getOperand(1).getValueType() != MVT::i64)

                                       ReadFirstLaneID, NewModeReg);
                                       ReadFirstLaneID, NewTrapReg);

  unsigned ModeHwReg =
  unsigned TrapHwReg =

                                 IntrinID, ModeHwRegImm, NewModeReg);
                                 IntrinID, TrapHwRegImm, NewTrapReg);
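// getRegisterByName: named-register lookup used by llvm.read_register /
// llvm.write_register for m0, exec, exec_lo/exec_hi, and the flat scratch
// registers.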
                        .Case("m0", AMDGPU::M0)
                        .Case("exec", AMDGPU::EXEC)
                        .Case("exec_lo", AMDGPU::EXEC_LO)
                        .Case("exec_hi", AMDGPU::EXEC_HI)
                        .Case("flat_scratch", AMDGPU::FLAT_SCR)
                        .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
                        .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)

  if (Reg == AMDGPU::NoRegister) {

  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI: