#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "si-lower"
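// Fragments of two command-line options follow: one suppresses loop
// alignment/prefetching, the other forces indirect register addressing for
// divergent vector indices.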
    cl::desc("Do not align and prefetch loops"),

    "amdgpu-use-divergent-register-indexing", cl::Hidden,
    cl::desc("Use indirect register addressing for divergent indexes"),
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
      return AMDGPU::SGPR0 + Reg;
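// The brace-enclosed MVT lists below are arguments to setOperationAction()
// calls in the SITargetLowering constructor; the trailing Expand/Custom/Legal
// token is the action applied to every type in the preceding list.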
    {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
     MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
     MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
     MVT::i1, MVT::v32i32},

    {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
     MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
     MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
     MVT::i1, MVT::v32i32},

    {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);

    {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
     MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
     MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},

    {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
     MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
     MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},

    {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
     MVT::v3i16, MVT::v4i16, MVT::Other},

    {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
    {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
     MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
     MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
     MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
     MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
     MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
     MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
     MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
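    // Per-type loop over the wide vector MVTs listed above; in the full
    // source most generic operations on these types default to Expand, with
    // a short allowlist (load/store, build/extract/insert, bitcast) left
    // untouched and the subvector operations handled as Custom.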
  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {

  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {

  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {

  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {

  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
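  // In the full source each Vec64 loop promotes BUILD_VECTOR,
  // EXTRACT_VECTOR_ELT and related operations on the 64-bit-element type to
  // the corresponding vector of twice as many 32-bit elements (e.g. v2i64 is
  // handled as v4i32).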
    {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},

    {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
     MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},

    {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);

    {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
     MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
     MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
     MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},

    {MVT::f32, MVT::f64}, Legal);

    {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
     MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
     MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
    {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

    {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

    {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
     MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {

    {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);

    {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
     MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},

  for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})

  for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})

    {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},

    {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
     MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
     MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
     MVT::v32f16, MVT::v32bf16},

    {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);

    {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

    {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
     MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,

    {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
     MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
     MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
     MVT::i16, MVT::bf16, MVT::i8, MVT::i128},

    {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
     MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
     MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
     MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
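// The last three lists mix MVT::Other with scalar and short-vector types;
// in the full source they are likely the Custom-action type lists for the
// ISD::INTRINSIC_WO_CHAIN / INTRINSIC_W_CHAIN / INTRINSIC_VOID nodes.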
  static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
                                      EVT DestVT, EVT SrcVT) const {

                                      LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
    return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);

  return VT.isInteger() ? MVT::i32 : MVT::f32;

      return (NumElts + 1) / 2;

      return NumElts * ((Size + 31) / 32);

    unsigned &NumIntermediates, MVT &RegisterVT) const {

    if (ScalarVT == MVT::bf16) {
      RegisterVT = MVT::i32;
      IntermediateVT = MVT::v2bf16;

      RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
      IntermediateVT = RegisterVT;

    NumIntermediates = (NumElts + 1) / 2;
    return NumIntermediates;

      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

    if (Size < 16 && Subtarget->has16BitInsts()) {

      RegisterVT = MVT::i16;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

      RegisterVT = MVT::i32;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

    RegisterVT = MVT::i32;
    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts * ((Size + 31) / 32);
    return NumIntermediates;

      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
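  // Worked example from the fragments above: an 8-element bf16 vector is
  // broken down with IntermediateVT = v2bf16, RegisterVT = i32, and
  // NumIntermediates = (8 + 1) / 2 = 4, i.e. four 32-bit registers each
  // holding a pair of bf16 values.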
                                 unsigned MaxNumLanes) {
  assert(MaxNumLanes != 0);

  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());

                                   unsigned MaxNumLanes) {
  auto *ST = dyn_cast<StructType>(Ty);

  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));
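// The struct form handled here is the TFE/LWE variant of the image/buffer
// load return: element 0 is the loaded data and the trailing i32 asserted
// above is the status word, so only the first contained type contributes to
// the memory VT.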
          DL.getPointerSizeInBits(AS) == 192)

          DL.getPointerSizeInBits(AS) == 160) ||

          DL.getPointerSizeInBits(AS) == 192))
                                          unsigned IntrID) const {

  if (CI.hasMetadata(LLVMContext::MD_invariant_load))

  if (RsrcIntr->IsImage) {

      if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {

      Info.ptrVal = RsrcArg;

    bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;

      if (RsrcIntr->IsImage) {
        unsigned MaxNumLanes = 4;

            std::numeric_limits<unsigned>::max());

      if (RsrcIntr->IsImage) {
        unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();

    if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {

      Info.memVT = MVT::i32;
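  // The case labels that follow are per-intrinsic arms of the
  // getTgtMemIntrinsic() switch; each arm fills in Info (memVT, ptrVal,
  // flags) so the intrinsic gets an accurate MachineMemOperand.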
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();

  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {

        std::numeric_limits<unsigned>::max());

  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {

  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {

    Info.ptrVal = nullptr;

  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {

  case Intrinsic::amdgcn_global_atomic_csub: {

  case Intrinsic::amdgcn_image_bvh_intersect_ray: {

  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_atomic_cond_sub_u32: {

  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64: {

  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {

    Info.memVT = MVT::i32;

    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)

  case Intrinsic::amdgcn_global_load_lds: {

    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();

  case Intrinsic::amdgcn_ds_bvh_stack_rtn: {

    Info.memVT = MVT::i32;

  case Intrinsic::amdgcn_s_prefetch_data: {

  case Intrinsic::amdgcn_addrspacecast_nonnull: {

    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();
                                            Type *&AccessTy) const {

  switch (II->getIntrinsicID()) {
  case Intrinsic::amdgcn_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_csub:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
    Ptr = II->getArgOperand(0);

  case Intrinsic::amdgcn_global_load_lds:
    Ptr = II->getArgOperand(1);

  AccessTy = II->getType();
                                                unsigned AddrSpace) const {

  return AM.Scale == 0 &&

             AM.BaseOffs, AddrSpace, FlatVariant));

    return isLegalMUBUFAddressingMode(AM);

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {

  if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))

  if (AM.HasBaseReg) {

    return isLegalMUBUFAddressingMode(AM);

  if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)

               : isLegalMUBUFAddressingMode(AM);
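// Addressing-mode legality is dispatched by address space: flat/global
// variants are checked against the legal FLAT immediate-offset range, while
// buffer accesses fall back to isLegalMUBUFAddressingMode(), which bounds
// the immediate offset via TII->isLegalMUBUFImmOffset().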
    unsigned Size, unsigned AddrSpace, Align Alignment,

    Align RequiredAlignment(

        Alignment < RequiredAlignment)

      RequiredAlignment = Align(4);

        *IsFast = (Alignment >= RequiredAlignment) ? 64
                  : (Alignment < Align(4))         ? 32

        *IsFast = (Alignment >= RequiredAlignment) ? 96
                  : (Alignment < Align(4))         ? 32

      RequiredAlignment = Align(8);

        *IsFast = (Alignment >= RequiredAlignment) ? 128
                  : (Alignment < Align(4))         ? 32

      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;

    return Alignment >= RequiredAlignment ||

    bool AlignedBy4 = Alignment >= Align(4);

      *IsFast = AlignedBy4;

    return Alignment >= Align(4) ||

  return Size >= 32 && Alignment >= Align(4);

                                                    unsigned *IsFast) const {

                                            Alignment, Flags, IsFast);
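  // Note *IsFast carries a speed score rather than a plain bool here:
  // well-aligned accesses report their width in bits (64/96/128, or Size),
  // accesses that are at least 4-byte aligned report 32, and 0 means slow.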
  if (Op.size() >= 16 &&

  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))

  const MemSDNode *MemNode = cast<MemSDNode>(N);

                                           unsigned DestAS) const {

  return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
                                               unsigned Index) const {

  auto [InputPtrReg, RC, ArgTy] =

      Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

                                         const SDLoc &SL) const {

                                      const SDLoc &SL) const {

  std::optional<uint32_t> KnownSize =

  if (KnownSize.has_value())

    Val = getFPExtOrFPRound(DAG, Val, SL, VT);

SDValue SITargetLowering::lowerKernargMemParameter(

  int64_t OffsetDiff = Offset - AlignDownOffset;

  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);

    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);

  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);

                                 ExtType, SL, VA.getLocVT(), Chain, FIN,
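  // lowerKernargMemParameter loads arguments from the kernarg segment: the
  // offset is aligned down to a dword boundary (AlignDownOffset), a wider
  // aligned load is issued, and OffsetDiff is used to shift/extract the
  // actual argument bits before convertArgType() produces the final value.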
SDValue SITargetLowering::getPreloadedValue(

    Reg = &WorkGroupIDX;
    RC = &AMDGPU::SReg_32RegClass;

    Reg = &WorkGroupIDY;
    RC = &AMDGPU::SReg_32RegClass;

    Reg = &WorkGroupIDZ;
    RC = &AMDGPU::SReg_32RegClass;
  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {

           "vector type argument should have been split");

    bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);

             "unexpected vector split in ps argument type");

      Info->markPSInputAllocated(PSInputNum);

        Info->markPSInputEnabled(PSInputNum);
  if (Info.hasWorkItemIDX()) {

  if (Info.hasWorkItemIDY()) {

      Info.setWorkItemIDY(

      unsigned Reg = AMDGPU::VGPR1;

  if (Info.hasWorkItemIDZ()) {

      Info.setWorkItemIDZ(

      unsigned Reg = AMDGPU::VGPR2;

  if (RegIdx == ArgVGPRs.size()) {

  unsigned Reg = ArgVGPRs[RegIdx];

  assert(Reg != AMDGPU::NoRegister);

                             unsigned NumArgRegs) {

  if (RegIdx == ArgSGPRs.size())

  unsigned Reg = ArgSGPRs[RegIdx];

  assert(Reg != AMDGPU::NoRegister);

  assert(Reg != AMDGPU::NoRegister);
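// For entry functions the workitem IDs arrive in fixed VGPRs (X in VGPR0,
// Y in VGPR1, Z in VGPR2 when enabled); the helpers below hand out the next
// free argument VGPR/SGPR from a fixed list and assert that one was found.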
  const unsigned Mask = 0x3ff;

  if (Info.hasWorkItemIDX()) {

    Info.setWorkItemIDX(Arg);

  if (Info.hasWorkItemIDY()) {

    Info.setWorkItemIDY(Arg);

  if (Info.hasWorkItemIDZ())

  const unsigned Mask = 0x3ff;

  if (Info.hasImplicitArgPtr())

  if (Info.hasWorkGroupIDX())

  if (Info.hasWorkGroupIDY())

  if (Info.hasWorkGroupIDZ())

  if (Info.hasLDSKernelId())
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);

    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
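// Each enabled user SGPR input (implicit buffer pointer, private segment
// buffer, dispatch/queue pointers, dispatch id, flat scratch init, private
// segment size) is registered as a function live-in with the register class
// matching its width: 64-bit pairs, except the 128-bit buffer descriptor and
// the 32-bit segment size.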
  bool InPreloadSequence = true;

  bool AlignedForImplictArgs = false;
  unsigned ImplicitArgOffset = 0;
  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())

    unsigned ArgIdx = Arg.getArgNo();

    if (InIdx < Ins.size() &&
        (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))

    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           Ins[InIdx].getOrigArgIndex() == ArgIdx;

      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];

      unsigned ArgOffset = ArgLoc.getLocMemOffset();

      unsigned NumAllocSGPRs =
          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;

      if (Arg.hasAttribute("amdgpu-hidden-argument")) {
        if (!AlignedForImplictArgs) {

              alignTo(LastExplicitArgOffset,

              LastExplicitArgOffset;
          AlignedForImplictArgs = true;

        ArgOffset += ImplicitArgOffset;

      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        assert(InIdx >= 1 && "No previous SGPR");
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);

      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;

        InPreloadSequence = false;

          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);

      Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);

      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {

      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;

  if (Info.hasLDSKernelId()) {

    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
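// Kernarg preloading: consecutive 'inreg' kernel arguments are assigned to
// user SGPRs until the sequence breaks; hidden (implicit) arguments are
// re-based past the explicit argument block, sub-dword arguments reuse the
// previous argument's SGPR, and padding SGPRs cover alignment gaps.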
                                      bool IsShader) const {

    assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");

    unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();

    unsigned NumRequiredSystemSGPRs =
        Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
        Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
    for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {

      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {

      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDY()) {

      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDZ()) {

      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupInfo()) {

    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasPrivateSegmentWaveByteOffset()) {

    unsigned PrivateSegmentWaveByteOffsetReg;

      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();

      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {

        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);

      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);

         Info.getNumPreloadedSGPRs() >= 16);
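// System SGPRs (workgroup IDs in X/Y/Z, workgroup info, and the private
// segment wave byte offset) are appended after the user SGPRs; when the
// subtarget provides architected SGPRs for the workgroup IDs, the explicit
// live-in registration is skipped.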
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);

    HasStackObjects = true;

  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {

      Info.setScratchRSrcReg(PrivateSegmentBufferReg);

      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);

      Info.setScratchRSrcReg(ReservedBufferReg);

    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);

    for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
      if (!MRI.isLiveIn(Reg)) {
        Info.setStackPtrOffsetReg(Reg);

  if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)

  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);

  return !Info->isEntryFunction();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());

    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;

    Entry->addLiveIn(*I);

  for (auto *Exit : Exits)

            TII->get(TargetOpcode::COPY), *I)
        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());

           !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
           !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());

           !Info->hasWorkGroupIDZ());

    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {

      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);

    unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
    if ((PsInputBits & 0x7F) == 0 ||
        ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))

  } else if (IsKernel) {

    Splits.append(Ins.begin(), Ins.end());

  } else if (!IsGraphics) {

  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {

    if (IsEntryFunc && VA.isMemLoc()) {

      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {

        int64_t OffsetDiff = Offset - AlignDownOffset;

            Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];

          NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

            Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;

        if (PreloadRegs.size() == 1) {

          Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);

              TRI->getRegSizeInBits(*RC)));

          for (auto Reg : PreloadRegs) {

                                     PreloadRegs.size()),

          NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

            "hidden argument in kernel signature was not preloaded",

          lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
                                   Alignment, Ins[i].Flags.isSExt(), &Ins[i]);

          dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));

    if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);

      if (AMDGPU::VGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::VGPR_32RegClass;
      else if (AMDGPU::SGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::SGPR_32RegClass;

  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain
  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);

  unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
    if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))

  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {

    SDValue Arg = OutVals[RealRVLocIdx];

  if (!Info->isEntryFunction()) {

      if (AMDGPU::SReg_64RegClass.contains(*I))

      else if (AMDGPU::SReg_32RegClass.contains(*I))

  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
  auto &ArgUsageInfo =

    CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);

    const auto [OutgoingArg, ArgRC, ArgTy] =

    const auto [IncomingArg, IncomingArgRC, Ty] =

    assert(IncomingArgRC == ArgRC);

    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;

        InputReg = getImplicitArgPtr(DAG, DL);

        std::optional<uint32_t> Id =

        if (Id.has_value()) {

    if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))

      unsigned SpecialArgOffset =

  auto [OutgoingArg, ArgRC, Ty] =

    std::tie(OutgoingArg, ArgRC, Ty) =

    std::tie(OutgoingArg, ArgRC, Ty) =

  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");

  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {

                   : IncomingArgY ? *IncomingArgY

  if (OutgoingArg->isRegister()) {

    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
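  // Special-input forwarding for calls: each implicit value the callee needs
  // (implicitarg pointer, dispatch id, workgroup IDs, workitem IDs, ...) is
  // taken from the caller's own inputs and either placed in the callee's
  // expected SGPR/VGPR or spilled to a fixed stack offset; the three packed
  // workitem IDs share one VGPR unless the call site opts out via the
  // "amdgpu-no-workitem-id-*" attributes.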
  if (Callee->isDivergent())

  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);

  if (!CallerPreserved)

  bool CCMatch = CallerCC == CalleeCC;

    if (Arg.hasByValAttr())

    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))

  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

  for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {

    if (!CCVA.isRegLoc())

    if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {

        dbgs() << "Cannot tail call due to divergent outgoing argument in "

  if (IsChainCallConv) {

    RequestedExec = CLI.Args.back();
    assert(RequestedExec.Node && "No node for EXEC");

    assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
    CLI.Outs.pop_back();

      assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
      CLI.Outs.pop_back();

           "Haven't popped all the pieces of the EXEC mask");

  bool IsSibCall = false;

                              "unsupported call to variadic function ");

                              "unsupported required tail call to function ");

                                           Outs, OutVals, Ins, DAG);

           "site marked musttail or on llvm.amdgcn.cs.chain");
  if (!TailCallOpt && IsTailCall)

  if (!IsSibCall || IsChainCallConv) {

    RegsToPass.emplace_back(IsChainCallConv
                                ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                                : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,

  const unsigned NumSpecialInputs = RegsToPass.size();

  MVT PtrVT = MVT::i32;

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {

      RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));

      int32_t Offset = LocMemOffset;

      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()

                                  ? Flags.getNonZeroByValAlign()

      if (Outs[i].Flags.isByVal()) {

            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);

                         Outs[i].Flags.getNonZeroByValAlign(),

                         nullptr, std::nullopt, DstInfo,

            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);

  if (!MemOpChains.empty())

  unsigned ArgIdx = 0;
  for (auto [Reg, Val] : RegsToPass) {
    if (ArgIdx++ >= NumSpecialInputs &&
        (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {

  if (IsTailCall && !IsSibCall) {

  std::vector<SDValue> Ops({Chain});

    Ops.push_back(Callee);

    Ops.push_back(Callee);

  if (IsChainCallConv)
    Ops.push_back(RequestedExec.Node);

  for (auto &[Reg, Val] : RegsToPass)

  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");

                                MVT::Glue, GlueOps),

  Ops.push_back(InGlue);

    return DAG.getNode(OPC, DL, MVT::Other, Ops);

  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);

  uint64_t CalleePopBytes = NumBytes;
  EVT VT = Op.getValueType();

  Align Alignment = cast<ConstantSDNode>(Tmp3)->getAlignValue();

         "Stack grows upwards for AMDGPU");

  Chain = BaseAddr.getValue(1);

  if (Alignment > StackAlign) {

    uint64_t StackAlignMask = ScaledAlignment - 1;

  if (isa<ConstantSDNode>(Size))

  if (Op.getValueType() != MVT::i32)

  assert(Op.getValueType() == MVT::i32);

                              Op.getOperand(0), IntrinID, GetRoundBothImm);

  SDValue RoundModeTimesNumBits =

                                  TableEntry, EnumOffset);

  if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {

        static_cast<uint32_t>(ConstMode->getZExtValue()),

    if (UseReducedTable) {

      SDValue RoundModeTimesNumBits =

    SDValue RoundModeTimesNumBits =

    NewMode = TruncTable;

                            ReadFirstLaneID, NewMode);

                              IntrinID, RoundBothImm, NewMode);

  if (Op->isDivergent())

  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = Src.getValueType();

  EVT DstVT = Op.getValueType();

  if (Op.getValueType() != MVT::i64)

                              Op.getOperand(0), IntrinID, ModeHwRegImm);

                              Op.getOperand(0), IntrinID, TrapHwRegImm);

  if (Op.getOperand(1).getValueType() != MVT::i64)