#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "si-lower"
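// Command-line overrides: "amdgpu-disable-loop-alignment" turns off loop
// alignment/prefetching, and "amdgpu-use-divergent-register-indexing" selects
// indirect register addressing for divergent indexes.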
    "amdgpu-disable-loop-alignment",
    cl::desc("Do not align and prefetch loops"),

    "amdgpu-use-divergent-register-indexing",
    cl::desc("Use indirect register addressing for divergent indexes"),
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
      return AMDGPU::SGPR0 + Reg;
  {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
   MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
   MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
   MVT::i1, MVT::v32i32},

  {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
   MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
   MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
   MVT::i1, MVT::v32i32},

  {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);

  {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
   MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
   MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},

  {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
   MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
   MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},

  {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
   MVT::v3i16, MVT::v4i16, MVT::Other},

  {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
       {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
        MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
        MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
        MVT::v4f16, MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32,
        MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, MVT::v8i16,
        MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v16i64, MVT::v16f64,
        MVT::v32i32, MVT::v32f32}) {

  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {

  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {

  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {

  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {

  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
  {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},

  {MVT::v2i16, MVT::v2f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
   MVT::v4i16, MVT::v4f16},

  {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);

  {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
   MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
   MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
   MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},

  {MVT::f32, MVT::f64}, Legal);
  for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16,
                 MVT::v8f16, MVT::v16i16, MVT::v16f16}) {

  {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Custom);

  {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Expand);

  for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16}) {

  {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
   MVT::v16f16, MVT::v16i16},

  for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16})

  for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16})

  {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},

  {MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
   MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16},

  {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
   MVT::v2i16, MVT::v2f16, MVT::i128},

  {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16,
   MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16,
   MVT::i16, MVT::i8, MVT::i128},

  {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16,
   MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16,
    EVT DestVT, EVT SrcVT) const {

    LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&

    return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);

  return VT.isInteger() ? MVT::i32 : MVT::f32;
    return (NumElts + 1) / 2;

    return NumElts * ((Size + 31) / 32);
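// getVectorTypeBreakdownForCallingConv: split an illegal vector argument type
// into NumIntermediates pieces of IntermediateVT, each carried in RegisterVT
// (packed v2i16/v2f16 pairs for 16-bit elements, i32 registers otherwise).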
    EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {

      if (ScalarVT == MVT::bf16) {
        RegisterVT = MVT::i32;
        IntermediateVT = MVT::v2bf16;

        RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
        IntermediateVT = RegisterVT;

      NumIntermediates = (NumElts + 1) / 2;
      return NumIntermediates;

      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

    if (Size < 16 && Subtarget->has16BitInsts()) {
      RegisterVT = MVT::i16;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

      RegisterVT = MVT::i32;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

      RegisterVT = MVT::i32;
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts * ((Size + 31) / 32);
      return NumIntermediates;

      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());

  auto *ST = dyn_cast<StructType>(Ty);

  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));
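// getTgtMemIntrinsic: describe the memory behavior (memVT, pointer value and
// MachineMemOperand flags) of AMDGPU buffer, image, DS and atomic intrinsics
// so the selector can attach accurate memory operands to them.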
                                          unsigned IntrID) const {
  if (CI.hasMetadata(LLVMContext::MD_invariant_load))

    if (RsrcIntr->IsImage)

    if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {

      Info.ptrVal = RsrcArg;

      unsigned MaxNumLanes = 4;

      if (RsrcIntr->IsImage) {

      if (RsrcIntr->IsImage) {
        unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();

  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();

  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {

  case Intrinsic::amdgcn_buffer_atomic_fadd: {

    if (!Vol || !Vol->isZero())

  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {

    Info.ptrVal = nullptr;

  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {

  case Intrinsic::amdgcn_global_atomic_csub: {

  case Intrinsic::amdgcn_image_bvh_intersect_ray: {

  case Intrinsic::amdgcn_global_atomic_fadd:
  case Intrinsic::amdgcn_global_atomic_fmin:
  case Intrinsic::amdgcn_global_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {

  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {

    Info.memVT = MVT::i32;

    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)

  case Intrinsic::amdgcn_global_load_lds: {

    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();

  case Intrinsic::amdgcn_ds_bvh_stack_rtn: {

    Info.memVT = MVT::i32;
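// getAddrModeArguments: for intrinsics that access memory through a pointer
// operand, report that pointer (and its access type) so addressing-mode
// matching can try to fold offsets into the access.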
                                            Type *&AccessTy) const {

  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
  case Intrinsic::amdgcn_global_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_global_atomic_csub: {

bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {

    return AM.BaseOffs == 0 && AM.Scale == 0;

  return AM.Scale == 0 &&
         (AM.BaseOffs == 0 ||

  return AM.Scale == 0 &&

    return isLegalFlatAddressingMode(AM);

    return isLegalMUBUFAddressingMode(AM);

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {

  if (AM.HasBaseReg) {

    return isLegalMUBUFAddressingMode(AM);

  if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)

    return isLegalMUBUFAddressingMode(AM);

    return isLegalFlatAddressingMode(AM);
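// allowsMisalignedMemoryAccessesImpl: decide, per address space, whether an
// unaligned access of Size bits is legal, and report a relative speed in
// *IsFast (0 means slow or unsupported) based on the required alignment.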
    unsigned Size, unsigned AddrSpace, Align Alignment,

      Alignment < RequiredAlignment)

      RequiredAlignment = Align(4);

        *IsFast = (Alignment >= RequiredAlignment) ? 64
                  : (Alignment < Align(4))         ? 32

        *IsFast = (Alignment >= RequiredAlignment) ? 96
                  : (Alignment < Align(4))         ? 32

      RequiredAlignment = Align(8);

        *IsFast = (Alignment >= RequiredAlignment) ? 128
                  : (Alignment < Align(4))         ? 32

      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;

    return Alignment >= RequiredAlignment ||

    bool AlignedBy4 = Alignment >= Align(4);

      *IsFast = AlignedBy4;

    return AlignedBy4 ||

    bool AlignedBy4 = Alignment >= Align(4);

      *IsFast = AlignedBy4;

    return Alignment >= Align(4) ||

  return Size >= 32 && Alignment >= Align(4);
    unsigned *IsFast) const {
      Alignment, Flags, IsFast);

  if (Op.size() >= 16 &&

  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))

  const MemSDNode *MemNode = cast<MemSDNode>(N);

                                          unsigned DestAS) const {

  return TM.isNoopAddrSpaceCast(SrcAS, DestAS);

  const MemSDNode *MemNode = cast<MemSDNode>(N);

                                               unsigned Index) const {

  std::tie(InputPtrReg, RC, ArgTy) =

      MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

                                          const SDLoc &SL) const {

                                          const SDLoc &SL) const {
  std::optional<uint32_t> KnownSize =

  if (KnownSize.has_value())

    Val = getFPExtOrFPRound(DAG, Val, SL, VT);
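// lowerKernargMemParameter: load a kernel argument from the kernarg segment.
// Small arguments are loaded as an aligned i32 dword and then shifted and
// truncated using the difference from the 4-byte aligned-down offset.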
SDValue SITargetLowering::lowerKernargMemParameter(

  int64_t OffsetDiff = Offset - AlignDownOffset;

    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);

    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);

  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);

      ExtType, SL, VA.getLocVT(), Chain, FIN,

  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {

           "vector type argument should have been split");

    bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);

             "unexpected vector split in ps argument type");

      Info->markPSInputAllocated(PSInputNum);

        Info->markPSInputEnabled(PSInputNum);
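// Entry functions receive the work-item IDs preloaded in VGPR0..VGPR2; the
// following code registers those VGPRs as 32-bit live-ins when the
// corresponding ID is actually used.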
  if (Info.hasWorkItemIDX()) {
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

                   Info.hasWorkItemIDY()) ? 0x3ff : ~0u;

  if (Info.hasWorkItemIDY()) {
      unsigned Reg = AMDGPU::VGPR1;
      MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

  if (Info.hasWorkItemIDZ()) {
      unsigned Reg = AMDGPU::VGPR2;
      MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

  if (RegIdx == ArgVGPRs.size()) {

  unsigned Reg = ArgVGPRs[RegIdx];
  assert(Reg != AMDGPU::NoRegister);

                             unsigned NumArgRegs) {

  if (RegIdx == ArgSGPRs.size())

  unsigned Reg = ArgSGPRs[RegIdx];
  assert(Reg != AMDGPU::NoRegister);

  assert(Reg != AMDGPU::NoRegister);

  const unsigned Mask = 0x3ff;

  if (Info.hasWorkItemIDX()) {
    Info.setWorkItemIDX(Arg);

  if (Info.hasWorkItemIDY()) {
    Info.setWorkItemIDY(Arg);

  if (Info.hasWorkItemIDZ())

  const unsigned Mask = 0x3ff;

  if (Info.hasImplicitArgPtr())

  if (Info.hasWorkGroupIDX())

  if (Info.hasWorkGroupIDY())

  if (Info.hasWorkGroupIDZ())

  if (Info.hasLDSKernelId())

    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);

    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);

  if (Info.hasLDSKernelId()) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
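// allocateSystemSGPRs: after the user SGPRs, reserve the system SGPRs for
// workgroup IDs, workgroup info and the private segment wave byte offset,
// honoring subtargets whose architected SGPRs provide the IDs implicitly.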
                                           bool IsShader) const {

  assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");

  unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();

  unsigned NumRequiredSystemSGPRs =
      Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
      Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
  for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupIDX()) {
    Register Reg = Info.addWorkGroupIDX(HasArchitectedSGPRs);
    if (!HasArchitectedSGPRs)
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupIDY()) {
    Register Reg = Info.addWorkGroupIDY(HasArchitectedSGPRs);
    if (!HasArchitectedSGPRs)
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupIDZ()) {
    Register Reg = Info.addWorkGroupIDZ(HasArchitectedSGPRs);
    if (!HasArchitectedSGPRs)
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupInfo()) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    unsigned PrivateSegmentWaveByteOffsetReg;

      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();

      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);

      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);

         Info.getNumPreloadedSGPRs() >= 16);

  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);

    HasStackObjects = true;

  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {

      Info.setScratchRSrcReg(PrivateSegmentBufferReg);

      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);

      Info.setScratchRSrcReg(ReservedBufferReg);

    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);

      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
        if (!MRI.isLiveIn(Reg)) {
          Info.setStackPtrOffsetReg(Reg);

      if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)

  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);

  return !Info->isEntryFunction();

  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());

    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;

    Entry->addLiveIn(*I);

    for (auto *Exit : Exits)
              TII->get(TargetOpcode::COPY), *I)
        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());

           !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
           !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());

           !Info->hasWorkGroupIDZ());

    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 &&
         Info->isPSInputAllocated(11))) {
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);

      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))

  } else if (IsKernel) {

    Splits.append(Ins.begin(), Ins.end());

  } else if (!IsGraphics) {

  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {

    if (IsEntryFunc && VA.isMemLoc()) {

      SDValue Arg = lowerKernargMemParameter(
          DAG, VT, MemVT, DL, Chain, Offset, Alignment,
          Ins[i].Flags.isSExt(), &Ins[i]);

          dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));

    } else if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);

    if (AMDGPU::VGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::VGPR_32RegClass;
    else if (AMDGPU::SGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::SGPR_32RegClass;

  auto &ArgUsageInfo =

  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain :
  unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
    if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))

  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {

    SDValue Arg = OutVals[RealRVLocIdx];

  if (!Info->isEntryFunction()) {

      if (AMDGPU::SReg_64RegClass.contains(*I))

      else if (AMDGPU::SReg_32RegClass.contains(*I))

  return DAG.getNode(Opc, DL, MVT::Other, RetOps);

  for (unsigned i = 0; i != RVLocs.size(); ++i) {
  auto &ArgUsageInfo =

  CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);

    std::tie(OutgoingArg, ArgRC, ArgTy) =

    std::tie(IncomingArg, IncomingArgRC, Ty) =

    assert(IncomingArgRC == ArgRC);

    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;

        InputReg = getImplicitArgPtr(DAG, DL);

        std::optional<uint32_t> Id =

        if (Id.has_value()) {

      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
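  // Work-item IDs X, Y and Z are packed into a single VGPR (10 bits each)
  // before being passed on; only the components the call site does not mark
  // with an "amdgpu-no-workitem-id-*" attribute need to be supplied.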
  unsigned SpecialArgOffset =

  std::tie(OutgoingArg, ArgRC, Ty) =

  std::tie(OutgoingArg, ArgRC, Ty) =

  std::tie(OutgoingArg, ArgRC, Ty) =

  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");

    InputReg = InputReg.getNode() ?

    InputReg = InputReg.getNode() ?

  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {

          IncomingArgX ? *IncomingArgX :
          IncomingArgY ? *IncomingArgY :
          *IncomingArgZ, ~0u);

    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);

  if (Callee->isDivergent())

  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);

  if (!CallerPreserved)

  bool CCMatch = CallerCC == CalleeCC;

    if (Arg.hasByValAttr())

  const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
  if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))

  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
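// LowerCall: lower calls from SelectionDAG, copying the scratch resource
// descriptor and special inputs into their expected registers, storing byval
// and stack arguments, and emitting either a normal call or a tail call.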
  bool IsSibCall = false;
  bool IsThisReturn = false;

  for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)

        "unsupported call to variadic function ");

        "unsupported required tail call to function ");

        Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);

           "site marked musttail");

    if (!TailCallOpt && IsTailCall)

  RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);

  MVT PtrVT = MVT::i32;

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {

      RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));

      int32_t Offset = LocMemOffset;

      unsigned OpSize = Flags.isByVal() ?

                              ? Flags.getNonZeroByValAlign()

      if (Outs[i].Flags.isByVal()) {

            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);

            Outs[i].Flags.getNonZeroByValAlign(),

            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);

  if (!MemOpChains.empty())

  for (auto &RegToPass : RegsToPass) {
                             RegToPass.second, InGlue);

  if (IsTailCall && !IsSibCall) {

  std::vector<SDValue> Ops;
  Ops.push_back(Chain);
  Ops.push_back(Callee);

  for (auto &RegToPass : RegsToPass) {
                                  RegToPass.second.getValueType()));

  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");

  Ops.push_back(InGlue);

    return DAG.getNode(OPC, DL, NodeTys, Ops);

  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);

  uint64_t CalleePopBytes = NumBytes;

                         InVals, IsThisReturn,
                         IsThisReturn ? OutVals[0] : SDValue());
  EVT VT = Op.getValueType();

  MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();

  Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize);
  if (Alignment && *Alignment > StackAlign) {

  if (isa<ConstantSDNode>(Size))

  if (Op.getValueType() != MVT::i32)

  assert(Op.getValueType() == MVT::i32);

      Op.getOperand(0), IntrinID, GetRoundBothImm);

  SDValue RoundModeTimesNumBits =

                                  TableEntry, EnumOffset);

          .Case("m0", AMDGPU::M0)
          .Case("exec", AMDGPU::EXEC)
          .Case("exec_lo", AMDGPU::EXEC_LO)
          .Case("exec_hi", AMDGPU::EXEC_HI)
          .Case("flat_scratch", AMDGPU::FLAT_SCR)
          .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
          .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)

  if (Reg == AMDGPU::NoRegister) {

  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:

  case AMDGPU::FLAT_SCR:

  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
static std::pair<MachineBasicBlock *, MachineBasicBlock *>

  auto Next = std::next(I);

  return std::pair(LoopBB, RemainderBB);

  auto I = MI.getIterator();
  auto E = std::next(I);

    Src->setIsKill(false);

    Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

    BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
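// Waterfall loop body (emitLoadM0FromVGPRLoop): read the index of the first
// active lane with V_READFIRSTLANE_B32, compare it against each lane's index,
// run the indexed access for the matching lanes, then remove them from EXEC
// and repeat until every lane has been handled.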
    unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
    unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,

  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
  Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);
  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)

  BuildMI(LoopBB, I, DL,
          TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
                                 : AMDGPU::S_AND_SAVEEXEC_B64),

  MRI.setSimpleHint(NewExec, CondReg);

  if (UseGPRIdxMode) {

      SGPRIdxReg = CurrentIdxReg;

      SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)

      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)

      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)

  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

  BuildMI(LoopBB, I, DL,
          TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
                                 : AMDGPU::S_XOR_B64_term),
          Exec)
                                 unsigned InitResultReg, unsigned PhiReg, int Offset,
                                 bool UseGPRIdxMode, Register &SGPRIdxReg) {

  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);

  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode, SGPRIdxReg);
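// computeIndirectRegAndOffset: map a constant element index into a vector
// register to a subregister index; out-of-range offsets are returned
// unchanged with sub0 so they are handled dynamically.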
static std::pair<unsigned, int>

  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;

    return std::pair(AMDGPU::sub0, Offset);

  assert(Idx->getReg() != AMDGPU::NoRegister);

    return Idx->getReg();

  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
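// emitIndirectSrc: lower an indirect vector-element read. A uniform (SGPR)
// index sets M0 or uses GPR index mode directly; a divergent index goes
// through the waterfall loop built by loadM0FromVGPR.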
  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {

    if (UseGPRIdxMode) {

          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);

    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)

    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)

  MI.eraseFromParent();
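// emitIndirectDst: the analogous lowering for indirect vector-element writes,
// using the indirect register-write MOVREL pseudos (or GPR index mode) and
// the same waterfall loop for divergent indexes.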
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (Idx->getReg() == AMDGPU::NoRegister) {

    MI.eraseFromParent();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {

    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);

      const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
          TRI.getRegSizeInBits(*VecRC), 32, false);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(VecRC);

                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);

    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)

    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
        TRI.getRegSizeInBits(*VecRC), 32, false);
    BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)

  MI.eraseFromParent();
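// Wave-wide reduction lowering: expand the reduction pseudo into a scalar
// loop that walks the active lanes (S_FF1 on the EXEC mask), reads each
// lane's value with V_READLANE_B32, folds it into an accumulator with the
// scalar opcode, and clears the lane bit until no active lanes remain.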
  bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));

    Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
    Register InitalValReg = MRI.createVirtualRegister(DstRegClass);

    Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
    Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
    Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);

    Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
    Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);

    bool IsWave32 = ST.isWave32();
    unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

        (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;

    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)

    I = ComputeLoop->end();

    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)

    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
        .addReg(TmpSReg->getOperand(0).getReg())

    unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
    auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
                   .addReg(ActiveBits->getOperand(0).getReg());
    auto LaneValue = BuildMI(*ComputeLoop, I, DL,
                             TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)

                         .addReg(FF1->getOperand(0).getReg());
    auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)

                              .addReg(LaneValue->getOperand(0).getReg());

    unsigned BITSETOpc =
        IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
    auto NewActiveBits =
        BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
            .addReg(FF1->getOperand(0).getReg())
            .addReg(ActiveBits->getOperand(0).getReg());

    Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
        .addMBB(ComputeLoop);
    ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
        .addMBB(ComputeLoop);

    unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;

        .addReg(NewActiveBits->getOperand(0).getReg())

    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))