#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "si-lower"
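// Command-line options excerpted below: one disables loop alignment and
// prefetching, the other switches divergent index access over to indirect
// register addressing.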
48 "amdgpu-disable-loop-alignment",
49 cl::desc(
"Do not align and prefetch loops"),
53 "amdgpu-use-divergent-register-indexing",
55 cl::desc(
"Use indirect register addressing for divergent indexes"),
60 return Info->getMode().allFP32Denormals();
65 return Info->getMode().allFP64FP16Denormals();
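// The two returns above come from the denormal-mode helpers: they query the
// machine-function info for the all-FP32 denormal setting and for the shared
// FP64/FP16 denormal setting, respectively.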
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
      return AMDGPU::SGPR0 + Reg;
                                         EVT DestVT, EVT SrcVT) const {

                                         LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&

    return (NumElts + 1) / 2;

  return NumElts * ((Size + 31) / 32);
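// Register-count arithmetic used above and in the breakdown below: two
// sub-32-bit elements are packed per 32-bit register, so (NumElts + 1) / 2
// rounds the pair count up; otherwise each element needs (Size + 31) / 32
// registers, i.e. its bit size rounded up to whole 32-bit registers.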
    EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {

      IntermediateVT = RegisterVT;
      NumIntermediates = (NumElts + 1) / 2;
      return NumIntermediates;

      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

    if (Size < 16 && Subtarget->has16BitInsts()) {
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts * ((Size + 31) / 32);
    return NumIntermediates;

      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());

  auto *ST = dyn_cast<StructType>(Ty);
  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));
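// The assert above expects the returned aggregate to be a two-member struct
// whose second member is an i32, presumably a status word packaged with the
// loaded data.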
                                          unsigned IntrID) const {
  if (CI.hasMetadata(LLVMContext::MD_invariant_load))

    if (RsrcIntr->IsImage)

    unsigned MaxNumLanes = 4;

    if (RsrcIntr->IsImage) {

    if (RsrcIntr->IsImage) {
      unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();

  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();

  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {

  case Intrinsic::amdgcn_buffer_atomic_fadd: {
    if (!Vol || !Vol->isZero())

  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {

  case Intrinsic::amdgcn_global_atomic_csub: {

  case Intrinsic::amdgcn_image_bvh_intersect_ray: {

  case Intrinsic::amdgcn_global_atomic_fadd:
  case Intrinsic::amdgcn_global_atomic_fmin:
  case Intrinsic::amdgcn_global_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {

  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)

  case Intrinsic::amdgcn_global_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();

  case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
                                            Type *&AccessTy) const {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
  case Intrinsic::amdgcn_global_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_global_atomic_csub: {
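// The case list above enumerates intrinsics whose pointer operand is handed
// back together with its access type, so that address arithmetic can be
// folded into the memory instruction's addressing mode.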
bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {

    return AM.BaseOffs == 0 && AM.Scale == 0;

  return AM.Scale == 0 &&
         (AM.BaseOffs == 0 ||

  return AM.Scale == 0 &&

    return isLegalFlatAddressingMode(AM);

    return isLegalMUBUFAddressingMode(AM);
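// isLegalMUBUFAddressingMode, defined next, decides whether an addressing mode
// can be encoded by the MUBUF buffer instructions.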
bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {

  if (AM.HasBaseReg) {

    return isLegalMUBUFAddressingMode(AM);

  if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)

    return isLegalMUBUFAddressingMode(AM);

    return isLegalFlatAddressingMode(AM);

    unsigned Size, unsigned AddrSpace, Align Alignment,

        Alignment < RequiredAlignment)

      RequiredAlignment = Align(4);

      *IsFast = (Alignment >= RequiredAlignment) ? 64
                : (Alignment < Align(4))         ? 32

      *IsFast = (Alignment >= RequiredAlignment) ? 96
                : (Alignment < Align(4))         ? 32

      RequiredAlignment = Align(8);

      *IsFast = (Alignment >= RequiredAlignment) ? 128
                : (Alignment < Align(4))         ? 32

    *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
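// In these fragments *IsFast reports an access width in bits rather than a
// simple flag: 64/96/128 (or the full Size) when the required alignment is
// met, 32 for the under-aligned fallback, and 0 when the misaligned access is
// not considered fast.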
  return Alignment >= RequiredAlignment ||

    bool AlignedBy4 = Alignment >= Align(4);
      *IsFast = AlignedBy4;

    return AlignedBy4 ||

    bool AlignedBy4 = Alignment >= Align(4);
      *IsFast = AlignedBy4;

  return Alignment >= Align(4) ||

  return Size >= 32 && Alignment >= Align(4);

    unsigned *IsFast) const {

                                                 Alignment, Flags, IsFast);

  if (Op.size() >= 16 &&
      Op.isDstAligned(Align(4)))

  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
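// The size/alignment checks above select progressively wider memory-op types
// for memcpy/memset-style lowering: 16-byte-or-larger operations with a
// 4-byte-aligned destination take the widest form, then 8-byte operations.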
  const MemSDNode *MemNode = cast<MemSDNode>(N);

                                           unsigned DestAS) const {
  return TM.isNoopAddrSpaceCast(SrcAS, DestAS);

  const MemSDNode *MemNode = cast<MemSDNode>(N);

                                           unsigned Index) const {

  std::tie(InputPtrReg, RC, ArgTy) =

      MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

                                        const SDLoc &SL) const {

                                        const SDLoc &SL) const {
  std::optional<uint32_t> KnownSize =
  if (KnownSize.has_value())

  if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&

    Val = getFPExtOrFPRound(DAG, Val, SL, VT);

SDValue SITargetLowering::lowerKernargMemParameter(

  int64_t OffsetDiff = Offset - AlignDownOffset;

  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
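// lowerKernargMemParameter rounds the argument offset down to a 4-byte
// boundary (AlignDownOffset) and keeps the remainder in OffsetDiff, so a small
// argument can be fetched with an aligned dword load and then shifted into
// place.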
  ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);

  if (Arg.Flags.isByVal()) {
    unsigned Size = Arg.Flags.getByValSize();

      ExtType, SL, VA.getLocVT(), Chain, FIN,

  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {

    assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
           "vector type argument should have been split");

        !Arg->Flags.isInReg() && PSInputNum <= 15) {
      bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);

      if (Arg->Flags.isSplit()) {
        while (!Arg->Flags.isSplitEnd()) {
                  Arg->VT.getScalarSizeInBits() == 16) &&
                 "unexpected vector split in ps argument type");

        Skipped.set(Arg->getOrigArgIndex());

      Info->markPSInputAllocated(PSInputNum);
      Info->markPSInputEnabled(PSInputNum);

  if (Info.hasWorkItemIDX()) {
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

                     Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
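// Work-item IDs are packed into one VGPR with X in the low ten bits, so the
// mask is 0x3ff when a Y component is also packed; if X is the only component,
// the whole register (~0u) may be used.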
  if (Info.hasWorkItemIDY()) {
      unsigned Reg = AMDGPU::VGPR1;
      MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

  if (Info.hasWorkItemIDZ()) {
      unsigned Reg = AMDGPU::VGPR2;
      MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

  if (RegIdx == ArgVGPRs.size()) {

  unsigned Reg = ArgVGPRs[RegIdx];
  assert(Reg != AMDGPU::NoRegister);

                           unsigned NumArgRegs) {
  if (RegIdx == ArgSGPRs.size())

  unsigned Reg = ArgSGPRs[RegIdx];
  assert(Reg != AMDGPU::NoRegister);

  assert(Reg != AMDGPU::NoRegister);

  const unsigned Mask = 0x3ff;

  if (Info.hasWorkItemIDX()) {

  if (Info.hasWorkItemIDY()) {

  if (Info.hasWorkItemIDZ())

  const unsigned Mask = 0x3ff;

  if (Info.hasDispatchPtr())

  if (Info.hasQueuePtr() &&

  if (Info.hasImplicitArgPtr())

  if (Info.hasDispatchID())

  if (Info.hasWorkGroupIDX())

  if (Info.hasWorkGroupIDY())

  if (Info.hasWorkGroupIDZ())

  if (Info.hasLDSKernelId())

  if (Info.hasImplicitBufferPtr()) {
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);

  if (Info.hasPrivateSegmentBuffer()) {
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);

  if (Info.hasDispatchPtr()) {
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);

  if (Info.hasQueuePtr() &&
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);

  if (Info.hasKernargSegmentPtr()) {

  if (Info.hasDispatchID()) {
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);

  if (Info.hasLDSKernelId()) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

                                         bool IsShader) const {
  assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");

  unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();

  unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() +
                                    Info.hasWorkGroupIDY() +
                                    Info.hasWorkGroupIDZ() +
                                    Info.hasWorkGroupInfo();
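// NumRequiredSystemSGPRs adds one SGPR per enabled work-group ID/info input;
// the loop below then reserves further SGPRs so the preloaded user and system
// SGPRs account for sixteen registers in total.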
  for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupIDX()) {
    Register Reg = Info.addWorkGroupIDX(HasArchitectedSGPRs);
    if (!HasArchitectedSGPRs)
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupIDY()) {
    Register Reg = Info.addWorkGroupIDY(HasArchitectedSGPRs);
    if (!HasArchitectedSGPRs)
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupIDZ()) {
    Register Reg = Info.addWorkGroupIDZ(HasArchitectedSGPRs);
    if (!HasArchitectedSGPRs)
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupInfo()) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    unsigned PrivateSegmentWaveByteOffsetReg;

      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();

      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);

      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);

         Info.getNumPreloadedSGPRs() >= 16);

  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);

    HasStackObjects = true;

  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {

      Info.setScratchRSrcReg(PrivateSegmentBufferReg);

      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);

      Info.setScratchRSrcReg(ReservedBufferReg);

    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);

      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
        if (!MRI.isLiveIn(Reg)) {
          Info.setStackPtrOffsetReg(Reg);

  if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)

  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);

  return !Info->isEntryFunction();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());

    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;

    Entry->addLiveIn(*I);

    for (auto *Exit : Exits)
              TII->get(TargetOpcode::COPY), *I)

        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());

  Info->allocateKnownAddressLDSGlobal(Fn);

    assert(!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() &&
           !Info->hasWorkGroupInfo() && !Info->hasLDSKernelId() &&
           !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
           !Info->hasWorkItemIDZ());

           !Info->hasWorkGroupIDZ());

    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 &&
         Info->isPSInputAllocated(11))) {
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);

    unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
    if ((PsInputBits & 0x7F) == 0 ||
        ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
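// A pixel shader must have at least one input enabled: when none of the low
// seven PS input bits are set, or none of the low four are set and input 11 is
// allocated, input 0 is forcibly allocated and enabled above; the same
// condition is then re-evaluated on PsInputBits, the inputs that are both
// present and enabled.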
  } else if (IsKernel) {

    Splits.append(Ins.begin(), Ins.end());

  } else if (!IsGraphics) {

  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {

    if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {

    if (IsEntryFunc && VA.isMemLoc()) {

      if (Arg.Flags.isByRef()) {

                Arg.Flags.getPointerAddrSpace())) {
            Arg.Flags.getPointerAddrSpace());

          DAG, VT, MemVT, DL, Chain, Offset, Alignment,
          Ins[i].Flags.isSExt(), &Ins[i]);

          dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));

    } else if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);

      if (!Arg.Flags.isByVal())

      if (AMDGPU::VGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::VGPR_32RegClass;
      else if (AMDGPU::SGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::SGPR_32RegClass;

      if (Arg.Flags.isSRet()) {

  auto &ArgUsageInfo =

  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain :

  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {

  if (!Info->isEntryFunction()) {

      if (AMDGPU::SReg_64RegClass.contains(*I))
      else if (AMDGPU::SReg_32RegClass.contains(*I))

  for (unsigned i = 0; i != RVLocs.size(); ++i) {

    auto &ArgUsageInfo =
    CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);

  std::tie(OutgoingArg, ArgRC, ArgTy) =

  std::tie(IncomingArg, IncomingArgRC, Ty) =

  assert(IncomingArgRC == ArgRC);

    InputReg = getImplicitArgPtr(DAG, DL);

    std::optional<uint32_t> Id =

    if (Id.has_value()) {

    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);

    unsigned SpecialArgOffset =

  std::tie(OutgoingArg, ArgRC, Ty) =

  std::tie(OutgoingArg, ArgRC, Ty) =

  std::tie(OutgoingArg, ArgRC, Ty) =

  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
  InputReg = InputReg.getNode() ?

    InputReg = InputReg.getNode() ?

  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {

          IncomingArgX ? *IncomingArgX :
          IncomingArgY ? *IncomingArgY :
                         *IncomingArgZ, ~0u);

    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);

  if (Callee->isDivergent())

  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);

  if (!CallerPreserved)

  bool CCMatch = CallerCC == CalleeCC;

    if (Arg.hasByValAttr())

  const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
  if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))

  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

  bool IsSibCall = false;
  bool IsThisReturn = false;

  for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)

                     "unsupported call to variadic function ");

                     "unsupported required tail call to function ");

                     "unsupported call to a shader function ");

                     "unsupported calling convention for call from "
                     "graphics shader of function ");

      Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);

                     "site marked musttail");

  if (!TailCallOpt && IsTailCall)

    RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {

      int32_t Offset = LocMemOffset;

      unsigned OpSize = Flags.isByVal() ?

                            ? Flags.getNonZeroByValAlign()

      if (Outs[i].Flags.isByVal()) {

                              Outs[i].Flags.getNonZeroByValAlign(),

  if (!MemOpChains.empty())

  for (auto &RegToPass : RegsToPass) {
                             RegToPass.second, InFlag);

  if (IsTailCall && !IsSibCall) {

  std::vector<SDValue> Ops;
  Ops.push_back(Chain);

  for (auto &RegToPass : RegsToPass) {
                                  RegToPass.second.getValueType()));

  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");

  Ops.push_back(InFlag);

  Chain = Call.getValue(0);
  InFlag = Call.getValue(1);

  uint64_t CalleePopBytes = NumBytes;

                         InVals, IsThisReturn,
                         IsThisReturn ? OutVals[0] : SDValue());
  EVT VT = Op.getValueType();
  SDValue Tmp2 = Op.getValue(1);
  SDValue Tmp3 = Op.getOperand(2);

  MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();

  Align StackAlign = TFL->getStackAlign();
  Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize);
  if (Alignment && *Alignment > StackAlign) {
                           << ST.getWavefrontSizeLog2(),

  if (isa<ConstantSDNode>(Size))

        .Case("m0", AMDGPU::M0)
        .Case("exec", AMDGPU::EXEC)
        .Case("exec_lo", AMDGPU::EXEC_LO)
        .Case("exec_hi", AMDGPU::EXEC_HI)
        .Case("flat_scratch", AMDGPU::FLAT_SCR)
        .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
        .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)

  if (Reg == AMDGPU::NoRegister) {

  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:

  case AMDGPU::FLAT_SCR:

  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
static std::pair<MachineBasicBlock *, MachineBasicBlock *>

  auto Next = std::next(I);

  return std::pair(LoopBB, RemainderBB);

  auto I = MI.getIterator();
  auto E = std::next(I);

  Src->setIsKill(false);

  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)

                              unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
                              unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,

  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
  Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)

  BuildMI(LoopBB, I, DL,
          TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
                                 : AMDGPU::S_AND_SAVEEXEC_B64),

  MRI.setSimpleHint(NewExec, CondReg);

  if (UseGPRIdxMode) {

      SGPRIdxReg = CurrentIdxReg;

      SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)

      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)

      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)

  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
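// Wave32 subtargets use the 32-bit exec mask (EXEC_LO) and the *_B32 opcodes;
// wave64 uses the full 64-bit EXEC and the *_B64 forms, as in the exec
// save/restore sequence around this loop.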
  BuildMI(LoopBB, I, DL,
          TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
                                 : AMDGPU::S_XOR_B64_term), Exec)

                        unsigned InitResultReg, unsigned PhiReg, int Offset,
                        bool UseGPRIdxMode, Register &SGPRIdxReg) {

  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);

  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode, SGPRIdxReg);

  BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)

static std::pair<unsigned, int>

  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;

    return std::pair(AMDGPU::sub0, Offset);

  assert(Idx->getReg() != AMDGPU::NoRegister);

    return Idx->getReg();

  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {

    if (UseGPRIdxMode) {

        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {

        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);

    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)

    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)

  MI.eraseFromParent();

  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (Idx->getReg() == AMDGPU::NoRegister) {

    MI.eraseFromParent();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {

    if (UseGPRIdxMode) {

        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);

  const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
      TRI.getRegSizeInBits(*VecRC), 32, false);
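// When GPR-index mode is unavailable, the indirect register write above falls
// back to a MovRel pseudo sized to the vector register class with 32-bit
// elements, which indexes through M0 rather than the dedicated index register.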