28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 #include "llvm/IR/IntrinsicsR600.h"
35 #define DEBUG_TYPE "si-lower"
37 STATISTIC(NumTailCalls,
"Number of tail calls");
40 "amdgpu-disable-loop-alignment",
41 cl::desc(
"Do not align and prefetch loops"),
45 "amdgpu-reserve-vgpr-for-sgpr-spill",
49 "amdgpu-use-divergent-register-indexing",
51 cl::desc(
"Use indirect register addressing for divergent indexes"),
56 return Info->getMode().allFP32Denormals();
61 return Info->getMode().allFP64FP16Denormals();
65 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
68 return AMDGPU::SGPR0 +
Reg;
  if (!Subtarget->hasBFI()) {
                                          EVT DestVT, EVT SrcVT) const {
    return (NumElts + 1) / 2;
    return NumElts * ((Size + 31) / 32);
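  // For example, a <5 x i16> vector packs two 16-bit elements per 32-bit
  // register, so it needs (5 + 1) / 2 = 3 registers, while a <3 x i64>
  // vector needs 3 * ((64 + 31) / 32) = 6 registers.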
                                          EVT VT, EVT &IntermediateVT,
                                          unsigned &NumIntermediates, MVT &RegisterVT) const {
    IntermediateVT = RegisterVT;
    NumIntermediates = (NumElts + 1) / 2;
    return NumIntermediates;
    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts;
    return NumIntermediates;
  if (Size < 16 && Subtarget->has16BitInsts()) {
    IntermediateVT = ScalarVT;
    NumIntermediates = NumElts;
    return NumIntermediates;
    IntermediateVT = ScalarVT;
    NumIntermediates = NumElts;
    return NumIntermediates;
    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts * ((Size + 31) / 32);
    return NumIntermediates;
      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
    unsigned NumElts = std::min(DMaskLanes, VT->getNumElements());
  auto *ST = dyn_cast<StructType>(Ty);
  if (ST->getNumContainedTypes() != 2 ||
      !ST->getContainedType(1)->isIntegerTy(32))
                                          unsigned IntrID) const {
  if (RsrcIntr->IsImage) {
    unsigned DMaskLanes = 4;
    if (RsrcIntr->IsImage) {
    if (RsrcIntr->IsImage) {
      unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
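      // The dmask immediate selects which of the up-to-four result channels
      // the image operation actually touches; the number of set bits is the
      // number of lanes (DMaskLanes) used to size the memory operand.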
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
  case Intrinsic::amdgcn_buffer_atomic_fadd: {
    if (!Vol || !Vol->isZero())
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
  case Intrinsic::amdgcn_global_atomic_csub: {
  case Intrinsic::amdgcn_image_bvh_intersect_ray: {
  case Intrinsic::amdgcn_global_atomic_fadd:
  case Intrinsic::amdgcn_global_atomic_fmin:
  case Intrinsic::amdgcn_global_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmax: {
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
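      // GWS operations are modeled as accesses to an abstract GWS pseudo
      // source value so the scheduler keeps them ordered; ds_gws_barrier is
      // treated as a load and the remaining GWS ops as stores.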
                                            Type *&AccessTy) const {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
  case Intrinsic::amdgcn_global_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_global_atomic_csub: {
bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
    return AM.BaseOffs == 0 && AM.Scale == 0;
  return AM.Scale == 0 &&
         (AM.BaseOffs == 0 ||
  return AM.Scale == 0 &&
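  // Flat addressing allows only a base register: scaled indexing is never
  // legal, and an immediate offset is accepted only on subtargets with flat
  // instruction offsets, where it must fit the encodable offset range.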
  return isLegalFlatAddressingMode(AM);
  return isLegalMUBUFAddressingMode(AM);
bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
  if (AM.HasBaseReg) {
    return isLegalMUBUFAddressingMode(AM);
    if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
    return isLegalMUBUFAddressingMode(AM);
    return isLegalFlatAddressingMode(AM);
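  // isLegalAddressingMode dispatches on the address space: global and flat
  // pointers use the flat rules above, while private (scratch) and buffer
  // accesses fall back to the MUBUF rules with their unsigned 12-bit
  // immediate offset field on these subtargets.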
                                                    unsigned Size, unsigned AddrSpace,
                                                    Align Alignment,
      *IsFast = Alignment != Align(2);
      bool AlignedBy4 = Alignment >= Align(4);
        *IsFast = AlignedBy4;
      bool AlignedBy16 = Alignment >= Align(16);
        *IsFast = AlignedBy16;
      bool AlignedBy8 = Alignment >= Align(8);
        *IsFast = AlignedBy8;
    bool AlignedBy4 = Alignment >= Align(4);
      *IsFast = AlignedBy4;
    return AlignedBy4 ||
    bool AlignedBy4 = Alignment >= Align(4);
      *IsFast = AlignedBy4;
                 Alignment >= Align(4) : Alignment != Align(2);
  return Size >= 32 && Alignment >= Align(4);
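  // Roughly: 4-byte alignment is always considered fast; 8- and 16-byte LDS
  // accesses only count as fast when naturally aligned (so the b64/b128 DS
  // forms can be used), and wider accesses elsewhere just need dword
  // alignment and at least 32 bits of data.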
                                                    bool *IsFast) const {
                                            Alignment, Flags, IsFast);
  if (Op.size() >= 16 &&
  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
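  // getOptimalMemOpType widens memcpy/memset expansion: with a 4-byte
  // aligned destination it prefers 16-byte (v4i32) chunks while at least 16
  // bytes remain, then falls back to 8-byte (f64) chunks.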
  const MemSDNode *MemNode = cast<MemSDNode>(N);
  const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
  return I && I->getMetadata("amdgpu.noclobber");
                                               unsigned DestAS) const {
  return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
  const MemSDNode *MemNode = cast<MemSDNode>(N);
  std::tie(InputPtrReg, RC, ArgTy) =
                                         const SDLoc &SL) const {
  if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
    Val = getFPExtOrFPRound(DAG, Val, SL, VT);
SDValue SITargetLowering::lowerKernargMemParameter(
    int64_t OffsetDiff = Offset - AlignDownOffset;
    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
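  // Kernel arguments live in the constant kernarg segment. Sub-dword or
  // misaligned arguments are fetched with an aligned 4-byte load at
  // AlignDownOffset and shifted by the byte difference (OffsetDiff) before
  // being converted back to the argument type; dword-aligned arguments are
  // loaded directly at Offset.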
  if (Arg.Flags.isByVal()) {
    unsigned Size = Arg.Flags.getByValSize();
      ExtType, SL, VA.getLocVT(), Chain, FIN,
  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
    assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
           "vector type argument should have been split");
        !Arg->Flags.isInReg() && PSInputNum <= 15) {
      bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
      if (Arg->Flags.isSplit()) {
        while (!Arg->Flags.isSplitEnd()) {
                  Arg->VT.getScalarSizeInBits() == 16) &&
                 "unexpected vector split in ps argument type");
            Splits.push_back(*Arg);
        Skipped.set(Arg->getOrigArgIndex());
      Info->markPSInputAllocated(PSInputNum);
        Info->markPSInputEnabled(PSInputNum);
    Splits.push_back(*Arg);
  if (Info.hasWorkItemIDX()) {
      Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
  if (Info.hasWorkItemIDY()) {
      unsigned Reg = AMDGPU::VGPR1;
  if (Info.hasWorkItemIDZ()) {
      unsigned Reg = AMDGPU::VGPR2;
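  // Entry functions receive the work-item IDs in VGPR0/VGPR1/VGPR2. On
  // subtargets with packed TIDs all three are packed into VGPR0 (x in bits
  // 0-9, y in bits 10-19, z in bits 20-29), hence the 0x3ff masks.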
  if (RegIdx == ArgVGPRs.size()) {
  unsigned Reg = ArgVGPRs[RegIdx];
                          unsigned NumArgRegs) {
  if (RegIdx == ArgSGPRs.size())
  unsigned Reg = ArgSGPRs[RegIdx];
  const unsigned Mask = 0x3ff;
  if (Info.hasWorkItemIDX()) {
  if (Info.hasWorkItemIDY()) {
  if (Info.hasWorkItemIDZ())
  const unsigned Mask = 0x3ff;
  auto &ArgInfo = Info.getArgInfo();
  if (Info.hasDispatchPtr())
  if (Info.hasQueuePtr())
  if (Info.hasImplicitArgPtr())
  if (Info.hasDispatchID())
  if (Info.hasWorkGroupIDX())
  if (Info.hasWorkGroupIDY())
  if (Info.hasWorkGroupIDZ())
  if (Info.hasImplicitBufferPtr()) {
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
  if (Info.hasPrivateSegmentBuffer()) {
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
  if (Info.hasDispatchPtr()) {
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
  if (Info.hasQueuePtr()) {
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
  if (Info.hasKernargSegmentPtr()) {
  if (Info.hasDispatchID()) {
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
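  // User SGPRs are handed out in the fixed ABI order shown above: private
  // segment buffer (an SGPR quad), dispatch pointer, queue pointer, kernarg
  // segment pointer, dispatch ID, and flat scratch init, each registered as
  // a live-in of the matching SGPR register class.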
                                           bool IsShader) const {
  if (Info.hasWorkGroupIDX()) {
  if (Info.hasWorkGroupIDY()) {
  if (Info.hasWorkGroupIDZ()) {
  if (Info.hasWorkGroupInfo()) {
  if (Info.hasPrivateSegmentWaveByteOffset()) {
    unsigned PrivateSegmentWaveByteOffsetReg;
      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);
    HasStackObjects = true;
  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
      Info.setScratchRSrcReg(ReservedBufferReg);
    Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
    for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
        Info.setStackPtrOffsetReg(Reg);
  if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);
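  // Stack access goes through a scratch resource descriptor (or flat
  // scratch on newer targets). SGPR32 is the conventional stack pointer for
  // calls, and SGPR33 is claimed as the frame offset register only when
  // frame lowering decides a frame pointer is required.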
  return !Info->isEntryFunction();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;
    Entry->addLiveIn(*I);
    for (auto *Exit : Exits)
              TII->get(TargetOpcode::COPY), *I)
        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
    assert(!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() &&
           !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
           !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
           !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
           !Info->hasWorkItemIDZ());
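    // A pixel shader must enable at least one interpolation input or the
    // hardware hangs: if none of the low PSInputAddr bits are set, or
    // POS_W_FLOAT (input 11) is used without any PERSP input, input 0 is
    // force-allocated and enabled below.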
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);
    unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
    if ((PsInputBits & 0x7F) == 0 ||
        ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
      Info->markPSInputEnabled(
  } else if (IsKernel) {
  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
    if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
    if (IsEntryFunc && VA.isMemLoc()) {
      if (Arg.Flags.isByRef()) {
                                    Arg.Flags.getPointerAddrSpace())) {
                                     Arg.Flags.getPointerAddrSpace());
        InVals.push_back(Ptr);
          DAG, VT, MemVT, DL, Chain, Offset, Alignment, Ins[i].Flags.isSExt(),
          &Ins[i]);
      Chains.push_back(Arg.getValue(1));
      InVals.push_back(Arg);
    } else if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
      InVals.push_back(Val);
      if (!Arg.Flags.isByVal())
    if (Arg.Flags.isSRet()) {
    InVals.push_back(Val);
  auto &ArgUsageInfo =
  Info->setBytesInStackArgArea(StackArgSize);
  return Chains.empty() ? Chain :
  Info->setIfReturnsVoid(Outs.empty());
  bool IsWaveEnd = Info->returnsVoid() && IsShader;
  RetOps.push_back(Chain);
  if (!Info->isEntryFunction()) {
        DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
    RetOps.push_back(ReturnAddrVirtualReg);
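    // Non-entry functions return through the return address held in the
    // SGPR pair reported by getReturnAddressReg; it is threaded through a
    // virtual register and handed to the return pseudo so the epilogue's
    // s_setpc_b64 receives it.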
  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {
  if (!Info->isEntryFunction()) {
      if (AMDGPU::SReg_64RegClass.contains(*I))
      else if (AMDGPU::SReg_32RegClass.contains(*I))
  RetOps.push_back(Flag);
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    InVals.push_back(Val);
  auto &ArgUsageInfo =
  CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
  for (auto InputID : InputRegs) {
    std::tie(OutgoingArg, ArgRC, ArgTy) =
    std::tie(IncomingArg, IncomingArgRC, Ty) =
    assert(IncomingArgRC == ArgRC);
      InputReg = getImplicitArgPtr(DAG, DL);
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
      unsigned SpecialArgOffset =
      MemOpChains.push_back(ArgStore);
  std::tie(OutgoingArg, ArgRC, Ty) =
    std::tie(OutgoingArg, ArgRC, Ty) =
    std::tie(OutgoingArg, ArgRC, Ty) =
  InputReg = InputReg.getNode() ?
    InputReg = InputReg.getNode() ?
        IncomingArgX ? *IncomingArgX :
        IncomingArgY ? *IncomingArgY :
        *IncomingArgZ, ~0u);
    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
    MemOpChains.push_back(ArgStore);
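  // Implicit inputs the callee is known to need (queue/dispatch pointers,
  // implicit argument pointer, work-item IDs, ...) are forwarded here: the
  // three work-item IDs are packed into a single VGPR (y shifted by 10,
  // z by 20) and each input is either placed in its designated register or
  // stored to its fixed offset in the caller's stack argument area.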
  if (!CallerPreserved)
  bool CCMatch = CallerCC == CalleeCC;
    if (Arg.hasByValAttr())
  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
  bool IsSibCall = false;
  bool IsThisReturn = false;
  for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
                      "unsupported call to variadic function ");
                        "unsupported required tail call to function ");
                      "unsupported call to a shader function ");
                      "unsupported calling convention for call from "
                      "graphics shader of function ");
        Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
             "site marked musttail");
  if (!TailCallOpt && IsTailCall)
    RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
    CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
      int32_t Offset = LocMemOffset;
      unsigned OpSize = Flags.isByVal() ?
      if (Outs[i].Flags.isByVal()) {
                                Outs[i].Flags.getNonZeroByValAlign(),
        MemOpChains.push_back(Cpy);
        MemOpChains.push_back(Store);
  if (!MemOpChains.empty())
  for (auto &RegToPass : RegsToPass) {
                             RegToPass.second, InFlag);
        DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
    PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
    Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
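  // Call ABI plumbing: the caller's scratch resource descriptor is passed in
  // s[0:3], register arguments are copied into their assigned physical
  // registers, byval and overflow arguments are written to the outgoing
  // stack area, and for tail calls the return address is forwarded in the
  // physical return-address register pair.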
  if (IsTailCall && !IsSibCall) {
  std::vector<SDValue> Ops;
  Ops.push_back(Chain);
    Ops.push_back(PhysReturnAddrReg);
  for (auto &RegToPass : RegsToPass) {
                                  RegToPass.second.getValueType()));
  assert(Mask && "Missing call preserved mask for calling convention");
  Ops.push_back(InFlag);
  Chain = Call.getValue(0);
  InFlag = Call.getValue(1);
  uint64_t CalleePopBytes = NumBytes;
                         InVals, IsThisReturn,
                         IsThisReturn ? OutVals[0] : SDValue());
  EVT VT = Op.getValueType();
  MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
  Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize);
                                << ST.getWavefrontSizeLog2(),
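  // Dynamic allocas operate on the wave-level stack pointer, which is a byte
  // offset into swizzled per-lane scratch, so the requested size is scaled
  // by the wavefront size (the shift by getWavefrontSizeLog2 above) before
  // being applied to SP.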
  if (isa<ConstantSDNode>(Size))
      .Case("exec", AMDGPU::EXEC)
      .Case("exec_lo", AMDGPU::EXEC_LO)
      .Case("exec_hi", AMDGPU::EXEC_HI)
      .Case("flat_scratch", AMDGPU::FLAT_SCR)
      .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
      .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
  if (Reg == AMDGPU::NoRegister) {
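    // Only exec and flat_scratch (and their 32-bit halves) are addressable
    // by name through llvm.read_register / llvm.write_register; any other
    // name ends up here and is rejected.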
  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:
  case AMDGPU::FLAT_SCR:
  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
static std::pair<MachineBasicBlock *, MachineBasicBlock *>
  auto Next = std::next(I);
  return std::make_pair(LoopBB, RemainderBB);
  auto I = MI.getIterator();
  auto E = std::next(I);
  Src->setIsKill(false);
                                 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
                                 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
  BuildMI(LoopBB, I, DL,
          TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
                                 : AMDGPU::S_AND_SAVEEXEC_B64),
  if (UseGPRIdxMode) {
      SGPRIdxReg = CurrentIdxReg;
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  BuildMI(LoopBB, I, DL,
          TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
                                 : AMDGPU::S_XOR_B64_term), Exec)
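  // This is the standard waterfall loop for a divergent index:
  // v_readfirstlane picks the index of one active lane, v_cmp_eq plus
  // s_and_saveexec restricts exec to all lanes sharing that index, the
  // indexed operation runs for that subset, and s_xor_*_term removes those
  // lanes from the saved exec mask so the loop repeats until none remain.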
                                      unsigned InitResultReg, unsigned PhiReg, int Offset,
                                      bool UseGPRIdxMode, Register &SGPRIdxReg) {
  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode, SGPRIdxReg);
static std::pair<unsigned, int>
    return std::make_pair(AMDGPU::sub0, Offset);
  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  const bool UseGPRIdxMode = ST.useVGPRIndexMode();
  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
    MI.eraseFromParent();
                            UseGPRIdxMode, SGPRIdxReg);
  if (UseGPRIdxMode) {
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
  MI.eraseFromParent();
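  // Indirect vector-element reads with a uniform (SGPR) index are emitted
  // directly, either through the GPR index mode or via M0 + v_movrels; a
  // divergent index instead goes through loadM0FromVGPR, which wraps the
  // same move in the waterfall loop above.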
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  const bool UseGPRIdxMode = ST.useVGPRIndexMode();
  if (Idx->getReg() == AMDGPU::NoRegister) {
    MI.eraseFromParent();
  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
      const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
    MI.eraseFromParent();
                            UseGPRIdxMode, SGPRIdxReg);
  if (UseGPRIdxMode) {
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
    BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
  MI.eraseFromParent();
  switch (MI.getOpcode()) {
  case AMDGPU::S_UADDO_PSEUDO:
  case AMDGPU::S_USUBO_PSEUDO: {
    unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
                       ? AMDGPU::S_ADD_I32
                       : AMDGPU::S_SUB_I32;
    MI.eraseFromParent();
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO: {
        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
    bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
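    // 64-bit scalar add/sub is expanded into a 32-bit low half (s_add_u32 /
    // s_sub_u32) followed by the high half consuming the carry from SCC
    // (s_addc_u32 / s_subb_u32), with the halves reassembled afterwards.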
    MI.eraseFromParent();
  case AMDGPU::V_ADD_U64_PSEUDO:
  case AMDGPU::V_SUB_U64_PSEUDO: {
    bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
    const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
                                     : &AMDGPU::VReg_64RegClass;
                                     : &AMDGPU::VReg_64RegClass;
        TRI->getSubRegClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegClass(Src1RC, AMDGPU::sub1);
        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
    unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
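    // The vector form works the same way per lane: v_add_co_u32 /
    // v_sub_co_u32 produce the low 32 bits plus a per-lane carry in a
    // lane-mask register (CarryRC), which v_addc_u32 / v_subb_u32 then fold
    // into the high half.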