40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
49#define DEBUG_TYPE "si-lower"
54 "amdgpu-disable-loop-alignment",
55 cl::desc(
"Do not align and prefetch loops"),
59 "amdgpu-use-divergent-register-indexing",
61 cl::desc(
"Use indirect register addressing for divergent indexes"),
75 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
76 for (
unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
78 return AMDGPU::SGPR0 + Reg;
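// SITargetLowering constructor fragments: each brace-enclosed MVT list below
// is the type-set argument of a setOperationAction()-style registration; the
// opcodes and most of the surrounding calls are elided in this excerpt.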
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},

                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},

                     {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);

                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},

                     {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
                      MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
                      MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},

                     {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
                      MVT::v3i16, MVT::v4i16, MVT::Other},

                     {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
       {MVT::v8i32,  MVT::v8f32,  MVT::v9i32,  MVT::v9f32,  MVT::v10i32,
        MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
        MVT::v16i32, MVT::v16f32, MVT::v2i64,  MVT::v2f64,  MVT::v4i16,
        MVT::v4f16,  MVT::v4bf16, MVT::v3i64,  MVT::v3f64,  MVT::v6i32,
        MVT::v6f32,  MVT::v4i64,  MVT::v4f64,  MVT::v8i64,  MVT::v8f64,
        MVT::v8i16,  MVT::v8f16,  MVT::v8bf16, MVT::v16i16, MVT::v16f16,
        MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
        MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {

  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {

  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {

  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {

  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {

  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
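// The Vec64 loops above (bodies elided) register the same handling at every
// width: vectors with 64-bit elements are operated on as the equivalent
// vector of 32-bit halves.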
                     {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},

                     {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},

                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);

                     {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},

                     {MVT::f32, MVT::f64}, Legal);

                {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
                 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
                 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {

                       {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);

                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

                 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
                  MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {

                       {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
                        MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},

    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})

    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})

                       {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},

                     {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
                      MVT::v32f16, MVT::v32bf16},

                     {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);

                     {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
                      MVT::v2i16, MVT::v2f16, MVT::i128, MVT::i8},

                     {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16,
                      MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16,
                      MVT::i16, MVT::i8, MVT::i128},

                     {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16,
                      MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16,
                                           EVT DestVT, EVT SrcVT) const {

                                           LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&

      return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);

    return VT.isInteger() ? MVT::i32 : MVT::f32;
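// The returns above encode the calling-convention register types: 16-bit
// vector elements are passed packed two per 32-bit register (v2i16/v2f16),
// while bf16 elements fall back to plain i32 registers.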
      return (NumElts + 1) / 2;

    return NumElts * ((Size + 31) / 32);

    EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {

      if (ScalarVT == MVT::bf16) {
        RegisterVT = MVT::i32;
        IntermediateVT = MVT::v2bf16;
      } else {
        RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
        IntermediateVT = RegisterVT;
      }
      NumIntermediates = (NumElts + 1) / 2;
      return NumIntermediates;

      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

    if (Size < 16 && Subtarget->has16BitInsts()) {
      RegisterVT = MVT::i16;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

      RegisterVT = MVT::i32;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

      RegisterVT = MVT::i32;
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts * ((Size + 31) / 32);
      return NumIntermediates;

  return TargetLowering::getVectorTypeBreakdownForCallingConv(
      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
  assert(MaxNumLanes != 0);

  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());

  auto *ST = dyn_cast<StructType>(Ty);

  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));

      DL.getPointerSizeInBits(AS) == 192)

       DL.getPointerSizeInBits(AS) == 160) ||
       DL.getPointerSizeInBits(AS) == 192))
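// getTgtMemIntrinsic (below): describes the memory behavior of target
// intrinsics to SelectionDAG, so accesses made through them get correct
// MachineMemOperands (memVT, pointer value, atomic/volatile flags).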
                                          unsigned IntrID) const {

  if (CI.hasMetadata(LLVMContext::MD_invariant_load))

    if (RsrcIntr->IsImage)

    if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {

      Info.ptrVal = RsrcArg;

      unsigned MaxNumLanes = 4;

      if (RsrcIntr->IsImage) {

      if (RsrcIntr->IsImage) {
        unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();

  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();

  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {

  case Intrinsic::amdgcn_buffer_atomic_fadd: {

    if (!Vol || !Vol->isZero())

  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.ptrVal = nullptr;

  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {

  case Intrinsic::amdgcn_global_atomic_csub: {

  case Intrinsic::amdgcn_image_bvh_intersect_ray: {

  case Intrinsic::amdgcn_global_atomic_fadd:
  case Intrinsic::amdgcn_global_atomic_fmin:
  case Intrinsic::amdgcn_global_atomic_fmax:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {

  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128: {

  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {

    Info.memVT = MVT::i32;

    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)

  case Intrinsic::amdgcn_global_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();

  case Intrinsic::amdgcn_ds_bvh_stack_rtn: {

    Info.memVT = MVT::i32;

  case Intrinsic::amdgcn_addrspacecast_nonnull: {
    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();
1441 Type *&AccessTy)
const {
1444 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1445 case Intrinsic::amdgcn_ds_append:
1446 case Intrinsic::amdgcn_ds_consume:
1447 case Intrinsic::amdgcn_ds_fadd:
1448 case Intrinsic::amdgcn_ds_fmax:
1449 case Intrinsic::amdgcn_ds_fmin:
1450 case Intrinsic::amdgcn_ds_ordered_add:
1451 case Intrinsic::amdgcn_ds_ordered_swap:
1452 case Intrinsic::amdgcn_flat_atomic_fadd:
1453 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
1454 case Intrinsic::amdgcn_flat_atomic_fmax:
1455 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1456 case Intrinsic::amdgcn_flat_atomic_fmin:
1457 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1458 case Intrinsic::amdgcn_global_atomic_csub:
1459 case Intrinsic::amdgcn_global_atomic_fadd:
1460 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1461 case Intrinsic::amdgcn_global_atomic_fmax:
1462 case Intrinsic::amdgcn_global_atomic_fmax_num:
1463 case Intrinsic::amdgcn_global_atomic_fmin:
1464 case Intrinsic::amdgcn_global_atomic_fmin_num:
1465 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1466 case Intrinsic::amdgcn_global_load_tr_b64:
1467 case Intrinsic::amdgcn_global_load_tr_b128:
1470 case Intrinsic::amdgcn_global_load_lds:
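// getAddrModeArguments: for the intrinsics listed above, the pointer operand
// is exposed so addressing-mode folding can treat it like an ordinary
// load/store address.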
bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,

    return AM.BaseOffs == 0 && AM.Scale == 0;

  return AM.Scale == 0 &&
             AM.BaseOffs, AddrSpace, FlatVariant));

    return isLegalMUBUFAddressingMode(AM);

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {

  if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))

  if (AM.HasBaseReg) {

    return isLegalMUBUFAddressingMode(AM);

  if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)

             : isLegalMUBUFAddressingMode(AM);
    unsigned Size, unsigned AddrSpace, Align Alignment,

        Alignment < RequiredAlignment)

      RequiredAlignment = Align(4);

        *IsFast = (Alignment >= RequiredAlignment) ? 64
                  : (Alignment < Align(4))         ? 32

        *IsFast = (Alignment >= RequiredAlignment) ? 96
                  : (Alignment < Align(4))         ? 32

      RequiredAlignment = Align(8);

        *IsFast = (Alignment >= RequiredAlignment) ? 128
                  : (Alignment < Align(4))         ? 32

      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;

    return Alignment >= RequiredAlignment ||

    bool AlignedBy4 = Alignment >= Align(4);
      *IsFast = AlignedBy4;
    return AlignedBy4 ||

    bool AlignedBy4 = Alignment >= Align(4);
      *IsFast = AlignedBy4;

    return Alignment >= Align(4) ||

  return Size >= 32 && Alignment >= Align(4);

                                                    unsigned *IsFast) const {
                                            Alignment, Flags, IsFast);
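// Note the IsFast convention above: rather than a bool, the out-parameter
// carries an approximate relative speed (roughly the access width in bits the
// hardware can service in one go), with 0 meaning the access must be split.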
  if (Op.size() >= 16 &&

  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))

  const MemSDNode *MemNode = cast<MemSDNode>(N);

                                           unsigned DestAS) const {

  return TM.isNoopAddrSpaceCast(SrcAS, DestAS);

  const MemSDNode *MemNode = cast<MemSDNode>(N);
                                               unsigned Index) const {

  std::tie(InputPtrReg, RC, ArgTy) =

      MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

                                         const SDLoc &SL) const {

                                          const SDLoc &SL) const {

  std::optional<uint32_t> KnownSize =

  if (KnownSize.has_value())

    Val = getFPExtOrFPRound(DAG, Val, SL, VT);

SDValue SITargetLowering::lowerKernargMemParameter(

  int64_t OffsetDiff = Offset - AlignDownOffset;

    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);

    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);

  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);

      ExtType, SL, VA.getLocVT(), Chain, FIN,
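// lowerKernargMemParameter (above): kernel arguments narrower than 4 bytes
// are fetched with a dword load at the offset aligned down to 4, then
// shifted/truncated by OffsetDiff, since kernarg segment loads are
// dword-granular.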
2130 Reg = &WorkGroupIDX;
2131 RC = &AMDGPU::SReg_32RegClass;
2135 Reg = &WorkGroupIDY;
2136 RC = &AMDGPU::SReg_32RegClass;
2140 Reg = &WorkGroupIDZ;
2141 RC = &AMDGPU::SReg_32RegClass;
2172 for (
unsigned I = 0, E = Ins.size(), PSInputNum = 0;
I != E; ++
I) {
2176 "vector type argument should have been split");
2181 bool SkipArg = !Arg->
Used && !
Info->isPSInputAllocated(PSInputNum);
2190 "unexpected vector split in ps argument type");
2204 Info->markPSInputAllocated(PSInputNum);
2206 Info->markPSInputEnabled(PSInputNum);
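// Workitem IDs for entry functions arrive in VGPR0..VGPR2. With packed-TID
// hardware all three IDs share one VGPR, so each component is selected with a
// 10-bit mask (0x3ff).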
  if (Info.hasWorkItemIDX()) {

                     Info.hasWorkItemIDY()) ? 0x3ff : ~0u;

  if (Info.hasWorkItemIDY()) {
    unsigned Reg = AMDGPU::VGPR1;

  if (Info.hasWorkItemIDZ()) {
    unsigned Reg = AMDGPU::VGPR2;

  if (RegIdx == ArgVGPRs.size()) {

  unsigned Reg = ArgVGPRs[RegIdx];
  assert(Reg != AMDGPU::NoRegister);

                               unsigned NumArgRegs) {
  if (RegIdx == ArgSGPRs.size())

  unsigned Reg = ArgSGPRs[RegIdx];
  assert(Reg != AMDGPU::NoRegister);

  assert(Reg != AMDGPU::NoRegister);
  const unsigned Mask = 0x3ff;

  if (Info.hasWorkItemIDX()) {
    Info.setWorkItemIDX(Arg);

  if (Info.hasWorkItemIDY()) {
    Info.setWorkItemIDY(Arg);

  if (Info.hasWorkItemIDZ())

  const unsigned Mask = 0x3ff;

  if (Info.hasImplicitArgPtr())

  if (Info.hasWorkGroupIDX())

  if (Info.hasWorkGroupIDY())

  if (Info.hasWorkGroupIDZ())

  if (Info.hasLDSKernelId())

    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);

    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
  unsigned LastExplicitArgOffset =

  bool InPreloadSequence = true;

  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())

    int ArgIdx = Arg.getArgNo();

    if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
                               (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))

    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];

      unsigned ArgOffset = ArgLoc.getLocMemOffset();

      unsigned NumAllocSGPRs =
          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;

      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);

      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;

      if (PaddingSGPRs + NumAllocSGPRs + 1 >
        InPreloadSequence = false;

          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);

          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);

      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {

      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;

  if (Info.hasLDSKernelId()) {

    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
                                            bool IsShader) const {

    assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");

    unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();

    unsigned NumRequiredSystemSGPRs =
        Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
        Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
    for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {

      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {

      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDY()) {

      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDZ()) {

      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupInfo()) {

    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasPrivateSegmentWaveByteOffset()) {

    unsigned PrivateSegmentWaveByteOffsetReg;

      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();

      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {

        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);

      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);

         Info.getNumPreloadedSGPRs() >= 16);
2627 Info.getNumPreloadedSGPRs() >= 16);
2642 if (HasStackObjects)
2643 Info.setHasNonSpillStackObjects(
true);
2648 HasStackObjects =
true;
2652 bool RequiresStackAccess = HasStackObjects || MFI.
hasCalls();
2654 if (!ST.enableFlatScratch()) {
2655 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.
getFunction())) {
2662 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2664 unsigned ReservedBufferReg =
TRI.reservedPrivateSegmentBufferReg(MF);
2674 Info.setScratchRSrcReg(ReservedBufferReg);
2693 if (!
MRI.isLiveIn(AMDGPU::SGPR32)) {
2694 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2701 for (
unsigned Reg : AMDGPU::SGPR_32RegClass) {
2702 if (!
MRI.isLiveIn(Reg)) {
2703 Info.setStackPtrOffsetReg(Reg);
2708 if (
Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2715 if (ST.getFrameLowering()->hasFP(MF)) {
2716 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2722 return !
Info->isEntryFunction();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());

    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;

    Entry->addLiveIn(*I);

    for (auto *Exit : Exits)
              TII->get(TargetOpcode::COPY), *I)
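// LowerFormalArguments (below): entry points (kernels and graphics shaders)
// take their inputs in fixed registers and the kernarg segment, while regular
// functions go through the usual CCState-driven argument assignment.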
        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());

           !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
           !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());

           !Info->hasWorkGroupIDZ());

    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);

    unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
    if ((PsInputBits & 0x7F) == 0 ||
        ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))

  } else if (IsKernel) {

    Splits.append(Ins.begin(), Ins.end());

  } else if (!IsGraphics) {

  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {

    if (IsEntryFunc && VA.isMemLoc()) {

      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {

        int64_t OffsetDiff = Offset - AlignDownOffset;

            Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];

          NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;

          if (PreloadRegs.size() == 1) {
            Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);

                    TRI->getRegSizeInBits(*RC)));

            for (auto Reg : PreloadRegs) {

                                        PreloadRegs.size()),

          NewArg = convertArgType(DAG, VT, MemVT, DL, CMemVT,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

            lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
                                     Alignment, Ins[i].Flags.isSExt(), &Ins[i]);

          dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));

    } else if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);

      if (AMDGPU::VGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::VGPR_32RegClass;
      else if (AMDGPU::SGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::SGPR_32RegClass;

    auto &ArgUsageInfo =

  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain :
  unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
    if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))

  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {

    SDValue Arg = OutVals[RealRVLocIdx];

  if (!Info->isEntryFunction()) {

      if (AMDGPU::SReg_64RegClass.contains(*I))

      else if (AMDGPU::SReg_32RegClass.contains(*I))

  return DAG.getNode(Opc, DL, MVT::Other, RetOps);

  for (unsigned i = 0; i != RVLocs.size(); ++i) {
3288 auto &ArgUsageInfo =
3290 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3319 std::tie(OutgoingArg, ArgRC, ArgTy) =
3327 std::tie(IncomingArg, IncomingArgRC, Ty) =
3329 assert(IncomingArgRC == ArgRC);
3332 EVT ArgVT =
TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3340 InputReg = getImplicitArgPtr(DAG,
DL);
3342 std::optional<uint32_t> Id =
3344 if (Id.has_value()) {
3356 RegsToPass.emplace_back(OutgoingArg->
getRegister(), InputReg);
3360 unsigned SpecialArgOffset =
3374 std::tie(OutgoingArg, ArgRC, Ty) =
3377 std::tie(OutgoingArg, ArgRC, Ty) =
3380 std::tie(OutgoingArg, ArgRC, Ty) =
3395 const bool NeedWorkItemIDX = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-x");
3396 const bool NeedWorkItemIDY = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-y");
3397 const bool NeedWorkItemIDZ = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-z");
3414 InputReg = InputReg.
getNode() ?
3423 InputReg = InputReg.
getNode() ?
3427 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3428 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3438 IncomingArgX ? *IncomingArgX :
3439 IncomingArgY ? *IncomingArgY :
3440 *IncomingArgZ, ~0u);
3447 RegsToPass.emplace_back(OutgoingArg->
getRegister(), InputReg);
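// passSpecialInputs (above): implicit inputs the callee needs (workgroup IDs,
// implicit-arg pointer, workitem IDs) are forwarded from the caller's own
// inputs; the three workitem IDs are packed into a single VGPR
// (X | Y << 10 | Z << 20) before being added to RegsToPass.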
  if (Callee->isDivergent())

  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);

  if (!CallerPreserved)

  bool CCMatch = CallerCC == CalleeCC;

    if (Arg.hasByValAttr())

    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))

    CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

  if (IsChainCallConv) {

    RequestedExec = CLI.Args.back();
    assert(RequestedExec.Node && "No node for EXEC");

      assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
      CLI.Outs.pop_back();

      assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
      CLI.Outs.pop_back();

           "Haven't popped all the pieces of the EXEC mask");

  bool IsSibCall = false;

    for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)

                              "unsupported call to variadic function ");

                                "unsupported required tail call to function ");

        Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);

           "site marked musttail or on llvm.amdgcn.cs.chain");
  if (!TailCallOpt && IsTailCall)

  if (!IsSibCall || IsChainCallConv) {

    RegsToPass.emplace_back(IsChainCallConv
                                ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                                : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,

  MVT PtrVT = MVT::i32;

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {

      RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));

      int32_t Offset = LocMemOffset;

      unsigned OpSize = Flags.isByVal() ?

                              ? Flags.getNonZeroByValAlign()

      if (Outs[i].Flags.isByVal()) {
            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
            Outs[i].Flags.getNonZeroByValAlign(),

            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);

  if (!MemOpChains.empty())

  for (auto &RegToPass : RegsToPass) {
                             RegToPass.second, InGlue);

  if (IsTailCall && !IsSibCall) {

  std::vector<SDValue> Ops;
  Ops.push_back(Chain);
  Ops.push_back(Callee);

  if (IsChainCallConv)
    Ops.push_back(RequestedExec.Node);

  for (auto &RegToPass : RegsToPass) {
                                  RegToPass.second.getValueType()));

  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");

  Ops.push_back(InGlue);

                          DL, MVT::Glue, Token),

    return DAG.getNode(OPC, DL, NodeTys, Ops);

  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);

  uint64_t CalleePopBytes = NumBytes;
  EVT VT = Op.getValueType();

  MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();

  Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize);
  if (Alignment && *Alignment > StackAlign) {

  if (isa<ConstantSDNode>(Size))

  if (Op.getValueType() != MVT::i32)

  assert(Op.getValueType() == MVT::i32);

                             Op.getOperand(0), IntrinID, GetRoundBothImm);

  SDValue RoundModeTimesNumBits =

                                     TableEntry, EnumOffset);

  if (Op->isDivergent())

  switch (cast<MemSDNode>(Op)->getAddressSpace()) {

  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = Src.getValueType();

  EVT DstVT = Op.getValueType();

  if (Op.getValueType() != MVT::i64)

                                  Op.getOperand(0), IntrinID, ModeHwRegImm);
                                  Op.getOperand(0), IntrinID, TrapHwRegImm);

  if (Op.getOperand(1).getValueType() != MVT::i64)

                                        ReadFirstLaneID, NewModeReg);
                                        ReadFirstLaneID, NewTrapReg);

  unsigned ModeHwReg =
  unsigned TrapHwReg =

                                 IntrinID, ModeHwRegImm, NewModeReg);
                                 IntrinID, TrapHwRegImm, NewTrapReg);
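// GET/SET_FPENV above are modeled as paired hardware-register accesses: the
// MODE register plus the trap-status register are read (or written) as two
// 32-bit halves and combined into, or split out of, the i64 FP environment
// value.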
          .Case("m0", AMDGPU::M0)
          .Case("exec", AMDGPU::EXEC)
          .Case("exec_lo", AMDGPU::EXEC_LO)
          .Case("exec_hi", AMDGPU::EXEC_HI)
          .Case("flat_scratch", AMDGPU::FLAT_SCR)
          .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
          .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)

  if (Reg == AMDGPU::NoRegister) {

  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:

  case AMDGPU::FLAT_SCR:

  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
static std::pair<MachineBasicBlock *, MachineBasicBlock *>

  auto Next = std::next(I);

  return std::pair(LoopBB, RemainderBB);

  auto I = MI.getIterator();
  auto E = std::next(I);

    Src->setIsKill(false);

    Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

    BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)

                                 unsigned InitReg, unsigned ResultReg,
                                 unsigned PhiReg, unsigned InitSaveExecReg,
                                 int Offset, bool UseGPRIdxMode,

  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
  Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)

  BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
                                                : AMDGPU::S_AND_SAVEEXEC_B64),

  MRI.setSimpleHint(NewExec, CondReg);

  if (UseGPRIdxMode) {
      SGPRIdxReg = CurrentIdxReg;

      SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)

      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)

      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)

  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

  BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
                                                : AMDGPU::S_XOR_B64_term), Exec)

                                      unsigned InitResultReg,
                                      unsigned PhiReg, int Offset,
                                      bool UseGPRIdxMode, Register &SGPRIdxReg) {

  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);

  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode, SGPRIdxReg);
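// The code above builds a "waterfall" loop for a divergent index: each
// iteration reads one lane's index with V_READFIRSTLANE_B32, enables exactly
// the lanes matching it via V_CMP_EQ + S_AND_SAVEEXEC, performs the indexed
// access, then removes those lanes from EXEC (S_XOR_*_term) and loops until
// every lane has been serviced.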
static std::pair<unsigned, int>

  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;

    return std::pair(AMDGPU::sub0, Offset);

  assert(Idx->getReg() != AMDGPU::NoRegister);

    return Idx->getReg();

  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {

    if (UseGPRIdxMode) {

          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);

    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)

    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)

  MI.eraseFromParent();

  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (Idx->getReg() == AMDGPU::NoRegister) {

    MI.eraseFromParent();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {

    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);

      const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
          TRI.getRegSizeInBits(*VecRC), 32, false);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(VecRC);

                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);

    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)

    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
        TRI.getRegSizeInBits(*VecRC), 32, false);
    BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)

  MI.eraseFromParent();
  bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));

  Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
  Register InitalValReg = MRI.createVirtualRegister(DstRegClass);

  Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
  Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
  Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);

  Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
  Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);

  bool IsWave32 = ST.isWave32();
  unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

      (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;

  BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)

  I = ComputeLoop->end();

  BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)

  BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
      .addReg(TmpSReg->getOperand(0).getReg())

  unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
  auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
                 .addReg(ActiveBits->getOperand(0).getReg());
  auto LaneValue = BuildMI(*ComputeLoop, I, DL,
                           TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
                       .addReg(FF1->getOperand(0).getReg());
  auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
                            .addReg(LaneValue->getOperand(0).getReg());

  unsigned BITSETOpc =
      IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
  auto NewActiveBits =
      BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
          .addReg(FF1->getOperand(0).getReg())
          .addReg(ActiveBits->getOperand(0).getReg());

  Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
      .addMBB(ComputeLoop);
  ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
      .addMBB(ComputeLoop);

  unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;

      .addReg(NewActiveBits->getOperand(0).getReg())

  BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))

  MI.eraseFromParent();
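// Scalar wave reduction (above): the loop walks the live-lane mask with
// S_FF1 to find the next active lane, fetches its value with V_READLANE_B32,
// folds it into the accumulator, clears that bit with S_BITSET0, and
// branches back (S_CBRANCH_SCC1) while any lanes remain.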
  switch (MI.getOpcode()) {
  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:

  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:

  case AMDGPU::S_UADDO_PSEUDO:
  case AMDGPU::S_USUBO_PSEUDO: {

    unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
                       ? AMDGPU::S_ADD_I32
                       : AMDGPU::S_SUB_I32;

    MI.eraseFromParent();

  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO: {

    bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);

      unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

    MI.eraseFromParent();
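  // 64-bit scalar add/sub without a native S_ADD_U64/S_SUB_U64: both operands
  // are split into 32-bit halves and S_ADD_U32/S_ADDC_U32 (or the SUB forms)
  // are chained through SCC.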
  case AMDGPU::V_ADD_U64_PSEUDO:
  case AMDGPU::V_SUB_U64_PSEUDO: {

    bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);

    if (IsAdd && ST.hasLshlAddB64()) {

      TII->legalizeOperands(*Add);
      MI.eraseFromParent();

    const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    Register CarryReg = MRI.createVirtualRegister(CarryRC);
    Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);

                                     : &AMDGPU::VReg_64RegClass;
                                     : &AMDGPU::VReg_64RegClass;

        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);

        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);

    unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;

    unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;

    TII->legalizeOperands(*LoHalf);
    TII->legalizeOperands(*HiHalf);
    MI.eraseFromParent();
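  // VALU 64-bit add/sub: V_ADD_CO_U32 produces the low half plus a per-lane
  // carry mask, which V_ADDC_U32 consumes for the high half (SUB uses the
  // borrow forms); targets with V_LSHL_ADD_U64 fold the add into one
  // instruction instead.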
  case AMDGPU::S_ADD_CO_PSEUDO:
  case AMDGPU::S_SUB_CO_PSEUDO: {

    unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
                       ? AMDGPU::S_ADDC_U32
                       : AMDGPU::S_SUBB_U32;

      Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)

      Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)

    Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)

    unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
    assert(WaveSize == 64 || WaveSize == 32);

    if (WaveSize == 64) {
      if (ST.hasScalarCompareEq64()) {

            TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);

            MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
            MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);

        Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)

        (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;

    MI.eraseFromParent();

  case AMDGPU::SI_INIT_M0: {
            TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .add(MI.getOperand(0));
    MI.eraseFromParent();

  case AMDGPU::GET_GROUPSTATICSIZE: {
        .add(MI.getOperand(0))
    MI.eraseFromParent();

  case AMDGPU::GET_SHADERCYCLESHILO: {

    using namespace AMDGPU::Hwreg;
    Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
    Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));

    Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        .add(MI.getOperand(0))

    MI.eraseFromParent();
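  // GET_SHADERCYCLESHILO reads SHADER_CYCLES_HI, then SHADER_CYCLES, then
  // SHADER_CYCLES_HI again; if the two high reads differ, the low half rolled
  // over between the reads, and the result is selected so the pair stays
  // consistent.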
  case AMDGPU::SI_INDIRECT_SRC_V1:
  case AMDGPU::SI_INDIRECT_SRC_V2:
  case AMDGPU::SI_INDIRECT_SRC_V4:
  case AMDGPU::SI_INDIRECT_SRC_V8:
  case AMDGPU::SI_INDIRECT_SRC_V9:
  case AMDGPU::SI_INDIRECT_SRC_V10:
  case AMDGPU::SI_INDIRECT_SRC_V11:
  case AMDGPU::SI_INDIRECT_SRC_V12:
  case AMDGPU::SI_INDIRECT_SRC_V16:
  case AMDGPU::SI_INDIRECT_SRC_V32:

  case AMDGPU::SI_INDIRECT_DST_V1:
  case AMDGPU::SI_INDIRECT_DST_V2:
  case AMDGPU::SI_INDIRECT_DST_V4:
  case AMDGPU::SI_INDIRECT_DST_V8:
  case AMDGPU::SI_INDIRECT_DST_V9:
  case AMDGPU::SI_INDIRECT_DST_V10:
  case AMDGPU::SI_INDIRECT_DST_V11:
  case AMDGPU::SI_INDIRECT_DST_V12:
  case AMDGPU::SI_INDIRECT_DST_V16:
  case AMDGPU::SI_INDIRECT_DST_V32:

  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
  case AMDGPU::SI_KILL_I1_PSEUDO:

  case AMDGPU::V_CNDMASK_B64_PSEUDO: {

    Register SrcCond = MI.getOperand(3).getReg();

    Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
    Register SrcCondCopy = MRI.createVirtualRegister(CondRC);

                                     : &AMDGPU::VReg_64RegClass;
                                     : &AMDGPU::VReg_64RegClass;

        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);

        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);

    MI.eraseFromParent();

  case AMDGPU::SI_BR_UNDEF: {
        .add(MI.getOperand(0));
    MI.eraseFromParent();

  case AMDGPU::ADJCALLSTACKUP:
  case AMDGPU::ADJCALLSTACKDOWN: {

  case AMDGPU::SI_CALL_ISEL: {

    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);

    MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);

    MI.eraseFromParent();

  case AMDGPU::V_ADD_CO_U32_e32:
  case AMDGPU::V_SUB_CO_U32_e32:
  case AMDGPU::V_SUBREV_CO_U32_e32: {

    unsigned Opc = MI.getOpcode();

    bool NeedClampOperand = false;
    if (TII->pseudoToMCOpcode(Opc) == -1) {
      NeedClampOperand = true;

    if (TII->isVOP3(*I)) {

    I.add(MI.getOperand(1))
        .add(MI.getOperand(2));
    if (NeedClampOperand)

    TII->legalizeOperands(*I);

    MI.eraseFromParent();

  case AMDGPU::V_ADDC_U32_e32:
  case AMDGPU::V_SUBB_U32_e32:
  case AMDGPU::V_SUBBREV_U32_e32:

    TII->legalizeOperands(MI);

  case AMDGPU::DS_GWS_INIT:
  case AMDGPU::DS_GWS_SEMA_BR:
  case AMDGPU::DS_GWS_BARRIER:
    TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);

  case AMDGPU::DS_GWS_SEMA_V:
  case AMDGPU::DS_GWS_SEMA_P:
  case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:

  case AMDGPU::S_SETREG_B32: {

    const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
    const unsigned SetMask = WidthMask << Offset;

    unsigned SetDenormOp = 0;
    unsigned SetRoundOp = 0;

      SetRoundOp = AMDGPU::S_ROUND_MODE;
      SetDenormOp = AMDGPU::S_DENORM_MODE;

      SetRoundOp = AMDGPU::S_ROUND_MODE;

      SetDenormOp = AMDGPU::S_DENORM_MODE;

    if (SetRoundOp || SetDenormOp) {

      if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
        unsigned ImmVal = Def->getOperand(1).getImm();

        MI.eraseFromParent();

    MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));

  case AMDGPU::S_INVERSE_BALLOT_U32:
  case AMDGPU::S_INVERSE_BALLOT_U64: {

    const Register DstReg = MI.getOperand(0).getReg();
    Register MaskReg = MI.getOperand(1).getReg();

    const bool IsVALU = TRI->isVectorRegister(MRI, MaskReg);

      MaskReg = TII->readlaneVGPRToSGPR(MaskReg, MI, MRI);

    MI.eraseFromParent();

  case AMDGPU::ENDPGM_TRAP: {

      MI.setDesc(TII->get(AMDGPU::S_ENDPGM));

    MI.eraseFromParent();
  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;

  EVT VT = N->getValueType(0);

  if (VT == MVT::f16) {

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
         VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
         VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
         VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
         VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
         VT == MVT::v32bf16);

                      : std::pair(Op0, Op0);
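// The three assert blocks above belong to the splitUnary/Binary/Ternary
// vector-op helpers: wide vector operations on these types are lowered by
// splitting each operand in half, emitting the op on both halves, and
// concatenating the results.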
  switch (Op.getOpcode()) {

    assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");

    EVT VT = Op.getValueType();

      return lowerFSQRTF32(Op, DAG);

      return lowerFSQRTF64(Op, DAG);

    return LowerTrig(Op, DAG);

    return LowerGlobalAddress(MFI, Op, DAG);

    return lowerINSERT_SUBVECTOR(Op, DAG);

    return lowerINSERT_VECTOR_ELT(Op, DAG);

    return lowerEXTRACT_VECTOR_ELT(Op, DAG);

    return lowerVECTOR_SHUFFLE(Op, DAG);

    return lowerSCALAR_TO_VECTOR(Op, DAG);

    return lowerBUILD_VECTOR(Op, DAG);

    return lowerFP_ROUND(Op, DAG);

    if (Op.getOperand(0)->getValueType(0) != MVT::f32)

    int RoundMode = Op.getConstantOperandVal(1);

    return DAG.getNode(Opc, DL, Op.getNode()->getVTList(), Op->getOperand(0));

    return lowerTRAP(Op, DAG);

    return lowerDEBUGTRAP(Op, DAG);

    return lowerFMINNUM_FMAXNUM(Op, DAG);

    return lowerFLDEXP(Op, DAG);

    return lowerMUL(Op, DAG);

    return lowerXMULO(Op, DAG);

    return lowerXMUL_LOHI(Op, DAG);
  EVT FittingLoadVT = LoadVT;

SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,

                                              bool IsIntrinsic) const {

  EVT LoadVT = M->getValueType(0);

  EVT EquivLoadVT = LoadVT;

                               VTList, Ops, M->getMemoryVT(),
                               M->getMemOperand());

  EVT LoadVT = M->getValueType(0);

  assert(M->getNumValues() == 2 || M->getNumValues() == 3);
  bool IsTFE = M->getNumValues() == 3;

    return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand());

    return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
                               M->getMemOperand(), DAG);

  SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
                                        M->getMemOperand(), DAG);
  EVT VT = N->getValueType(0);
  unsigned CondCode = N->getConstantOperandVal(3);

  EVT CmpVT = LHS.getValueType();
  if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {

  EVT VT = N->getValueType(0);

  unsigned CondCode = N->getConstantOperandVal(3);

  if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {

  EVT VT = N->getValueType(0);

      Src.getOperand(1), Src.getOperand(2));

    Exec = AMDGPU::EXEC_LO;

    Exec = AMDGPU::EXEC;
  switch (N->getOpcode()) {

    unsigned IID = N->getConstantOperandVal(0);

    case Intrinsic::amdgcn_make_buffer_rsrc:
      Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));

    case Intrinsic::amdgcn_cvt_pkrtz: {

    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16: {

      if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)

      else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)

      else if (IID == Intrinsic::amdgcn_cvt_pk_i16)

      EVT VT = N->getValueType(0);

    case Intrinsic::amdgcn_s_buffer_load: {

      EVT VT = Op.getValueType();
      assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");

      if (!Offset->isDivergent()) {

      LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);

      for (unsigned I = 0; I < Res.getNumOperands(); I++) {
        Results.push_back(Res.getOperand(I));

      Results.push_back(Res.getValue(1));

    EVT VT = N->getValueType(0);

    EVT SelectVT = NewVT;
    if (NewVT.bitsLT(MVT::i32)) {
      SelectVT = MVT::i32;

    if (NewVT != SelectVT)

    if (N->getValueType(0) != MVT::v2f16)

    if (N->getValueType(0) != MVT::v2f16)

    if (N->getValueType(0) != MVT::f16)
unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {

  switch (Intr->getConstantOperandVal(1)) {
  case Intrinsic::amdgcn_if:

  case Intrinsic::amdgcn_else:

  case Intrinsic::amdgcn_loop:

  case Intrinsic::amdgcn_end_cf:

  SDNode *Intr = BRCOND.getOperand(1).getNode();

  assert(BR && "brcond missing unconditional branch user");
  Target = BR->getOperand(1);

  unsigned CFNode = isCFIntrinsic(Intr);

  Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());

  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {

                  Intr->getOperand(0));
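// LowerBRCOND (above) rewrites branches whose condition comes from the
// structurizer's control-flow intrinsics (amdgcn.if/else/loop/end.cf) into
// the corresponding SI branch nodes that manipulate EXEC, rather than leaving
// a plain conditional branch.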
  MVT VT = Op.getSimpleValueType();

  if (Op.getConstantOperandVal(0) != 0)

  if (Info->isEntryFunction())

  return Op.getValueType().bitsLE(VT) ?

  assert(Op.getValueType() == MVT::f16 &&
         "Do not know how to custom lower FP_ROUND for non-f16 type");

  EVT SrcVT = Src.getValueType();
  if (SrcVT != MVT::f64)

  EVT VT = Op.getValueType();

  bool IsIEEEMode = Info->getMode().IEEE;

  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||

  EVT VT = Op.getValueType();

  EVT ExpVT = Exp.getValueType();
  if (ExpVT == MVT::i16)

                     {Op.getOperand(0), Op.getOperand(1), TruncExp});
  EVT VT = Op.getValueType();

  assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");

  if (Op->isDivergent())

  if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
    return SDValue(
        DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);

  if (Op0SignBits >= 33 && Op1SignBits >= 33)
    return SDValue(
        DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);

  EVT VT = Op.getValueType();

    const APInt &C = RHSC->getAPIntValue();

    if (C.isPowerOf2()) {

      bool UseArithShift = isSigned && !C.isMinSignedValue();

                                    SL, VT, Result, ShiftAmt),

  if (Op->isDivergent()) {

    return lowerTrapEndpgm(Op, DAG);

             lowerTrapHsaQueuePtr(Op, DAG);
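// 64-bit uniform multiplies (above): when both operands are known to fit in
// 32 bits (at least 32 leading zeros, or 33 sign bits for the signed case),
// the single-instruction S_MUL_U64_U32 / S_MUL_I64_I32 pseudos are selected
// instead of a full 64-bit multiply.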
SDValue SITargetLowering::lowerTrapEndpgm(

                                                const SDLoc &DL, Align Alignment,
                                                ImplicitParameter Param) const {

SDValue SITargetLowering::lowerTrapHsaQueuePtr(

        loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);

    if (UserSGPR == AMDGPU::NoRegister) {

SDValue SITargetLowering::lowerTrapHsa(

        "debugtrap handler not supported",

SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,

                               ? AMDGPU::SRC_SHARED_BASE
                               : AMDGPU::SRC_PRIVATE_BASE;

        {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));

    return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);

  if (UserSGPR == AMDGPU::NoRegister) {

      DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);

  if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
      isa<BasicBlockSDNode>(Val))

  if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
    return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);

  unsigned DestAS, SrcAS;

  bool IsNonNull = false;
  if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
    SrcAS = ASC->getSrcAddressSpace();
    Src = ASC->getOperand(0);
    DestAS = ASC->getDestAddressSpace();

           Op.getConstantOperandVal(0) ==
               Intrinsic::amdgcn_addrspacecast_nonnull);
    Src = Op->getOperand(1);
    SrcAS = Op->getConstantOperandVal(2);
    DestAS = Op->getConstantOperandVal(3);

    unsigned NullVal = TM.getNullPointerValue(DestAS);

    SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);

    unsigned NullVal = TM.getNullPointerValue(SrcAS);

      Op.getValueType() == MVT::i64) {

      Src.getValueType() == MVT::i64)
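// Address-space casts between flat and local/private (above) compare the
// source against the segment's null value and build the flat pointer from the
// aperture base (SRC_SHARED_BASE / SRC_PRIVATE_BASE); the
// addrspacecast_nonnull intrinsic lets the null check be skipped.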
  EVT InsVT = Ins.getValueType();

  unsigned IdxVal = Idx->getAsZExtVal();

  assert(InsNumElts % 2 == 0 && "expect legal vector types");

  EVT NewInsVT = InsNumElts == 2 ? MVT::i32
                                 : EVT::getVectorVT(*DAG.getContext(),
                                                    MVT::i32, InsNumElts / 2);

    for (unsigned I = 0; I != InsNumElts / 2; ++I) {

      if (InsNumElts == 2) {

  for (unsigned I = 0; I != InsNumElts; ++I) {

  auto KIdx = dyn_cast<ConstantSDNode>(Idx);
  if (NumElts == 4 && EltSize == 16 && KIdx) {

    unsigned Idx = KIdx->getZExtValue();
    bool InsertLo = Idx < 2;

                                 InsertLo ? LoVec : HiVec,

  if (isa<ConstantSDNode>(Idx))

  assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");

  const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);

                             DAG.getNOT(SL, BFM, IntVT), BCVec);
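// Dynamic insertelement on small (<= 64-bit) vectors: the vector is bitcast
// to an integer, a mask for the selected element is built (roughly
// EltMask shifted to the element's bit position), and the shifted scalar is
// bit-selected into place; this avoids a waterfall loop over lanes.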
  EVT ResultVT = Op.getValueType();

  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))

  if (VecSize == 128 || VecSize == 256 || VecSize == 512) {

    if (VecSize == 128) {

    } else if (VecSize == 256) {

      for (unsigned P = 0; P < 4; ++P) {

                            Parts[0], Parts[1]));
                            Parts[2], Parts[3]));

      for (unsigned P = 0; P < 8; ++P) {

                            Parts[0], Parts[1], Parts[2], Parts[3]));
                            Parts[4], Parts[5], Parts[6], Parts[7]));

    EVT IdxVT = Idx.getValueType();

    Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);

  if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {

    return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);

  EVT ResultVT = Op.getValueType();

  EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;

  int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();

    int VecIdx = Idx < SrcNumElts ? 0 : 1;
    int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;

    int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
    int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
    int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
    int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
  EVT ResultVT = Op.getValueType();

  EVT VT = Op.getValueType();

  if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
      VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {

                       {CastLo, CastHi});

  if (VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16) {

      for (unsigned P = 0; P < 4; ++P)
        Parts[P].push_back(Op.getOperand(I + P * E));

    for (unsigned P = 0; P < 4; ++P) {

  if (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v32bf16) {

      for (unsigned P = 0; P < 8; ++P)
        Parts[P].push_back(Op.getOperand(I + P * E));

    for (unsigned P = 0; P < 8; ++P) {

  assert(VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16);

  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");

  EVT PtrVT = Op.getValueType();

  assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");

  SDValue Param = lowerKernargMemParameter(

                                "non-hsa intrinsic with hsa target",

                                "intrinsic not supported on subtarget",

  unsigned NumElts = Elts.size();

  if (NumElts <= 12) {

  for (unsigned i = 0; i < Elts.size(); ++i) {

  for (unsigned i = Elts.size(); i < NumElts; ++i)
    VecElts[i] = DAG.getUNDEF(MVT::f32);
  EVT SrcVT = Src.getValueType();

                              bool Unpacked, bool IsD16, int DMaskPop,
                              int NumVDataDwords, bool IsAtomicPacked16Bit,

  EVT ReqRetVT = ResultTypes[0];

  int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
                          ? (ReqRetNumElts + 1) / 2

  int MaskPopDwords = (!IsD16 || (IsD16 && Unpacked)) ? DMaskPop
                                                      : (DMaskPop + 1) / 2;

  MVT DataDwordVT = NumDataDwords == 1 ?

  MVT MaskPopVT = MaskPopDwords == 1 ?

  if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {

  if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
                          NumDataDwords - MaskPopDwords);

  EVT LegalReqRetVT = ReqRetVT;

    if (!Data.getValueType().isInteger())
                         Data.getValueType().changeTypeToInteger(), Data);

  if (Result->getNumValues() == 1)

                        SDValue *LWE, bool &IsTexFail) {

  auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());

                                      unsigned DimIdx, unsigned EndIdx,
                                      unsigned NumGradients) {

  for (unsigned I = DimIdx; I < EndIdx; I++) {

    if (((I + 1) >= EndIdx) ||
        ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
                                         I == DimIdx + NumGradients - 1))) {
      if (Addr.getValueType() != MVT::i16)
  unsigned IntrOpcode = Intr->BaseOpcode;

  bool AdjustRetType = false;
  bool IsAtomicPacked16Bit = false;

  const unsigned ArgOffset = WithChain ? 2 : 1;

  unsigned DMaskLanes = 0;

  if (BaseOpcode->Atomic) {
    VData = Op.getOperand(2);

    IsAtomicPacked16Bit =
        (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
         Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);

    if (BaseOpcode->AtomicX2) {

      ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
      DMask = Is64Bit ? 0xf : 0x3;
      NumVDataDwords = Is64Bit ? 4 : 2;

      DMask = Is64Bit ? 0x3 : 0x1;
      NumVDataDwords = Is64Bit ? 2 : 1;

    DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);

    if (BaseOpcode->Store) {
      VData = Op.getOperand(2);

        VData = handleD16VData(VData, DAG, true);

          (!LoadVT.isVector() && DMaskLanes > 1))

          NumVDataDwords = (DMaskLanes + 1) / 2;

          NumVDataDwords = DMaskLanes;

        AdjustRetType = true;

  unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;

        Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();

    MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
    IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;

    VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();

    MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
    IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;

  for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
    if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
      assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");

                     {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});

             "Bias needs to be converted to 16 bit in A16 mode");

  if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {

        dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
                  "require 16 bit args for both gradients and addresses");

    if (!ST->hasA16()) {
      LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
                           "support 16 bit addresses\n");
7738 if (BaseOpcode->Gradients && IsG16 &&
ST->hasG16()) {
7742 IntrOpcode = G16MappingInfo->
G16;
7750 ArgOffset +
Intr->GradientStart,
7751 ArgOffset +
Intr->CoordStart,
Intr->NumGradients);
7753 for (
unsigned I = ArgOffset +
Intr->GradientStart;
7754 I < ArgOffset + Intr->CoordStart;
I++)
7761 ArgOffset +
Intr->CoordStart, VAddrEnd,
7765 for (
unsigned I = ArgOffset +
Intr->CoordStart;
I < VAddrEnd;
I++)
  const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
  const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
  const bool UseNSA = ST->hasNSAEncoding() &&
                      VAddrs.size() >= ST->getNSAThreshold(MF) &&
                      (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
  const bool UsePartialNSA =
      UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;

  SDValue VAddr;
  if (UsePartialNSA) {
    VAddr = getBuildDwordsVector(DAG, DL,
                                 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
  } else if (!UseNSA) {
    VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
  }
  if (!BaseOpcode->Sampler) {
    Unorm = True;
  } else {
    uint64_t UnormConst =
        Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
    Unorm = UnormConst ? True : False;
  }
  // ...
  SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
  bool IsTexFail = false;
  if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
    return Op;

  if (IsTexFail) {
    // ... an extra dword is returned for the TFE/LWE status.
    NumVDataDwords += 1;
    AdjustRetType = true;
  }
  // ...
  if (AdjustRetType) {
    // ...
    if (DMaskLanes == 0 && !BaseOpcode->Store) {
      // This is a no-op load and can be eliminated.
      // ...
      if (isa<MemSDNode>(Op))
        // ...
    }

    EVT NewVT = NumVDataDwords > 1
                    ? EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                       NumVDataDwords)
                    : MVT::i32;
    ResultTypes[0] = NewVT;
    if (ResultTypes.size() == 3) {
      // Original result was an aggregate type used for TexFailCtrl results;
      // the instruction itself returns the vector type created above.
      ResultTypes.erase(&ResultTypes[1]);
    }
  }

  unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
  if (BaseOpcode->Atomic)
    // ...
  // ...
  if (BaseOpcode->Store || BaseOpcode->Atomic)
    Ops.push_back(VData); // vdata
  if (UsePartialNSA) {
    // ... leading addresses scattered, tail in VAddr.
  }
  // ...
  if (BaseOpcode->Sampler)
    Ops.push_back(Op.getOperand(ArgOffset + Intr->SampIndex));
  // ...
  if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
    Ops.push_back(Unorm);
  // ...
  Ops.push_back(IsA16 && // r128, a16 for gfx9
                        ST->hasFeature(AMDGPU::FeatureR128A16)
                    ? True
                    : False);
  // ...
  if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
    Ops.push_back(LWE); // lwe
  // ...
  if (BaseOpcode->HasD16)
    Ops.push_back(IsD16 ? True : False);
  if (isa<MemSDNode>(Op))
    Ops.push_back(Op.getOperand(0)); // chain

  int NumVAddrDwords =
      UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
  int Opcode = -1;

  if (IsGFX12Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX11Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx11NSA
                                          : AMDGPU::MIMGEncGfx11Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX10Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx10NSA
                                          : AMDGPU::MIMGEncGfx10Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    // ...
    if (Opcode == -1 /* ... */)
      report_fatal_error(
          "requested image instruction is not supported on this GPU");
    // ... fall back through the older encodings:
    //     AMDGPU::getMIMGOpcode(IntrOpcode, ..., NumVDataDwords,
    //                           NumVAddrDwords);
  }
  // ...
  if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
    // ... attach the memory operand to the new machine node.
  }

  if (BaseOpcode->AtomicX2) {
    // ... return only the low half plus the chain.
  }
  if (BaseOpcode->Store)
    return SDValue(NewNode, 0);
  return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
                           Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
                           NumVDataDwords, IsAtomicPacked16Bit, DL);
}
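// lowerSBuffer: an s_buffer_load with a uniform (SGPR) offset can be selected
// directly; a divergent offset has to fall back to MUBUF loads. Results wider
// than four dwords are emitted as several loads and recombined.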
  if (!Offset->isDivergent()) {
    // ...
  }
  // ...
  // Sub-dword result types go through the byte/short buffer-load helper.
  return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
  // ...
  unsigned NumLoads = 1;
  // ...
  if (NumElts == 8 || NumElts == 16) {
    NumLoads = NumElts / 4;
    // ...
  }
  // ...
  setBufferOffsets(Offset, DAG, &Ops[3],
                   NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
  // ...
  for (unsigned i = 0; i < NumLoads; ++i) {
    // ...
  }

  if (NumElts == 8 || NumElts == 16)
    // ... concatenate the partial results.
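// LowerINTRINSIC_WO_CHAIN expands the readnone AMDGPU intrinsics. Most cases
// either forward a preloaded SGPR input (workgroup ids, dispatch/queue
// pointers), read an implicit kernel argument from the kernarg segment, or
// map directly onto an AMDGPUISD node.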
SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                  SelectionDAG &DAG) const {
  // ...
  EVT VT = Op.getValueType();
  // ...
  unsigned IntrinsicID = Op.getConstantOperandVal(0);
  // ...
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_implicit_buffer_ptr: {
    // ...
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  }
  case Intrinsic::amdgcn_dispatch_ptr:
  case Intrinsic::amdgcn_queue_ptr: {
    // ... diagnose MF.getFunction():
    //     "unsupported hsa intrinsic without hsa target"
    // ...
    auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
                     ? AMDGPUFunctionArgInfo::DISPATCH_PTR
                     : AMDGPUFunctionArgInfo::QUEUE_PTR;
    return getPreloadedValue(DAG, *MFI, VT, RegID);
  }
  case Intrinsic::amdgcn_implicitarg_ptr: {
    if (MFI->isEntryFunction())
      return getImplicitArgPtr(DAG, DL);
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr: {
    // ...
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  }
  case Intrinsic::amdgcn_dispatch_id: {
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::DISPATCH_ID);
  }
  case Intrinsic::amdgcn_rcp:
    return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_rsq:
    return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_rsq_legacy:
    // ...
  case Intrinsic::amdgcn_rcp_legacy:
    // ...
  case Intrinsic::amdgcn_rsq_clamp: {
    // ...
  }
  case Intrinsic::r600_read_ngroups_x:
    // ...
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::NGROUPS_X,
                                    Align(4), false);
  case Intrinsic::r600_read_ngroups_y:
    // ...
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::NGROUPS_Y,
                                    Align(4), false);
  case Intrinsic::r600_read_ngroups_z:
    // ...
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::NGROUPS_Z,
                                    Align(4), false);
  case Intrinsic::r600_read_global_size_x:
    // ...
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::GLOBAL_SIZE_X,
                                    Align(4), false);
  case Intrinsic::r600_read_global_size_y:
    // ...
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::GLOBAL_SIZE_Y,
                                    Align(4), false);
  case Intrinsic::r600_read_global_size_z:
    // ...
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::GLOBAL_SIZE_Z,
                                    Align(4), false);
  case Intrinsic::r600_read_local_size_x:
    // ...
    return lowerImplicitZextParam(DAG, Op, MVT::i16,
                                  SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    // ...
    return lowerImplicitZextParam(DAG, Op, MVT::i16,
                                  SI::KernelInputOffsets::LOCAL_SIZE_Y);
  case Intrinsic::r600_read_local_size_z:
    // ...
    return lowerImplicitZextParam(DAG, Op, MVT::i16,
                                  SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_wave_id:
    return lowerWaveID(DAG, Op);
  case Intrinsic::amdgcn_lds_kernel_id: {
    if (MFI->isEntryFunction())
      return getLDSKernelId(DAG, DL);
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
  }
  case Intrinsic::amdgcn_workitem_id_x:
    return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
  case Intrinsic::amdgcn_workitem_id_y:
    return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
  case Intrinsic::amdgcn_workitem_id_z:
    return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
  case Intrinsic::amdgcn_wavefrontsize:
    // ...
  case Intrinsic::amdgcn_s_buffer_load: {
    unsigned CPol = Op.getConstantOperandVal(3);
    // ...
    return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
                        Op.getOperand(3), DAG);
  }
  case Intrinsic::amdgcn_fdiv_fast:
    return lowerFDIV_FAST(Op, DAG);
  case Intrinsic::amdgcn_sin:
    return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_cos:
    return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_mul_u24:
    return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
                       Op.getOperand(2));
  case Intrinsic::amdgcn_mul_i24:
    return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
                       Op.getOperand(2));
  case Intrinsic::amdgcn_log_clamp: {
    // ...
  }
  case Intrinsic::amdgcn_fract:
    return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_class:
    return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
                       Op.getOperand(2));
  case Intrinsic::amdgcn_div_fmas:
    return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
  case Intrinsic::amdgcn_div_fixup:
    return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_div_scale: {
    // ...
    SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
    return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
                       Denominator, Numerator);
  }
  case Intrinsic::amdgcn_icmp: {
    // There is a Pat that handles this variant, so return it as-is.
    if (Op.getOperand(1).getValueType() == MVT::i1 &&
        Op.getConstantOperandVal(2) == 0 &&
        /* ... */)
      return Op;
    return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
  }
  case Intrinsic::amdgcn_fcmp: {
    return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
  }
  case Intrinsic::amdgcn_ballot:
    return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
  case Intrinsic::amdgcn_fmed3:
    return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_fdot2:
    return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
  case Intrinsic::amdgcn_fmul_legacy:
    return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
                       Op.getOperand(2));
  case Intrinsic::amdgcn_sffbh:
    return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_sbfe:
    return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_ubfe:
    return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_cvt_pkrtz:
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    // ...
    EVT VT = Op.getValueType();
    unsigned Opcode;

    if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
      Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
      Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
      Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
      Opcode = AMDGPUISD::CVT_PK_I16_I32;
    else
      Opcode = AMDGPUISD::CVT_PK_U16_U32;

    if (isTypeLegal(VT))
      return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));

    // ... emit as i32 and bitcast back:
    //     DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1),
    //                 Op.getOperand(2));
  }
  case Intrinsic::amdgcn_fmad_ftz:
    return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_if_break:
    return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
                                      Op->getOperand(1), Op->getOperand(2)),
                   0);
  case Intrinsic::amdgcn_groupstaticsize: {
    // ...
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    // ...
    unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
                      ? AMDGPUAS::LOCAL_ADDRESS
                      : AMDGPUAS::PRIVATE_ADDRESS;
    SDValue Aperture = getSegmentAperture(AS, SL, DAG);
    // ...
  }
  case Intrinsic::amdgcn_perm:
    return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_reloc_constant: {
    // ...
    auto RelocSymbol = cast<GlobalVariable>(
        /* ... */);
    // ...
  }
  case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
    if (Op.getOperand(4).getValueType() == MVT::i32)
      return SDValue();
    // ... widen the index key to i32:
    return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
                       Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
                       Op.getOperand(3), IndexKeyi32);
  }
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
    if (Op.getOperand(6).getValueType() == MVT::i32)
      return SDValue();
    // ... widen the index key to i32:
    return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
                       {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
                        Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
                        IndexKeyi32, Op.getOperand(7)});
  }
  case Intrinsic::amdgcn_addrspacecast_nonnull:
    return lowerADDRSPACECAST(Op, DAG);
  default:
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
      return lowerImage(Op, ImageDimIntr, DAG, false);
    return Op;
  }
}

// On targets that do not support a constant in the soffset field, a zero
// soffset is replaced with SGPR_NULL to avoid an extra s_mov of zero.
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
                             const GCNSubtarget *Subtarget) {
  // ...
  return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
  // ...
}
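// lowerRawBufferAtomicIntrin / lowerStructBufferAtomicIntrin share the
// lowering for all raw/struct buffer atomics: split the combined offset into
// voffset plus an immediate, turn a buffer fat pointer into a v4i32 rsrc, and
// emit the memory-intrinsic node for the given AMDGPUISD atomic opcode. The
// struct variants differ only in carrying a vindex operand.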
SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
                                                     SelectionDAG &DAG,
                                                     unsigned NewOpcode) const {
  SDLoc DL(Op);

  SDValue VData = Op.getOperand(2);
  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
  auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
  // ...
  auto *M = cast<MemSDNode>(Op);
  // ...
  return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops,
                                 M->getMemoryVT(), M->getMemOperand());
}
SDValue
SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
                                                unsigned NewOpcode) const {
  SDLoc DL(Op);

  SDValue VData = Op.getOperand(2);
  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
  auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
  // ...
  auto *M = cast<MemSDNode>(Op);
  // ...
  return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops,
                                 M->getMemoryVT(), M->getMemOperand());
}
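// LowerINTRINSIC_W_CHAIN handles the intrinsics that read memory and so carry
// a chain: DS ordered counts, DS/buffer/flat atomics, the various buffer and
// tbuffer load flavours, BVH ray intersection, and barrier-state queries.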
SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
                                                 SelectionDAG &DAG) const {
  unsigned IntrID = Op.getConstantOperandVal(1);
  SDLoc DL(Op);

  switch (IntrID) {
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    // ...
    unsigned IndexOperand = M->getConstantOperandVal(7);
    unsigned WaveRelease = M->getConstantOperandVal(8);
    unsigned WaveDone = M->getConstantOperandVal(9);

    unsigned OrderedCountIndex = IndexOperand & 0x3f;
    IndexOperand &= ~0x3f;
    unsigned CountDw = 0;

    if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
      CountDw = (IndexOperand >> 24) & 0xf;
      IndexOperand &= ~(0xf << 24);

      if (CountDw < 1 || CountDw > 4) {
        report_fatal_error(
            "ds_ordered_count: dword count must be between 1 and 4");
      }
    }
    // ...
    if (WaveDone && !WaveRelease)
      report_fatal_error("ds_ordered_count: wave_done requires wave_release");
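    // The offset field of the DS_ORDERED_COUNT pseudo packs several subfields:
    // the ordered-count index in the low byte, then wave_release/wave_done,
    // the shader type (pre-GFX11), the add-vs-swap selector, and on GFX10+
    // the dword count.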
    unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
    unsigned ShaderType =
        SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
    unsigned Offset0 = OrderedCountIndex << 2;
    unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);

    if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
      Offset1 |= (CountDw - 1) << 6;

    if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
      Offset1 |= ShaderType << 2;

    unsigned Offset = Offset0 | (Offset1 << 8);
    // ...
    return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
                                   M->getVTList(), Ops, M->getMemoryVT(),
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_ds_fadd: {
    // ...
    switch (IntrID) {
    case Intrinsic::amdgcn_ds_fadd:
      Opc = ISD::ATOMIC_LOAD_FADD;
      break;
    }
    return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(), M->getOperand(0),
                         M->getOperand(2), M->getOperand(3),
                         M->getMemOperand());
  }
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    // ...
    switch (IntrID) {
    case Intrinsic::amdgcn_ds_fmin:
      // ...
    case Intrinsic::amdgcn_ds_fmax:
      // ...
    }
    // ...
    return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_buffer_load:
  case Intrinsic::amdgcn_buffer_load_format: {
    unsigned Glc = Op.getConstantOperandVal(5);
    unsigned Slc = Op.getConstantOperandVal(6);
    // ...
    setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
    // ...
    unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load)
                       ? AMDGPUISD::BUFFER_LOAD
                       : AMDGPUISD::BUFFER_LOAD_FORMAT;

    EVT VT = Op.getValueType();
    EVT IntVT = VT.changeTypeToInteger();
    auto *M = cast<MemSDNode>(Op);
    EVT LoadVT = Op.getValueType();
    // ...
    if (LoadVT.getScalarType() == MVT::i8 ||
        LoadVT.getScalarType() == MVT::i16)
      return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops,
                                        M->getMemOperand());
    return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
                               M->getMemOperand(), DAG);
  }
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
    const bool IsFormat =
        IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
        IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;

    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
    // ...
    auto *M = cast<MemSDNode>(Op);
    return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
  }
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format: {
    const bool IsFormat =
        IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
        IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;

    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
    // ...
    return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
  }
  case Intrinsic::amdgcn_tbuffer_load: {
    // ...
    EVT LoadVT = Op.getValueType();
    // ...
    unsigned Dfmt = Op.getConstantOperandVal(7);
    unsigned Nfmt = Op.getConstantOperandVal(8);
    unsigned Glc = Op.getConstantOperandVal(9);
    unsigned Slc = Op.getConstantOperandVal(10);
    // ...
    return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, LoadVT,
                               M->getMemOperand(), DAG);
  }
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
    // ...
    EVT LoadVT = Op.getValueType();
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
    // ...
    return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, LoadVT,
                               M->getMemOperand(), DAG);
  }
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
    // ...
    EVT LoadVT = Op.getValueType();
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
    // ...
    return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, LoadVT,
                               M->getMemOperand(), DAG);
  }
  case Intrinsic::amdgcn_buffer_atomic_swap:
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_buffer_atomic_csub:
  case Intrinsic::amdgcn_buffer_atomic_smin:
  case Intrinsic::amdgcn_buffer_atomic_umin:
  case Intrinsic::amdgcn_buffer_atomic_smax:
  case Intrinsic::amdgcn_buffer_atomic_umax:
  case Intrinsic::amdgcn_buffer_atomic_and:
  case Intrinsic::amdgcn_buffer_atomic_or:
  case Intrinsic::amdgcn_buffer_atomic_xor:
  case Intrinsic::amdgcn_buffer_atomic_fadd: {
    unsigned Slc = Op.getConstantOperandVal(6);
    // ...
    setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
    // ...
    EVT VT = Op.getValueType();
    auto *M = cast<MemSDNode>(Op);
    unsigned Opcode = 0;

    switch (IntrID) {
    case Intrinsic::amdgcn_buffer_atomic_swap:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
      break;
    case Intrinsic::amdgcn_buffer_atomic_add:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
      break;
    case Intrinsic::amdgcn_buffer_atomic_sub:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
      break;
    case Intrinsic::amdgcn_buffer_atomic_csub:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_CSUB;
      break;
    case Intrinsic::amdgcn_buffer_atomic_smin:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
      break;
    case Intrinsic::amdgcn_buffer_atomic_umin:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
      break;
    case Intrinsic::amdgcn_buffer_atomic_smax:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
      break;
    case Intrinsic::amdgcn_buffer_atomic_umax:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
      break;
    case Intrinsic::amdgcn_buffer_atomic_and:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
      break;
    case Intrinsic::amdgcn_buffer_atomic_or:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
      break;
    case Intrinsic::amdgcn_buffer_atomic_xor:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
      break;
    case Intrinsic::amdgcn_buffer_atomic_fadd:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_FADD;
      break;
    }

    return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
    return lowerRawBufferAtomicIntrin(Op, DAG,
                                      AMDGPUISD::BUFFER_ATOMIC_FADD_BF16);
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_FADD);
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_FADD_BF16);
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_FMIN);
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_FMAX);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
    return lowerRawBufferAtomicIntrin(Op, DAG,
                                      AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_SWAP);
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_ADD);
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_SUB);
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_SMIN);
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_UMIN);
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_SMAX);
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_UMAX);
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_AND);
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_XOR);
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_INC);
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_DEC);
  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
    return lowerStructBufferAtomicIntrin(
        Op, DAG, AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
  case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
    unsigned Slc = Op.getConstantOperandVal(7);
    // ...
    setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
    // ...
    EVT VT = Op.getValueType();
    auto *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
                                   Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
    // ...
    EVT VT = Op.getValueType();
    auto *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
                                   Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
    SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
    // ...
    EVT VT = Op.getValueType();
    auto *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
                                   Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }
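  // amdgcn_image_bvh_intersect_ray carries a 32- or 64-bit node pointer, the
  // ray extent, and three float3 vectors (origin, direction, inverse
  // direction). With f16 addressing (a16) the direction vectors are packed
  // two lanes per dword, which is what the packLanes helper below does; on
  // targets without NSA all address dwords must also land in one contiguous
  // register tuple.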
  case Intrinsic::amdgcn_image_bvh_intersect_ray: {
    MemSDNode *M = cast<MemSDNode>(Op);
    SDValue NodePtr = M->getOperand(2);
    SDValue RayExtent = M->getOperand(3);
    SDValue RayOrigin = M->getOperand(4);
    SDValue RayDir = M->getOperand(5);
    SDValue RayInvDir = M->getOperand(6);
    // ...
    const unsigned NumVDataDwords = 4;
    const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
    const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
    // ...
    const unsigned BaseOpcodes[2][2] = {
        {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
        {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
         AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
    int Opcode;
    if (UseNSA) {
      Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                     IsGFX12Plus ? AMDGPU::MIMGEncGfx12
                                     : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
                                                 : AMDGPU::MIMGEncGfx10NSA,
                                     NumVDataDwords, NumVAddrDwords);
    } else {
      Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                     IsGFX11 ? AMDGPU::MIMGEncGfx11Default
                                             : AMDGPU::MIMGEncGfx10Default,
                                     NumVDataDwords, NumVAddrDwords);
    }
    // ...
    auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
      SmallVector<SDValue, 3> Lanes;
      DAG.ExtractVectorElements(Op, Lanes, 0, 3);
      if (Lanes[0].getValueSizeInBits() == 32) {
        for (unsigned I = 0; I < 3; ++I)
          Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
      } else {
        if (IsAligned) {
          Ops.push_back(DAG.getBitcast(
              MVT::i32,
              DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
          Ops.push_back(Lanes[2]);
        } else {
          SDValue Elt0 = Ops.pop_back_val();
          Ops.push_back(DAG.getBitcast(
              MVT::i32,
              DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
          Ops.push_back(DAG.getBitcast(
              MVT::i32,
              DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
        }
      }
    };

    if (UseNSA && IsGFX11Plus) {
      // ... interleave direction and inverse-direction lanes:
      for (unsigned I = 0; I < 3; ++I) {
        MergedLanes.push_back(DAG.getBitcast(
            MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
                                         {DirLanes[I], InvDirLanes[I]})));
      }
      // ...
    } else {
      // ...
      packLanes(RayOrigin, true);
      packLanes(RayDir, true);
      packLanes(RayInvDir, false);
      // ...
      if (NumVAddrDwords > 12) {
        // ... pad / split into a contiguous tuple.
      }
    }
    // ...
  }
  case Intrinsic::amdgcn_global_atomic_fmin:
  case Intrinsic::amdgcn_global_atomic_fmax:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num: {
    // ...
    unsigned Opcode = 0;
    switch (IntrID) {
    case Intrinsic::amdgcn_global_atomic_fmin:
    case Intrinsic::amdgcn_global_atomic_fmin_num:
    case Intrinsic::amdgcn_flat_atomic_fmin:
    case Intrinsic::amdgcn_flat_atomic_fmin_num: {
      // ... min opcode
      break;
    }
    case Intrinsic::amdgcn_global_atomic_fmax:
    case Intrinsic::amdgcn_global_atomic_fmax_num:
    case Intrinsic::amdgcn_flat_atomic_fmax:
    case Intrinsic::amdgcn_flat_atomic_fmax_num: {
      // ... max opcode
      break;
    }
    default:
      llvm_unreachable("unhandled atomic opcode");
    }
    return DAG.getMemIntrinsicNode(Opcode, SDLoc(Op), M->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_s_get_barrier_state: {
    // ...
    bool IsInlinableBarID = false;
    int64_t BarID;

    if (isa<ConstantSDNode>(Op->getOperand(2))) {
      BarID = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue();
      // ...
    }

    if (IsInlinableBarID) {
      Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
      // ...
    } else {
      Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
      // ... move the barrier id into m0 first.
    }
    // ...
  }
  default:
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return lowerImage(Op, ImageDimIntr, DAG, true);
    return SDValue();
  }
}
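// getMemIntrinsicNode emits a buffer memory-intrinsic node whose result type
// is legal: when TFE is in use (three result VTs) an extra status dword is
// appended to the loaded value, and on targets without dwordx3 loads a
// v3i32/v3f32 result is widened to four dwords and trimmed afterwards.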
SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
                                              SDVTList VTList,
                                              ArrayRef<SDValue> Ops, EVT MemVT,
                                              MachineMemOperand *MMO,
                                              SelectionDAG &DAG) const {
  // ...
  bool IsTFE = VTList.NumVTs == 3;
  if (IsTFE) {
    // ...
    unsigned NumOpDWords = NumValueDWords + 1;
    // ...
    SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
                                     OpDWordsVT, OpDWordsMMO, DAG);
    // ...
  }

  if (!Subtarget->hasDwordx3LoadStores() &&
      (VT == MVT::v3i32 || VT == MVT::v3f32)) {
    // ...
    SDValue WidenedOp = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
                                                WidenedMemVT, WidenedMMO);
    // ...
  }

  return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
}
SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
                                         bool ImageStore) const {
  // ...
  // Pack pairs of 16-bit elements into dwords.
  for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
    // ...
  }

  if ((NumElements % 2) == 1) {
    // Handle the odd trailing element.
    unsigned I = Elts.size() / 2;
    // ...
  }
  // ...
  if (NumElements == 3) {
    // ... v3 types must keep their memory footprint; pad via undef.
  }
  // ...
}
SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                                              SelectionDAG &DAG) const {
  // ...
  unsigned IntrinsicID = Op.getConstantOperandVal(1);
  // ...
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_exp_compr: {
    // ... diagnose "intrinsic not supported on subtarget" at
    //     DL.getDebugLoc() when compressed exports are unavailable.
    // ...
    unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
    // ...
  }
  case Intrinsic::amdgcn_s_barrier: {
    // ...
    // A barrier within a single wave is a no-op.
    unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
    if (WGSize <= ST.getWavefrontSize())
      return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
                                        Op.getOperand(0)),
                     0);

    // On GFX12+, lower s_barrier into the split signal/wait pair.
    if (ST.hasSplitBarriers()) {
      // ...
      SDValue BarSignal =
          SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
                                     MVT::Other, K, Op.getOperand(0)),
                  0);
      // ...
    }
    // ...
  }
  case Intrinsic::amdgcn_tbuffer_store: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    unsigned Dfmt = Op.getConstantOperandVal(8);
    unsigned Nfmt = Op.getConstantOperandVal(9);
    unsigned Glc = Op.getConstantOperandVal(10);
    unsigned Slc = Op.getConstantOperandVal(11);
    // ...
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
    // ...
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
    // ...
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_buffer_store:
  case Intrinsic::amdgcn_buffer_store_format: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    unsigned Glc = Op.getConstantOperandVal(6);
    unsigned Slc = Op.getConstantOperandVal(7);
    // ...
    setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
    // ...
    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store
                       ? AMDGPUISD::BUFFER_STORE
                       : AMDGPUISD::BUFFER_STORE_FORMAT;
    // ...
    if (VDataType == MVT::i8 || VDataType == MVT::i16)
      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);

    return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
        IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
    // ...
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    // ...
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
    // ...
    if (!IsFormat && (VDataVT == MVT::i8 || VDataVT == MVT::i16))
      return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);

    return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
        IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
    // ...
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    // ...
    auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
    // ...
    if (!IsFormat && (VDataType == MVT::i8 || VDataType == MVT::i16))
      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);

    return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
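  // The *_buffer_load_lds intrinsics copy straight from a buffer into LDS
  // without passing through VGPRs, so they are selected directly to
  // BUFFER_LOAD_{UBYTE,USHORT,DWORD}_LDS_* machine instructions; the opcode
  // suffix (BOTHEN/IDXEN/OFFEN/OFFSET) encodes which of vindex and voffset
  // are present.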
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
    // ...
    const bool HasVIndex =
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
        IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
    unsigned OpOffset = HasVIndex ? 1 : 0;
    SDValue VOffset = Op.getOperand(5 + OpOffset);
    // ...
    unsigned Size = Op->getConstantOperandVal(4);
    unsigned Opc;
    switch (Size) {
    default:
      return SDValue();
    case 1:
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
      break;
    case 2:
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
      break;
    case 4:
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
      break;
    }
    // ...
    if (HasVIndex && HasVOffset)
      // ... build the {vindex, voffset} pair,
    else if (HasVOffset)
      // ... push voffset alone.
    // ...
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    // ...
    unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
    // ...
    auto *M = cast<MemSDNode>(Op);
    // ...
  }
  case Intrinsic::amdgcn_global_load_lds: {
    unsigned Opc;
    unsigned Size = Op->getConstantOperandVal(4);
    switch (Size) {
    default:
      return SDValue();
    case 1:
      Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
      break;
    case 2:
      Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
      break;
    case 4:
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
      break;
    }
    // ...
    auto *M = cast<MemSDNode>(Op);
    // ...
    if (LHS->isDivergent())
      // ... try to move the uniform part into saddr:
      if (/* ... */ RHS.getOperand(0).getValueType() == MVT::i32) {
        // ...
        VOffset = RHS.getOperand(0);
      }
    // ...
    if (!Addr->isDivergent()) {
      // ... fully uniform address: use the SADDR form.
    }
    // ...
    LoadPtrI.Offset = Op->getConstantOperandVal(5);
    // ...
  }
  case Intrinsic::amdgcn_end_cf:
    return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
                                      Op->getOperand(2), Chain),
                   0);
  case Intrinsic::amdgcn_s_barrier_init:
  case Intrinsic::amdgcn_s_barrier_join:
  case Intrinsic::amdgcn_s_wakeup_barrier: {
    // ...
    bool IsInlinableBarID = false;
    int64_t BarVal;

    if (isa<ConstantSDNode>(BarOp)) {
      BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue();
      // ...
    }

    if (IsInlinableBarID) {
      switch (IntrinsicID) {
      default:
        return SDValue();
      case Intrinsic::amdgcn_s_barrier_init:
        Opc = AMDGPU::S_BARRIER_INIT_IMM;
        break;
      case Intrinsic::amdgcn_s_barrier_join:
        Opc = AMDGPU::S_BARRIER_JOIN_IMM;
        break;
      case Intrinsic::amdgcn_s_wakeup_barrier:
        Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
        break;
      }
      // ...
    } else {
      switch (IntrinsicID) {
      default:
        return SDValue();
      case Intrinsic::amdgcn_s_barrier_init:
        Opc = AMDGPU::S_BARRIER_INIT_M0;
        break;
      case Intrinsic::amdgcn_s_barrier_join:
        Opc = AMDGPU::S_BARRIER_JOIN_M0;
        break;
      case Intrinsic::amdgcn_s_wakeup_barrier:
        Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
        break;
      }
    }

    if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) {
      // ...
      if (!IsInlinableBarID) {
        // ... combine member count with the barrier id in m0:
        //     copyToM0(DAG, ..., Op.getOperand(2), M0Val)
      }
    } else if (!IsInlinableBarID) {
      // ... move the barrier id into m0.
    }
    // ...
  }
  default:
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
      return lowerImage(Op, ImageDimIntr, DAG, true);
    return Op;
  }
}
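// splitBufferOffsets breaks a combined buffer offset into the largest
// immediate the MUBUF offset field can hold plus a register remainder. If the
// immediate would overflow the field, only the low bits stay immediate and the
// overflow (a large power of two, which CSEs well) is added back to the
// register part.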
std::pair<SDValue, SDValue>
SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
  // ...
  if ((C1 = dyn_cast<ConstantSDNode>(N0)))
    N0 = SDValue();
  // ...
  if (C1) {
    unsigned ImmOffset = C1->getZExtValue();
    // ...
    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      // ...
      Overflow += ImmOffset;
      ImmOffset = 0;
    }
    // ...
  }
  // ...
  SDValue Ops[] = {N0, OverflowVal};
  // ...
}
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                        SelectionDAG &DAG, SDValue *Offsets,
                                        Align Alignment) const {
  // ...
  if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
    uint32_t Imm = C->getZExtValue();
    uint32_t SOffset, ImmOffset;
    if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
      // ... constant offset split into soffset + immediate.
      return;
    }
  }
  if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
    // ...
    int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
    if (Offset >= 0 &&
        TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
      // ...
      return;
    }
  }
  // ...
}
SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
                                                SelectionDAG &DAG) const {
  if (!MaybePointer.getValueType().isScalarInteger())
    return MaybePointer;
  // ... bitcast a 128-bit rsrc pointer to v4i32.
}

// Build a v4i32 buffer resource from a 64-bit base pointer, a stride, the
// record count and the flags word; the stride is folded into bits [31:16] of
// the pointer's high half.
SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
                                                   SelectionDAG &DAG) const {
  SDLoc Loc(Op);
  SDValue Pointer = Op->getOperand(1);
  SDValue Stride = Op->getOperand(2);
  SDValue NumRecords = Op->getOperand(3);
  SDValue Flags = Op->getOperand(4);

  auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
  // ... mask the high half to its low 16 bits.
  std::optional<uint32_t> ConstStride = std::nullopt;
  if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
    ConstStride = ConstNode->getZExtValue();

  SDValue NewHighHalf = Masked;
  if (!ConstStride || *ConstStride != 0) {
    SDValue ShiftedStride;
    if (ConstStride) {
      ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
    } else {
      // ... shift a non-constant stride left by 16.
    }
    NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
  }

  SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
                             NewHighHalf, NumRecords, Flags);
  // ...
}
// Handle 8 bit and 16 bit buffer loads.
SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
                                                     EVT LoadVT, SDLoc DL,
                                                     ArrayRef<SDValue> Ops,
                                                     MachineMemOperand *MMO,
                                                     bool IsTFE) const {
  // ...
}

// Handle 8 bit and 16 bit buffer stores.
SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
                                                      EVT VDataType, SDLoc DL,
                                                      SDValue Ops[],
                                                      MemSDNode *M) const {
  if (VDataType == MVT::f16)
    Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);

  SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
  Ops[1] = BufferStoreExt;
  // ...
  return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, VDataType,
                                 M->getMemOperand());
}
SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
                                    DAGCombinerInfo &DCI) const {
  // ...
  if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
      /* ... */)
    return SDValue();
  // ...
  assert(/* ... */ && "unexpected vector extload");
  // ...
  assert(/* ... */ && "unexpected fp extload");
  // ...
  DCI.AddToWorklist(Cvt.getNode());
  // ...
  DCI.AddToWorklist(Cvt.getNode());
  // ...
}
  if (Info.isEntryFunction())
    return Info.getUserSGPRInfo().hasFlatScratchInit();
  // ...

SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  // ...
  EVT MemVT = Load->getMemoryVT();
  // ...
  // i1/i8 sub-dword loads widen through an extending i8/i16 load.
  EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
  SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
                                 BasePtr, RealMemVT, MMO);
  // ...
  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
         "Custom lowering for non-i32 vectors hasn't been implemented.");

  unsigned AS = Load->getAddressSpace();
  // ...
  // Uniform, sufficiently aligned loads may use the scalar data path.
  if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
    // ...
  }
  // ...
  if (/* ... */
      Alignment >= Align(4) && NumElements < 32) {
    // ...
  }
  // ...
  if (NumElements > 4)
    return SplitVectorLoad(Op, DAG);
  // ...
  if (NumElements > 2)
    return SplitVectorLoad(Op, DAG);

  if (NumElements > 4)
    return SplitVectorLoad(Op, DAG);
  // ...
  auto Flags = Load->getMemOperand()->getFlags();
  if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
                                         Load->getAlign(), Flags, &Fast) &&
      /* ... */)
    // ...
  if (!allowsMemoryAccessForAlignment(/* ... */,
                                      MemVT, *Load->getMemOperand())) {
    // ...
  }
  // ...
  EVT VT = Op.getValueType();
  // ...
SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
                                              SelectionDAG &DAG) const {
  // ...
  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();

  bool AllowInaccurateRcp =
      Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
  // ...
  if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
    // Without fast math, a 1.0/x rcp is only emitted for f16.
    if (!AllowInaccurateRcp && VT != MVT::f16)
      return SDValue();

    if (CLHS->isExactlyValue(1.0)) {
      // 1.0 / x -> rcp(x)
      // ...
    }
    if (CLHS->isExactlyValue(-1.0)) {
      // -1.0 / x -> rcp(-x)
      // ...
    }
  }

  if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
    return SDValue();
  // ... x / y -> x * rcp(y)
}

SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
                                                SelectionDAG &DAG) const {
  // ...
  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();

  bool AllowInaccurateDiv =
      Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
  if (!AllowInaccurateDiv)
    return SDValue();
  // ...
}

static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
                          EVT VT, SDValue A, SDValue B, SDValue GlueChain,
                          SDNodeFlags Flags) {
  if (GlueChain->getNumValues() <= 1)
    return DAG.getNode(Opcode, SL, VT, A, B, Flags);
  // ... thread chain and glue through the node:
  return DAG.getNode(Opcode, SL, VTList,
                     {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
                     Flags);
}

static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
                           EVT VT, SDValue A, SDValue B, SDValue C,
                           SDValue GlueChain, SDNodeFlags Flags) {
  if (GlueChain->getNumValues() <= 1)
    return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
  // ... thread chain and glue through the node:
  return DAG.getNode(Opcode, SL, VTList,
                     {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
                     Flags);
}
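// LowerFDIV32 implements the IEEE-accurate f32 division expansion:
// v_div_scale on numerator and denominator, a Newton-Raphson refinement of
// v_rcp_f32 built from a chain of FMAs, then v_div_fmas/v_div_fixup to merge
// the scaled result. When the default FP mode flushes denormals, denormal
// handling must be enabled around the FMA chain, either with S_DENORM_MODE
// (where available) or with S_SETREG on the mode register.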
10503 return DAG.
getNode(Opcode, SL, VTList,
10509 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
10510 return FastLowered;
10537 const APFloat K0Val(0x1p+96f);
10540 const APFloat K1Val(0x1p-32f);
10567 assert(ST->hasDenormModeInst() &&
"Requires S_DENORM_MODE");
10568 uint32_t DPDenormModeDefault =
Info->getMode().fpDenormModeDPValue();
10569 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
10574 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
10575 return FastLowered;
10582 Flags.setNoFPExcept(
true);
10599 DenominatorScaled, Flags);
10601 DenominatorScaled, Flags);
10603 using namespace AMDGPU::Hwreg;
10604 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
10612 const bool HasDynamicDenormals =
10618 if (!PreservesDenormals) {
10626 if (HasDynamicDenormals) {
10630 SavedDenormMode =
SDValue(GetReg, 0);
10638 const SDValue EnableDenormValue =
10647 EnableDenorm = DAG.
getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
10648 {EnableDenormValue,
BitField, Glue});
10661 ApproxRcp, One, NegDivScale0, Flags);
10664 ApproxRcp, Fma0, Flags);
10667 Fma1, Fma1, Flags);
10670 NumeratorScaled,
Mul, Flags);
10673 Fma2, Fma1,
Mul, Fma2, Flags);
10676 NumeratorScaled, Fma3, Flags);
10678 if (!PreservesDenormals) {
10685 Fma4.
getValue(1), DisableDenormValue,
10688 assert(HasDynamicDenormals == (
bool)SavedDenormMode);
10689 const SDValue DisableDenormValue =
10690 HasDynamicDenormals
10695 AMDGPU::S_SETREG_B32, SL, MVT::Other,
10706 {Fma4, Fma1, Fma3, Scale},
Flags);
SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
    return FastLowered;
  // ...
  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul,
                             DivScale1);
  // ...
  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3,
                             Mul, Scale);
  // ...
}
SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  if (VT == MVT::f32)
    return LowerFDIV32(Op, DAG);

  if (VT == MVT::f64)
    return LowerFDIV64(Op, DAG);

  if (VT == MVT::f16)
    return LowerFDIV16(Op, DAG);

  llvm_unreachable("Unexpected type for fdiv");
}

SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
  // ...
  EVT ResultExpVT = Op->getValueType(1);
  EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
  // ...
}
  if (VT == MVT::i1) {
    return DAG.getTruncStore(
        Store->getChain(), DL,
        DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
        Store->getBasePtr(), MVT::i1, Store->getMemOperand());
  }

  assert(VT.isVector() &&
         Store->getValue().getValueType().getScalarType() == MVT::i32);

  unsigned AS = Store->getAddressSpace();
  // ...
  if (NumElements > 4)
    return SplitVectorStore(Op, DAG);
  // ...
  if (!allowsMemoryAccessForAlignment(/* ... */,
                                      VT, *Store->getMemOperand()))
    return expandUnalignedStore(Store, DAG);
  // ...
  if (NumElements > 2)
    return SplitVectorStore(Op, DAG);

  if (NumElements > 4 ||
      /* ... */)
    return SplitVectorStore(Op, DAG);
  // ...
  auto Flags = Store->getMemOperand()->getFlags();
  // ...
  MVT VT = Op.getValueType().getSimpleVT();
  // ...
  EVT VT = Op.getValueType();
  // ...
  switch (Op.getOpcode()) {
  // ...
  }
  // ...
  EVT VT = Op.getValueType();
  // ...

SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
                                                     DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  EVT ScalarVT = VT.getScalarType();
  if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
    return SDValue();
  // ...
  EVT SrcVT = Src.getValueType();
  // ...
  if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
    // ...
    DCI.AddToWorklist(Cvt.getNode());
    // For the f16 case, fold through f32 and truncate back.
    if (ScalarVT != MVT::f32) {
      // ...
    }
    return Cvt;
  }
  return SDValue();
}

SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  SDValue MagnitudeOp = N->getOperand(0);
  SDValue SignOp = N->getOperand(1);
  // ...
}
SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
                                               EVT MemVT,
                                               DAGCombinerInfo &DCI) const {
  // ...
  // Only fold the offset if the resulting addressing mode is legal for this
  // address space.
  AM.HasBaseReg = true;
  AM.BaseOffs = Offset.getSExtValue();
  // ...
  EVT VT = N->getValueType(0);
  // ...
  SDNodeFlags Flags;
  Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
                          /* ... */);
  // ...
}

// MemSDNode::getBasePtr() does not work for intrinsics, which need the offset
// of the pointer operand relative to the chain and intrinsic ID.
static unsigned getBasePtrIndex(const MemSDNode *N) {
  switch (N->getOpcode()) {
  // ...
  }
}

SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  // ...
  SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
                                        N->getMemoryVT(), DCI);
  if (NewPtr) {
    // ...
    NewOps[PtrIdx] = NewPtr;
    return SDValue(DCI.DAG.UpdateNodeOperands(N, NewOps), 0);
  }
  return SDValue();
}
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
  return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
         (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
         (Opc == ISD::XOR && Val == 0);
}

// Break a 64-bit bit operation with a constant into two 32-bit halves when
// one half becomes trivial.
SDValue SITargetLowering::splitBinaryBitConstantOp(
    DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
    const ConstantSDNode *CRHS) const {
  // ...
}

static bool isBoolSGPR(SDValue V) {
  if (V.getValueType() != MVT::i1)
    return false;
  switch (V.getOpcode()) {
  // ...
  }
  return false;
}

// If a constant has all zeroes or all ones within each byte return it.
// Otherwise return 0.
static uint32_t getConstantPermuteMask(uint32_t C) {
  // 0xff for any zero byte in the mask
  uint32_t ZeroByteMask = 0;
  if (!(C & 0x000000ff))
    ZeroByteMask |= 0x000000ff;
  if (!(C & 0x0000ff00))
    ZeroByteMask |= 0x0000ff00;
  if (!(C & 0x00ff0000))
    ZeroByteMask |= 0x00ff0000;
  if (!(C & 0xff000000))
    ZeroByteMask |= 0xff000000;
  uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
  if ((NonZeroByteMask & C) != NonZeroByteMask)
    return 0; // Partial bytes selected.
  return C;
}

// Check if a node selects whole bytes from its operand 0 starting at a byte
// boundary while masking the rest. Returns select mask as in the v_perm_b32
// byte permutation: 0x0c for any zeroed byte.
static uint32_t getPermuteMask(SDValue V) {
  assert(V.getValueSizeInBits() == 32);

  if (V.getNumOperands() != 2)
    return ~0;
  // ...
  switch (V.getOpcode()) {
  default:
    break;
  case ISD::AND:
    if (uint32_t ConstMask = getConstantPermuteMask(C))
      return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
    break;
  case ISD::OR:
    if (uint32_t ConstMask = getConstantPermuteMask(C))
      return (0x03020100 & ~ConstMask) | ConstMask;
    break;
  case ISD::SHL:
    // ...
    return uint32_t((0x030201000c0c0c0cull << C) >> 32);
  case ISD::SRL:
    // ...
    return uint32_t(0x0c0c0c0c03020100ull >> C);
  }
  return ~0;
}
SDValue SITargetLowering::performAndCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (DCI.isBeforeLegalize())
    return SDValue();
  // ...
  EVT VT = N->getValueType(0);
  // ...
  if (VT == MVT::i64 && CRHS) {
    // ... split the 64-bit AND-with-constant into halves.
  }

  if (CRHS && VT == MVT::i32) {
    // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
    // nb = number of trailing zeroes in mask
    // ...
    if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
      unsigned Shift = CShift->getZExtValue();
      // ...
      unsigned Offset = NB + Shift;
      if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
        // ... emit AMDGPUISD::BFE_U32 of LHS->getOperand(0).
      }
    }

    // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
    if (LHS.getOpcode() == AMDGPUISD::PERM &&
        isa<ConstantSDNode>(LHS.getOperand(2))) {
      uint32_t Sel = getConstantPermuteMask(CRHS->getZExtValue());
      if (!Sel)
        return SDValue();

      Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
      // ... rebuild the PERM with the merged selector.
    }
  }

  // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
  // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
  // ...
  if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
      /* ... */)
    return SDValue();
  // ...
  if (X != LHS.getOperand(1))
    return SDValue();
  // ...
  if (/* ... */
      (RHS.getOperand(0) == LHS.getOperand(0) &&
       LHS.getOperand(0) == LHS.getOperand(1))) {
    // ... merge fp_class masks, keeping or dropping the NaN bits:
    unsigned NewMask = /* ... */ ? Mask->getZExtValue() & ~OrdMask
                                 : Mask->getZExtValue() & OrdMask;
    // ...
  }

  if (VT == MVT::i32 &&
      /* ... */) {
    // ...
  }

  // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
  if (/* ... */ N->isDivergent() &&
      TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
    // ...
    if (LHSMask != ~0u && RHSMask != ~0u) {
      // Canonicalize the expression in an attempt to have fewer unique masks
      // and therefore fewer registers used to hold the masks.
      if (LHSMask > RHSMask) {
        std::swap(LHSMask, RHSMask);
        std::swap(LHS, RHS);
      }

      // Select 0x0c for each lane used from a source operand; actual lane
      // selectors are in the 0-3 range.
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;

      // Check if we need to combine values from two sources within a byte.
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we select high and lower word keep it for SDWA.
          // TODO: teach SDWA to work with v_perm_b32 and remove the check.
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // ...
        for (unsigned I = 0; I < 32; I += 8) {
          // ...
          if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
            Mask &= (0x0c << I) & 0xffffffff;
        }
        // ...
        return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
                           RHS.getOperand(0),
                           DAG.getConstant(Sel, DL, MVT::i32));
      }
    }
  }

  return SDValue();
}
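// The ByteProvider analysis used by the v_perm and dot-product combines walks
// the DAG to find, for each byte of a 32-bit value, the node and byte offset
// it originates from. calculateSrcByte looks through truncates, extensions
// and byte-aligned constant shifts; calculateByteProvider (further below)
// additionally handles the multi-operand nodes: or/and, funnel and plain
// shifts, loads, extract_vector_elt, bswap and existing PERM nodes.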
static const std::optional<ByteProvider<SDValue>>
calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
                 unsigned Depth = 0) {
  // We may need to recursively traverse a series of SRLs
  if (Depth >= 6)
    return std::nullopt;

  if (Op.getValueSizeInBits() < 8)
    return std::nullopt;

  if (Op.getValueType().isVector())
    return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);

  switch (Op->getOpcode()) {
  case ISD::TRUNCATE:
    return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND_INREG: {
    SDValue NarrowOp = Op->getOperand(0);
    auto NarrowVT = NarrowOp.getValueType();
    if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
      auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
      NarrowVT = VTSign->getVT();
    }
    if (!NarrowVT.isByteSized())
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowVT.getStoreSize();

    if (SrcIndex >= NarrowByteWidth)
      return std::nullopt;
    return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
  }
  case ISD::SRA:
  case ISD::SRL: {
    auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    if (!ShiftOp)
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getZExtValue();
    if (BitShift % 8 != 0)
      return std::nullopt;

    SrcIndex += BitShift / 8;
    return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
  }
  default:
    return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
  }
}
static const std::optional<ByteProvider<SDValue>>
calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
                      unsigned StartingIndex = 0) {
  // ...
  if (Depth >= 6)
    return std::nullopt;

  unsigned BitWidth = Op.getScalarValueSizeInBits();
  if (BitWidth % 8 != 0)
    return std::nullopt;
  if (Index > BitWidth / 8 - 1)
    return std::nullopt;

  bool IsVec = Op.getValueType().isVector();
  switch (Op.getOpcode()) {
  case ISD::OR: {
    if (IsVec)
      return std::nullopt;
    // ...
    if (!RHS)
      return std::nullopt;
    // ...
    if (!LHS)
      return std::nullopt;
    // A well formed Or will have two ByteProviders for each byte, one of
    // which is constant zero.
    if (!LHS->isConstantZero() && !RHS->isConstantZero())
      return std::nullopt;
    if (!LHS || LHS->isConstantZero())
      return RHS;
    if (!RHS || RHS->isConstantZero())
      return LHS;
    return std::nullopt;
  }

  case ISD::AND: {
    if (IsVec)
      return std::nullopt;

    auto BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    if (!BitMaskOp)
      return std::nullopt;

    uint32_t BitMask = BitMaskOp->getZExtValue();
    // Bits we expect for our StartingIndex
    uint32_t IndexMask = 0xFF << (Index * 8);

    if ((IndexMask & BitMask) != IndexMask) {
      // A partially masked byte is not well formed for this analysis.
      if (IndexMask & BitMask)
        return std::nullopt;
      return ByteProvider<SDValue>::getConstantZero();
    }

    return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
  }

  case ISD::FSHR: {
    if (IsVec)
      return std::nullopt;

    auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
    if (!ShiftOp || Op.getValueType().isVector())
      return std::nullopt;

    uint64_t BitsProvided = Op.getValueSizeInBits();
    if (BitsProvided % 8 != 0)
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
    if (BitShift % 8)
      return std::nullopt;

    uint64_t ConcatSizeInBytes = BitsProvided / 4;
    uint64_t ByteShift = BitShift / 8;

    uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
    uint64_t BytesProvided = BitsProvided / 8;
    SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
    NewIndex %= BytesProvided;
    return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
  }

  case ISD::SRA:
  case ISD::SRL: {
    if (IsVec)
      return std::nullopt;

    auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    if (!ShiftOp)
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getZExtValue();
    if (BitShift % 8)
      return std::nullopt;

    auto BitsProvided = Op.getScalarValueSizeInBits();
    if (BitsProvided % 8 != 0)
      return std::nullopt;

    uint64_t BytesProvided = BitsProvided / 8;
    uint64_t ByteShift = BitShift / 8;
    // If the byte we are trying to provide falls in the good range of the
    // shifted value, the byte of interest of the source is Index + ByteShift;
    // otherwise the shift fills it with zero.
    return BytesProvided - ByteShift > Index
               ? calculateSrcByte(Op->getOperand(0), StartingIndex,
                                  Index + ByteShift)
               : ByteProvider<SDValue>::getConstantZero();
  }

  case ISD::SHL: {
    if (IsVec)
      return std::nullopt;

    auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    if (!ShiftOp)
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getZExtValue();
    if (BitShift % 8 != 0)
      return std::nullopt;
    uint64_t ByteShift = BitShift / 8;

    // Shifting by at least the index being provided means the low bytes are
    // zeros; otherwise the byte of interest is Index - ByteShift of the src.
    return Index < ByteShift
               ? ByteProvider<SDValue>::getConstantZero()
               : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
                                       Depth + 1, StartingIndex);
  }
  case ISD::ANY_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND_INREG: {
    if (IsVec)
      return std::nullopt;

    SDValue NarrowOp = Op->getOperand(0);
    unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
    if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
      auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
      NarrowBitWidth = VTSign->getVT().getSizeInBits();
    }
    if (NarrowBitWidth % 8 != 0)
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowBitWidth / 8;

    if (Index >= NarrowByteWidth)
      return Op.getOpcode() == ISD::ZERO_EXTEND
                 ? std::optional<ByteProvider<SDValue>>(
                       ByteProvider<SDValue>::getConstantZero())
                 : std::nullopt;
    return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
  }

  case ISD::TRUNCATE: {
    if (IsVec)
      return std::nullopt;

    uint64_t NarrowByteWidth = BitWidth / 8;

    if (NarrowByteWidth >= Index) {
      return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
                                   StartingIndex);
    }

    return std::nullopt;
  }

  case ISD::CopyFromReg: {
    // ...
    return std::nullopt;
  }

  case ISD::LOAD: {
    auto L = cast<LoadSDNode>(Op.getNode());

    unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
    if (NarrowBitWidth % 8 != 0)
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowBitWidth / 8;

    // If the width of the load does not reach the byte we are trying to
    // provide and it is not a ZEXTLOAD, the load does not provide the byte.
    if (Index >= NarrowByteWidth) {
      return L->getExtensionType() == ISD::ZEXTLOAD
                 ? std::optional<ByteProvider<SDValue>>(
                       ByteProvider<SDValue>::getConstantZero())
                 : std::nullopt;
    }

    if (NarrowByteWidth > Index) {
      return calculateSrcByte(Op, StartingIndex, Index);
    }

    return std::nullopt;
  }

  case ISD::BSWAP: {
    if (IsVec)
      return std::nullopt;
    return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
                                 Depth + 1, StartingIndex);
  }

  case ISD::EXTRACT_VECTOR_ELT: {
    auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    if (!IdxOp)
      return std::nullopt;
    auto VecIdx = IdxOp->getZExtValue();
    auto ScalarSize = Op.getScalarValueSizeInBits();
    if (ScalarSize != 32) {
      Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
    }

    return calculateSrcByte(ScalarSize == 32 ? Op : Op.getOperand(0),
                            StartingIndex, Index);
  }

  case AMDGPUISD::PERM: {
    if (IsVec)
      return std::nullopt;

    auto PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
    if (!PermMask)
      return std::nullopt;

    auto IdxMask =
        (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
    if (IdxMask > 0x07 && IdxMask != 0x0c)
      return std::nullopt;

    auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
    auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;

    return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
                           : ByteProvider<SDValue>(
                                 ByteProvider<SDValue>::getConstantZero());
  }

  default:
    return std::nullopt;
  }
}
// Returns true if the operand is a 16-bit value extended to 32 bits.
static bool isExtendedFrom16Bits(SDValue &Operand) {
  switch (Operand.getOpcode()) {
  case ISD::ANY_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND: {
    auto OpVT = Operand.getOperand(0).getValueType();
    return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
  }
  case ISD::LOAD: {
    LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
    auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
    if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
        ExtType == ISD::EXTLOAD) {
      auto MemVT = L->getMemoryVT();
      return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
    }
    return L->getMemoryVT().getSizeInBits() == 16;
  }
  default:
    return false;
  }
}

// Returns true if the mask matches consecutive bytes, and the first byte
// begins at a 16 bit boundary.
static bool addresses16Bits(int Mask) {
  int Low8 = Mask & 0xff;
  int Hi8 = (Mask & 0xff00) >> 8;

  assert(Low8 < 8 && Hi8 < 8);
  // Are the bytes contiguous in the order of increasing addresses.
  bool IsConsecutive = (Hi8 - Low8 == 1);
  // Is the first byte at a 16-bit boundary.
  bool Is16Aligned = !(Low8 % 2);
  return IsConsecutive && Is16Aligned;
}

// Do not lower into v_perm if the operands are actually 16-bit and the
// selected bits correspond to two easily addressable 16-bit halves.
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
                                SDValue &OtherOp) {
  int Low16 = PermMask & 0xffff;
  int Hi16 = (PermMask & 0xffff0000) >> 16;
  // ...
  auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
                        isExtendedFrom16Bits(TempOtherOp);
  if (!OtherOpIs16Bit)
    return true;
  // ...
}
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
                                  unsigned DWordOffset) {
  SDValue Ret;

  auto TypeSize = Src.getValueSizeInBits().getFixedValue();
  // ByteProvider must be at least 8 bits.
  assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
  // ...
  if (Src.getValueType().isVector()) {
    auto ScalarTySize = Src.getScalarValueSizeInBits();
    auto ScalarTy = Src.getValueType().getScalarType();
    if (ScalarTySize == 32) {
      // ... extract the dword element directly.
    }
    if (ScalarTySize > 32) {
      Ret = DAG.getNode(
          ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
          DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
      auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
      if (ShiftVal)
        Ret = DAG.getNode(ISD::SRL, SL, ScalarTy, Ret,
                          DAG.getConstant(ShiftVal, SL, ScalarTy));
      return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
    }

    assert(ScalarTySize < 32);
    auto NumElements = TypeSize / ScalarTySize;
    auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
    auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
    auto NumElementsIn32 = 32 / ScalarTySize;
    auto NumAvailElements = DWordOffset < Trunc32Elements
                                ? NumElementsIn32
                                : NumElements - NormalizedTrunc;
    // ...
  }

  auto ShiftVal = 32 * DWordOffset;
  // ... shift right and truncate to the selected dword.
}
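// matchPERM tries to rewrite a 32-bit OR tree as a single v_perm_b32. It asks
// calculateByteProvider for the origin of each of the four result bytes; if
// every byte comes from at most two distinct dword sources, the byte offsets
// are encoded into a v_perm selector (source-1 bytes offset by 4, as the
// selector addresses the concatenation src1:src0).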
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  [[maybe_unused]] EVT VT = N->getValueType(0);
  SmallVector<ByteProvider<SDValue>, 8> PermNodes;
  // ...
  for (int i = 0; i < 4; i++) {
    // Find the ByteProvider that provides the ith byte of the result of OR.
    std::optional<ByteProvider<SDValue>> P =
        calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
    // TODO: support constantZero
    if (!P || P->isConstantZero())
      return SDValue();

    PermNodes.push_back(*P);
  }
  if (PermNodes.size() != 4)
    return SDValue();

  std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
  std::optional<std::pair<unsigned, unsigned>> SecondSrc;
  uint64_t PermMask = 0x00000000;
  for (size_t i = 0; i < PermNodes.size(); i++) {
    auto PermOp = PermNodes[i];
    // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
    // by sizeof(Src2) = 4.
    int SrcByteAdjust = 4;

    if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
        ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
      if (SecondSrc)
        if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
            ((PermOp.SrcOffset / 4) != SecondSrc->second))
          return SDValue();

      // Set the index of the second distinct Src node.
      SecondSrc = {i, PermNodes[i].SrcOffset / 4};
      assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
      SrcByteAdjust = 0;
    }
    assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
    // ...
    PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
  }
  SDLoc DL(N);
  SDValue Op = *PermNodes[FirstSrc.first].Src;
  Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
  assert(Op.getValueSizeInBits() == 32);

  // Check that we are not just extracting the bytes in order from an op.
  if (!SecondSrc) {
    int Low16 = PermMask & 0xffff;
    int Hi16 = (PermMask & 0xffff0000) >> 16;

    bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
    bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);

    // The perm op would really just produce Op. So combine into Op.
    if (WellFormedLow && WellFormedHi)
      return DAG.getBitcast(MVT::getIntegerVT(32), Op);
  }

  SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;

  if (SecondSrc) {
    OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
    assert(OtherOp.getValueSizeInBits() == 32);
  }

  if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
    assert(Op.getValueType().isByteSized() &&
           OtherOp.getValueType().isByteSized());
    // ...
    return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
                       DAG.getConstant(PermMask, DL, MVT::i32));
  }
  return SDValue();
}
SDValue SITargetLowering::performOrCombine(SDNode *N,
                                           DAGCombinerInfo &DCI) const {
  // ...
  EVT VT = N->getValueType(0);
  if (VT == MVT::i1) {
    // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
    // ...
    if (Src != RHS.getOperand(0))
      return SDValue();
    // ...
    if (!CLHS || !CRHS)
      return SDValue();

    // Only 10 bits are used.
    static const uint32_t MaxMask = 0x3ff;
    // ... merge the class masks under MaxMask.
  }

  // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
  if (/* ... */ isa<ConstantSDNode>(LHS.getOperand(2))) {
    uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
    if (!Sel)
      return SDValue();

    Sel |= LHS.getConstantOperandVal(2);
    // ... rebuild the PERM with the merged selector.
  }

  // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
  if (VT == MVT::i32 && N->isDivergent() &&
      TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
    // If all the uses of an or need to extract the individual elements, do
    // not attempt to lower into v_perm.
    auto usesCombinedOperand = [](SDNode *OrUse) {
      // If we have any non-vectorized use, then it is a candidate for v_perm.
      if (OrUse->getOpcode() != ISD::BITCAST ||
          !OrUse->getValueType(0).isVector())
        return true;

      // If we have any non-vectorized use, then it is a candidate for v_perm.
      for (auto VUse : OrUse->uses()) {
        if (!VUse->getValueType(0).isVector())
          return true;

        // If the use of a vector is a store, then combining via a v_perm
        // is beneficial.
        // TODO: whitelist more uses.
        for (auto VectorwiseOp :
             {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
          if (VUse->getOpcode() == VectorwiseOp)
            return true;
      }
      return false;
    };

    if (!any_of(N->uses(), usesCombinedOperand))
      return SDValue();

    uint32_t LHSMask = getPermuteMask(LHS);
    uint32_t RHSMask = getPermuteMask(RHS);

    if (LHSMask != ~0u && RHSMask != ~0u) {
      // Canonicalize the expression in an attempt to have fewer unique masks
      // and therefore fewer registers used to hold the masks.
      if (LHSMask > RHSMask) {
        std::swap(LHSMask, RHSMask);
        std::swap(LHS, RHS);
      }

      // Select 0x0c for each lane used from a source operand.
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;

      // Check if we need to combine values from two sources within a byte.
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we select high and lower word keep it for SDWA.
          // TODO: teach SDWA to work with v_perm_b32 and remove the check.
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // Kill zero bytes selected by other mask. Zero value is 0x0c.
        LHSMask &= ~RHSUsedLanes;
        RHSMask &= ~LHSUsedLanes;
        // Add 4 to each active LHS lane.
        LHSMask |= LHSUsedLanes & 0x04040404;
        // ... combine the masks and emit the PERM:
        return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
                           RHS.getOperand(0),
                           DAG.getConstant(Sel, DL, MVT::i32));
      }
    }
    if (LHSMask == ~0u || RHSMask == ~0u) {
      if (SDValue Perm = matchPERM(N, DCI))
        return Perm;
    }
  }

  if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
    return SDValue();
  // ...
  if (SrcVT == MVT::i32) {
    // ... or the low halves and keep the known-set high bits:
    DCI.AddToWorklist(LowOr.getNode());
    DCI.AddToWorklist(HiBits.getNode());
    // ... rebuild the i64 from the two halves.
  }

  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (CRHS) {
    if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
                                                 N->getOperand(0), CRHS))
      return Split;
  }

  return SDValue();
}
SDValue SITargetLowering::performXorCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
    return RV;
  // ...
  EVT VT = N->getValueType(0);
  if (CRHS && VT == MVT::i64) {
    // ... split the 64-bit XOR-with-constant into halves.
  }
  // ...
  // xor with the sign-bit constant folds into fneg on the select operands:
  //   ... LHS->getOperand(0), FNegLHS, FNegRHS);
  // ...
}

SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
                                                   DAGCombinerInfo &DCI) const {
  // ...
  EVT VT = N->getValueType(0);
  if (VT != MVT::i32)
    return SDValue();

  SDValue Src = N->getOperand(0);
  if (Src.getValueType() != MVT::i16)
    return SDValue();
  // ...
}
SDValue
SITargetLowering::performSignExtendInRegCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SDValue Src = N->getOperand(0);
  auto *VTSign = cast<VTSDNode>(N->getOperand(1));

  // Fold sign_extend_inreg of an unsigned scalar buffer load into the signed
  // variant when the extended width matches the memory width.
  if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
        VTSign->getVT() == MVT::i8) ||
       (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
        VTSign->getVT() == MVT::i16))) {
    assert(Subtarget->hasScalarSubwordLoads() &&
           "s_buffer_load_{u8, i8} are supported "
           "in GFX12 (or newer) architectures.");
    EVT VT = Src.getValueType();
    // ...
    SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
    // ...
    auto *M = cast<MemSDNode>(Src);
    SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
        Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
    return DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
  }
  if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
        VTSign->getVT() == MVT::i8) ||
       (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
        VTSign->getVT() == MVT::i16)) &&
      Src.hasOneUse()) {
    auto *M = cast<MemSDNode>(Src);
    // ...
    SDVTList ResList =
        DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
    // ...
    SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
        Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
    return DCI.DAG.getMergeValues({BufferLoadSignExt,
                                   BufferLoadSignExt.getValue(1)},
                                  SDLoc(N));
  }
  return SDValue();
}
SDValue SITargetLowering::performClassCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  // ...
  if (N->getOperand(0).isUndef())
    return DCI.DAG.getUNDEF(N->getValueType(0));

  return SDValue();
}

SDValue SITargetLowering::performRcpCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  // ...
  // rcp of undef -> quiet NaN:
  return DCI.DAG.getConstantFP(
      /* ... */);
  // ...
}
bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
                                       unsigned MaxDepth) const {
  unsigned Opcode = Op.getOpcode();
  // ...
  if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
    const auto &F = CFP->getValueAPF();
    if (F.isNaN() && F.isSignaling())
      return false;
    if (!F.isDenormal())
      return true;
    // ...
  }
  // ...
  if (Op.getValueType() == MVT::i32) {
    // Pack-extracted high halves are canonical if the packed value was.
    if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
      if (RHS->getZExtValue() == 0xffff0000) {
        // ...
      }
    }
    // ...
  }
  // ...
  return Op.getValueType().getScalarType() != MVT::f16;
  // ...
  if (Op.getValueType() == MVT::i16) {
    // ...
  }
  // ...
  unsigned IntrinsicID = Op.getConstantOperandVal(0);
  // ...
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_cvt_pkrtz:
  case Intrinsic::amdgcn_cubeid:
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_fdot2:
  case Intrinsic::amdgcn_rcp:
  case Intrinsic::amdgcn_rsq:
  case Intrinsic::amdgcn_rsq_clamp:
  case Intrinsic::amdgcn_rcp_legacy:
  case Intrinsic::amdgcn_rsq_legacy:
  case Intrinsic::amdgcn_trig_preop:
  case Intrinsic::amdgcn_log:
  case Intrinsic::amdgcn_exp2:
  case Intrinsic::amdgcn_sqrt:
    return true;
  default:
    break;
  }
  // ...
}
bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
                                       unsigned MaxDepth) const {
  // ...
  unsigned Opcode = MI->getOpcode();

  if (Opcode == AMDGPU::G_FCANONICALIZE)
    return true;

  std::optional<FPValueAndVReg> FCR;
  // Constant splat (can be padded with undef) or scalar constant.
  if (/* ... */) {
    if (FCR->Value.isSignaling())
      return false;
    if (!FCR->Value.isDenormal())
      return true;
    // ...
  }
  // ...
  switch (Opcode) {
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FCEIL:
  case AMDGPU::G_FFLOOR:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FSQRT:
  case AMDGPU::G_FDIV:
  case AMDGPU::G_FREM:
  case AMDGPU::G_FPOW:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_FLOG:
  case AMDGPU::G_FLOG2:
  case AMDGPU::G_FLOG10:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
    return true;
  case AMDGPU::G_FNEG:
  case AMDGPU::G_FABS:
  case AMDGPU::G_FCOPYSIGN:
    return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM: {
    // ...
  }
  case AMDGPU::G_BUILD_VECTOR:
    // ... all source elements must be canonical.
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT:
    switch (/* intrinsic id */) {
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmad_ftz:
    case Intrinsic::amdgcn_sqrt:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_cos:
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_div_scale:
    case Intrinsic::amdgcn_div_fmas:
    case Intrinsic::amdgcn_div_fixup:
    case Intrinsic::amdgcn_fract:
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cubema:
    case Intrinsic::amdgcn_cubesc:
    case Intrinsic::amdgcn_cubetc:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_trig_preop:
      return true;
    default:
      break;
    }
    // ...
  default:
    break;
  }
  // ...
}
SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
                                                 const SDLoc &SL, EVT VT,
                                                 const APFloat &C) const {
  // Flush denormals to 0 if not enabled.
  if (C.isDenormal()) {
    // ...
  }

  if (C.isSignaling()) {
    // Quiet a signaling NaN.
    // ...
  }
  // ...
}

static bool vectorEltWillFoldAway(SDValue Op) {
  return Op.isUndef() || isa<ConstantFPSDNode>(Op);
}

SDValue
SITargetLowering::performFCanonicalizeCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  // ...
  EVT VT = N->getValueType(0);
  // ...
  if (const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) {
    EVT VT = N->getValueType(0);
    return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
  }
  // ...
  EVT EltVT = Lo.getValueType();
  // ...
  for (unsigned I = 0; I != 2; ++I) {
    // ...
    if (const auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
      NewElts[I] =
          getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
    } else if (Op.isUndef()) {
      // ...
    }
    // ...
  }
  // ...
  // An undef element pairs with whatever the other element folded to:
  if (isa<ConstantFPSDNode>(NewElts[1]))
    NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ? NewElts[1] : NewElts[0];
  // ...
  NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ? NewElts[0] : NewElts[1];
  // ...
}
SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
                                                   const SDLoc &SL, SDValue Src,
                                                   SDValue MinVal,
                                                   SDValue MaxVal,
                                                   bool Signed) const {
  // ...
  if (!MinK || !MaxK)
    return SDValue();
  // ...
  if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
    return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
  // ...
}

SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
                                                  const SDLoc &SL, SDValue Op0,
                                                  SDValue Op1) const {
  // ...
  // Ordered >= (although NaN inputs should have folded away by now), with
  // DX10Clamp the result of med3 with a NaN input is the other operand.
  if (Info->getMode().DX10Clamp) {
    // ...
  }
  // med3 for f16 is only available on gfx9+.
  if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
    // ...
  }
  // ...
}

SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  // ...
  EVT VT = N->getValueType(0);
  unsigned Opc = N->getOpcode();
  // ...
  // max(max(a, b), c) -> max3(a, b, c)
  // min(min(a, b), c) -> min3(a, b, c)
  if (/* ... */
      (VT == MVT::i32 || VT == MVT::f32 ||
       ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) {
    // ... rebuild as a single min3/max3 node on N->getValueType(0).
  }
  // ...
  // min(max(x, K0), K1) / max(min(x, K1), K0) -> med3
  if (SDValue Med3 = performIntMed3ImmCombine(
          /* ... signed min/max ... */))
    return Med3;
  if (SDValue Med3 = performIntMed3ImmCombine(
          /* ... signed max/min ... */))
    return Med3;
  if (SDValue Med3 = performIntMed3ImmCombine(
          /* ... unsigned min/max ... */))
    return Med3;
  if (SDValue Med3 = performIntMed3ImmCombine(
          /* ... unsigned max/min ... */))
    return Med3;
  // ...
  if (/* ... */
      (VT == MVT::f32 || VT == MVT::f64 ||
       /* ... */)) {
    // ...
    if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
      return Res;
  }
  // ...
}

static bool isClampZeroToOne(SDValue A, SDValue B) {
  // ...
  return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
         (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
  // ...
}
SDValue SITargetLowering::performFMed3Combine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  // ...
  // With DX10Clamp a NaN input to med3 yields the other operand, so constants
  // can be commuted into a canonical position.
  if (Info->getMode().DX10Clamp) {
    // ...
    if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
      // ... swap Src0/Src1.
    if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
      // ... swap Src1/Src2.
    if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
      // ... swap Src0/Src1 again.
  }
  // ...
}

SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  // ...
  // cvt_pkrtz of two undefs folds to undef.
  return DCI.DAG.getUNDEF(N->getValueType(0));
  // ...
}

// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
// expanded into a set of cmp/select instructions.
bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
                                                unsigned NumElem,
                                                bool IsDivergentIdx,
                                                const GCNSubtarget *Subtarget) {
  // ...
  unsigned VecSize = EltSize * NumElem;

  // Sub-dword vectors of size 2 dword or less have better implementation.
  if (VecSize <= 64 && EltSize < 32)
    return false;
  // ...
  // Always expand the rest of sub-dword instructions, otherwise it will be
  // lowered via memory.
  if (IsDivergentIdx)
    return true;
  // ...
  unsigned NumInsts = NumElem /* Number of compares */ +
                      ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
  // ...
  return NumInsts <= 16;
  // ...
  return NumInsts <= 15;
}

bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
  SDValue Idx = N->getOperand(N->getNumOperands() - 1);
  if (isa<ConstantSDNode>(Idx))
    return false;
  // ...
}
SDValue SITargetLowering::performExtractVectorEltCombine(
    SDNode *N, DAGCombinerInfo &DCI) const {
  // ...
  EVT ResVT = N->getValueType(0);
  // ...
  if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
    // ... push the extract through unary/binary vector ops:
    DCI.AddToWorklist(Elt0.getNode());
    DCI.AddToWorklist(Elt1.getNode());
    // ...
  }
  // ...
  if (!DCI.isBeforeLegalize())
    return SDValue();

  // Try to turn sub-dword accesses of vectors into accesses of the same
  // 32-bit elements.
  auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
      VecSize > 32 && VecSize % 32 == 0 && Idx) {
    // ...
    unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
    unsigned EltIdx = BitIndex / 32;
    unsigned LeftoverBitIdx = BitIndex % 32;
    // ...
    SDValue Cast = DAG.getBitcast(/* ... */, Vec);
    DCI.AddToWorklist(Cast.getNode());

    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
                              DAG.getConstant(EltIdx, SL, MVT::i32));
    DCI.AddToWorklist(Elt.getNode());
    SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
                              DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
    DCI.AddToWorklist(Srl.getNode());

    EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
    DCI.AddToWorklist(Trunc.getNode());

    if (VecEltVT == ResVT) {
      return DAG.getBitcast(VecEltVT, Trunc);
    }
    // ...
  }

  return SDValue();
}
SDValue
SITargetLowering::performInsertVectorEltCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  // ...
  // INSERT_VECTOR_ELT (<n x e>, var-idx) => cmp/select chain when profitable.
  EVT IdxVT = Idx.getValueType();
  // ...
}

static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
  // fpext from an already-extended f16 value is just the inner value.
  if (Src.getOpcode() == ISD::FP_EXTEND &&
      Src.getOperand(0).getValueType() == MVT::f16) {
    return Src.getOperand(0);
  }

  if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
    APFloat Val = CFP->getValueAPF();
    bool LosesInfo = true;
    // ... convert to half and back; only exact values fold.
  }
  // ...
}

SDValue SITargetLowering::performFP16SrcCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
         "combine only useful on gfx8");

  SDValue TruncSrc = N->getOperand(0);
  EVT VT = N->getValueType(0);
  if (VT != MVT::f16)
    return SDValue();
  // ...
}
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
                                          const SDNode *N0,
                                          const SDNode *N1) const {
  EVT VT = N0->getValueType(0);

  // Only do this if we are not trying to support denormals. v_mad_f32 does
  // not support denormals ever.
  if (((VT == MVT::f32 &&
        denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
       (VT == MVT::f16 && Subtarget->hasMadF16() &&
        denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
      isOperationLegal(ISD::FMAD, VT))
    return ISD::FMAD;
  // ...
  // In reassociateScalarOps(): regroup a mixed uniform/divergent operand
  // chain so the uniform part can be computed on the scalar unit.
  EVT VT = N->getValueType(0);
  if (VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  // ...
  unsigned Opc = N->getOpcode();
  // ...
  SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
  return DAG.getNode(Opc, SL, VT, Add1, Op2);
// tryFoldToMad64_32: fold (add (mul x, y), z) into mad_[iu]64_[iu]32 when
// the 64-bit MAD is cheaper than a separate multiply and add.
SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDLoc SL(N);

  // The scalar unit has its own 32 x 32 -> hi multiply, so only handle
  // divergent values here.
  if (!N->isDivergent() && Subtarget->hasSMulHi())
    return SDValue();

  unsigned NumBits = VT.getScalarSizeInBits();
  if (NumBits <= 32 || NumBits > 64)
    return SDValue();

  // ...
  unsigned NumUsers = 0;
  // ...

  bool MulSignedLo = false;
  if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
    // ... (determine the signedness of the low multiply)
  }

  if (VT != MVT::i64) {
    // ... (extend the operands to 64 bits first)
  }

  SDValue Accum =
      getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);

  if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
    // Fix up the high half of the result with the remaining partial
    // products.
    SDValue AccumLo, AccumHi;
    std::tie(AccumLo, AccumHi) =
        DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);

    if (!MulLHSUnsigned32) {
      // ...
    }
    if (!MulRHSUnsigned32) {
      // ...
    }
    // ...
  }

  if (VT != MVT::i64)
    Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
  return Accum;
}
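// Illustrative sketch (operands assumed to fit in 32 bits): for
// i64 d = a * b + c, V_MAD_U64_U32 produces the full 64-bit product plus
// addend in a single instruction, replacing a mul_lo/mul_hi/add-with-carry
// sequence.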
static std::optional<ByteProvider<SDValue>>
handleMulOperand(const SDValue &MulOperand) {
  auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
  if (!Byte0 || Byte0->isConstantZero()) {
    return std::nullopt;
  }
  auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
  if (Byte1 && !Byte1->isConstantZero()) {
    // The mul operand must be a single byte.
    return std::nullopt;
  }
  return Byte0;
}

// Merge two v_perm selection masks; 0x0c bytes select the constant 0.
static unsigned addPermMasks(unsigned First, unsigned Second) {
  unsigned FirstCs = First & 0x0c0c0c0c;
  unsigned SecondCs = Second & 0x0c0c0c0c;
  unsigned FirstNoCs = First & ~0x0c0c0c0c;
  unsigned SecondNoCs = Second & ~0x0c0c0c0c;

  // Each byte lane must be "constant zero" in at least one of the masks.
  assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
  assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
  assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
  assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));

  return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
}
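// Worked example (mask values assumed): merging 0x010c0c0c (select source
// byte 1 into the top lane, zero elsewhere) with 0x0c020c0c (select source
// byte 2 into the next lane) yields 0x01020c0c: both selections survive, and
// a 0x0c "constant zero" byte remains only where both inputs had one.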
  // In placeSources(): try to place a pair of byte providers into the
  // existing dot-product source groups, in either order.
  for (int BPI = 0; BPI < 2; BPI++) {
    auto BPP = std::pair{Src0, Src1};
    if (BPI == 1)
      BPP = {Src1, Src0};

    unsigned ZeroMask = 0x0c0c0c0c;
    unsigned FMask = 0xFF << (8 * (3 - Step));

    unsigned FirstMask =
        (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
    unsigned SecondMask =
        (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);

    // Does the first provider already have a matching source group?
    int FirstGroup = -1;
    for (int I = 0; I < 2; I++) {
      auto &Srcs = I == 0 ? Src0s : Src1s;
      auto MatchesFirst = [&BPP](DotSrc &IterElt) {
        return IterElt.SrcOp == *BPP.first.Src &&
               (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
      };
      // ... (record the group index in FirstGroup on a match)
    }
    if (FirstGroup != -1) {
      auto &Srcs = FirstGroup == 1 ? Src1s : Src0s;
      auto MatchesSecond = [&BPP](DotSrc &IterElt) {
        return IterElt.SrcOp == *BPP.second.Src &&
               (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
      };
      // Fold into the matching entry, or start a new one.
      // ...
      Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
      return;
    }
  }
  // Neither byte provider matched an existing group: start new groups for
  // both sources.
  unsigned ZeroMask = 0x0c0c0c0c;
  unsigned FMask = 0xFF << (8 * (3 - Step));

  Src0s.push_back(
      {*Src0.Src,
       ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
       Src0.SrcOffset / 4});
  Src1s.push_back(
      {*Src1.Src,
       ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
       Src1.SrcOffset / 4});
  // In resolveSources(): combine the gathered byte sources into a single
  // dword value, using v_perm where the bytes need rearranging.
  if (Srcs.size() == 1) {
    auto Elt = Srcs.begin();
    auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);

    // v_perm with the identity mask would reproduce the value unchanged.
    if (Elt->PermMask == 0x3020100)
      return EltOp;

    return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
                       DAG.getConstant(Elt->PermMask, SL, MVT::i32));
  }

  auto FirstElt = Srcs.begin();
  auto SecondElt = std::next(FirstElt);

  SmallVector<SDValue, 2> Perms;

  // Combine sources pairwise with v_perm.
  while (true) {
    auto FirstMask = FirstElt->PermMask;
    auto SecondMask = SecondElt->PermMask;

    unsigned FirstCs = FirstMask & 0x0c0c0c0c;
    unsigned FirstPlusFour = FirstMask | 0x04040404;
    // The second operand's bytes are indexed from 4 in the combined mask,
    // while 0x0c constant-zero selectors must stay 0x0c.
    FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
    // ...

    FirstElt = std::next(SecondElt);
    if (FirstElt == Srcs.end())
      break;

    SecondElt = std::next(FirstElt);
    // An odd source left over is permuted on its own with its mask.
    if (SecondElt == Srcs.end()) {
      // ...
      Perms.push_back(
          DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
                      DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
      break;
    }
  }

  return Perms.size() == 2
             ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
             : Perms[0];
}

// fixMasks(): shift masks built for a length-4 chain down to the actual
// chain length and re-zero the now-unused high byte lanes.
static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
  for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
    EntryMask = EntryMask >> ((4 - ChainLength) * 8);
    auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
    EntryMask += ZeroMask;
  }
}
static bool isMul(const SDValue Op) {
  auto Opcode = Op.getOpcode();
  return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
          Opcode == AMDGPUISD::MUL_I24);
}
static std::optional<bool>
checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
                       ByteProvider<SDValue> &Src1, const SDValue &S0Op,
                       const SDValue &S1Op, const SelectionDAG &DAG) {
  // ...
  auto Known0 = DAG.computeKnownBits(S0Op, 0);
  bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
  bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
  auto Known1 = DAG.computeKnownBits(S1Op, 0);
  bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
  bool S1IsSigned = Known1.countMinLeadingOnes() > 0;

  assert(!(S0IsUnsigned && S0IsSigned));
  assert(!(S1IsUnsigned && S1IsSigned));

  // If the known sign bits match, pick the corresponding dot flavor.
  if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
    return S0IsSigned;

  // Known-opposite signs cannot be matched to either dot flavor.
  if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
    return std::nullopt;

  // One operand known signed, the other unknown (body elided here).
  if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
      ((S1IsSigned && !(S0IsSigned || S0IsUnsigned)))) {
    // ...
  }

  // Neither operand has known sign information (body elided here).
  if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned))) {
    // ...
  }

  // A known-unsigned operand paired with a sign-unknown one is rejected.
  if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
      ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
    return std::nullopt;
  // ...
}
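// Summary of the signedness matrix above: matching known signs select sdot4
// or udot4 directly; known-opposite signs and unsigned-with-unknown pairs
// reject the combine; the remaining mixed cases are handled in the elided
// bodies.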
SDValue SITargetLowering::performAddCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDLoc SL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
    if (Subtarget->hasMad64_32()) {
      if (SDValue Folded = tryFoldToMad64_32(N, DCI))
        return Folded;
    }
  }

  if (SDValue V = reassociateScalarOps(N, DAG)) {
    return V;
  }

  // Match a chain of byte-wise multiply-adds that can become a dot4
  // (plus dot-instruction availability checks, elided here).
  if (VT == MVT::i32 && N->isDivergent()) {
    SmallVector<DotSrc, 4> Src0s;
    SmallVector<DotSrc, 4> Src1s;
    SmallVector<SDValue, 4> Src2s;

    std::optional<bool> IsSigned;
    SDNode *TempNode = N;
    int ChainLength = 0;
    for (int I = 0; I < 4; I++) {
      auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
      if (MulIdx == -1)
        break;
      auto Src0 =
          handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
      if (!Src0)
        break;
      auto Src1 =
          handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
      if (!Src1)
        break;

      auto IterIsSigned = checkDot4MulSignedness(
          TempNode->getOperand(MulIdx), *Src0, *Src1,
          TempNode->getOperand(MulIdx)->getOperand(0),
          TempNode->getOperand(MulIdx)->getOperand(1), DAG);
      if (!IterIsSigned)
        break;
      if (!IsSigned)
        IsSigned = *IterIsSigned;
      if (*IterIsSigned != *IsSigned)
        break;

      placeSources(*Src0, *Src1, Src0s, Src1s, I);
      auto AddIdx = 1 - MulIdx;
      // Allow the special case where the final addend is itself a mul,
      // closing a chain of length I + 2.
      if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
        Src2s.push_back(TempNode->getOperand(AddIdx));
        // ... (check and place the second mul's operands as above)
        auto IterIsSigned = checkDot4MulSignedness(
            TempNode->getOperand(AddIdx), *Src0, *Src1,
            TempNode->getOperand(AddIdx)->getOperand(0),
            TempNode->getOperand(AddIdx)->getOperand(1), DAG);
        // ...
        if (*IterIsSigned != *IsSigned)
          break;
        ChainLength = I + 2;
        break;
      }

      TempNode = TempNode->getOperand(AddIdx);
      // ... (record the running addend in Src2s)
      ChainLength = I + 1;
      if (TempNode->getNumOperands() < 2)
        break;
      LHS = TempNode->getOperand(0);
      RHS = TempNode->getOperand(1);
    }

    if (ChainLength < 2)
      return SDValue();
    // The masks assume a chain of length 4; for shorter chains zero out the
    // unused high byte lanes (0x0c in the perm mask) so they do not affect
    // the dot result.
    if (ChainLength < 4) {
      fixMasks(Src0s, ChainLength);
      fixMasks(Src1s, ChainLength);
    }

    SDValue Src0, Src1;

    // If both operands come from single dwords with the same byte
    // permutation, the sources can be used directly (commutativity).
    bool UseOriginalSrc = false;
    if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
        Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
        Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
        Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
      SmallVector<unsigned, 4> SrcBytes;
      auto Src0Mask = Src0s.begin()->PermMask;
      SrcBytes.push_back(Src0Mask & 0xFF000000);
      bool UniqueEntries = true;
      for (auto I = 1; I < 4; I++) {
        auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
        if (is_contained(SrcBytes, NextByte)) {
          UniqueEntries = false;
          break;
        }
        SrcBytes.push_back(NextByte);
      }

      if (UniqueEntries) {
        UseOriginalSrc = true;

        auto FirstElt = Src0s.begin();
        auto FirstEltOp = getDWordFromOffset(DAG, SL, FirstElt->SrcOp,
                                             FirstElt->DWordOffset);
        auto SecondElt = Src1s.begin();
        auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
                                              SecondElt->DWordOffset);
        // ...
      }
    }

    if (!UseOriginalSrc) {
      Src0 = resolveSources(DAG, SL, Src0s, false, true);
      Src1 = resolveSources(DAG, SL, Src1s, false, true);
    }

    SDValue Src2 =
        DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);

    SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
                                                  : Intrinsic::amdgcn_udot4,
                                        SL, MVT::i64);
    // ... (emit the intrinsic node and return it)
  }

  if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
    return SDValue();

  // add x, zext (setcc) => uaddo_carry x, 0, setcc
  // add x, sext (setcc) => usubo_carry x, 0, setcc
  unsigned Opc = LHS.getOpcode();
  if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
      Opc == ISD::UADDO_CARRY)
    std::swap(RHS, LHS);

  Opc = RHS.getOpcode();
  switch (Opc) {
  default:
    break;
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::UADDO_CARRY: {
    auto Cond = RHS.getOperand(0);
    // ... (require a boolean SGPR condition)
    SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
    SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
    Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
    return DAG.getNode(Opc, SL, VTList, Args);
  }
  }
  return SDValue();
}
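// Illustrative sketch of the dot4 rewrite above (operand names assumed):
//   add(add(add(add(z, mul(a0,b0)), mul(a1,b1)), mul(a2,b2)), mul(a3,b3))
// over byte-sized a/b values becomes a single
//   llvm.amdgcn.udot4(packed_a, packed_b, z)
// (or sdot4 for the signed case) once the bytes are gathered into dword
// sources with v_perm masks.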
SDValue SITargetLowering::performSubCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);

  if (VT != MVT::i32)
    return SDValue();

  SDLoc SL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // sub x, zext (setcc) => usubo_carry x, 0, setcc
  // sub x, sext (setcc) => uaddo_carry x, 0, setcc
  unsigned Opc = RHS.getOpcode();
  switch (Opc) {
  default:
    break;
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::USUBO_CARRY: {
    auto Cond = RHS.getOperand(0);
    // ... (require a boolean SGPR condition)
    SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
    SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
    Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
    return DAG.getNode(Opc, SL, VTList, Args);
  }
  }
  // ...
SDValue SITargetLowering::performAddCarrySubCarryCombine(
    SDNode *N, DAGCombinerInfo &DCI) const {
  if (N->getValueType(0) != MVT::i32)
    return SDValue();

  // ...
  SDValue LHS = N->getOperand(0);
  unsigned LHSOpc = LHS.getOpcode();
  unsigned Opc = N->getOpcode();
  // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc, and similarly
  // for usubo_carry (details elided in this excerpt).
  // ...
SDValue SITargetLowering::performFAddCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDLoc SL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // These should really be instruction patterns, but writing patterns with
  // source modifiers is a pain.

  // fadd (fadd (a, a), b) -> mad 2.0, a, b
  if (LHS.getOpcode() == ISD::FADD) {
    SDValue A = LHS.getOperand(0);
    if (A == LHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
      if (FusedOp != 0) {
        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
        return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
      }
    }
  }

  // fadd (b, fadd (a, a)) -> mad 2.0, a, b
  if (RHS.getOpcode() == ISD::FADD) {
    SDValue A = RHS.getOperand(0);
    if (A == RHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
      if (FusedOp != 0) {
        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
        return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
      }
    }
  }

  return SDValue();
}
SDValue SITargetLowering::performFSubCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  // Try to get the fneg to fold into the source modifier. This undoes
  // generic DAG combines and folds them into the mad.
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  if (LHS.getOpcode() == ISD::FADD) {
    // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
    SDValue A = LHS.getOperand(0);
    if (A == LHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
      if (FusedOp != 0) {
        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
        SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
        return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
      }
    }
  }

  if (RHS.getOpcode() == ISD::FADD) {
    // (fsub c, (fadd a, a)) -> mad -2.0, a, c
    SDValue A = RHS.getOperand(0);
    if (A == RHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
      if (FusedOp != 0) {
        const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
        return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
      }
    }
  }

  return SDValue();
}
SDValue SITargetLowering::performFDivCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  EVT VT = N->getValueType(0);
  if (VT != MVT::f16 || !Subtarget->has16BitInsts())
    return SDValue();

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  // ...

  bool IsNegative = false;
  if (CLHS->isExactlyValue(1.0) ||
      (IsNegative = CLHS->isExactlyValue(-1.0))) {
    // ... (form the rcp under the required fast-math conditions)
  }
  // ...
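  // Sketch of the intended rewrite (per the constants matched above):
  //   fdiv  1.0, x  ->  rcp(x)
  //   fdiv -1.0, x  ->  rcp(fneg x)
  // assuming the relevant fast-math flags permit the rcp approximation.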
SDValue SITargetLowering::performFMACombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDLoc SL(N);

  // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
  //   FDOT2((V2F16)S0, (V2F16)S1, (F32)z)
  // The fold requires contraction to be allowed on both FMAs, among other
  // conditions elided here.
  if (N->getFlags().hasAllowContract() &&
      FMA->getFlags().hasAllowContract()) {
    // ...
  }

  // Both halves must extract from the same pair of v2f16 vectors.
  if (Vec1 == Vec2 || Vec3 == Vec4)
    return SDValue();

  if ((Vec1 == Vec3 && Vec2 == Vec4) ||
      (Vec1 == Vec4 && Vec2 == Vec3)) {
    // ... (emit the FDOT2 node)
  }
  return SDValue();
}
SDValue SITargetLowering::performSetCCCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = LHS.getValueType();
  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();

  auto CRHS = dyn_cast<ConstantSDNode>(RHS);
  if (!CRHS) {
    CRHS = dyn_cast<ConstantSDNode>(LHS);
    // ... (swap operands and the condition code when the constant is on
    //      the left, then in the boolean-source cases below)
    //        return LHS.getOperand(0);
  }

  // setcc (select cc, CT, CF), CF, eq => xor cc, -1
  // setcc (select cc, CT, CF), CF, ne => cc
  // setcc (select cc, CT, CF), CT, ne => xor cc, -1
  // setcc (select cc, CT, CF), CT, eq => cc
  if (CRHS && LHS.getOpcode() == ISD::SELECT &&
      isa<ConstantSDNode>(LHS.getOperand(1)) &&
      isa<ConstantSDNode>(LHS.getOperand(2)) &&
      LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
      isBoolSGPR(LHS.getOperand(0))) {
    const APInt &CT = LHS.getConstantOperandAPInt(1);
    const APInt &CF = LHS.getConstantOperandAPInt(2);
    // ... (pick the matching form above; the "cc" cases simply)
    //        return LHS.getOperand(0);
  }

  if (VT != MVT::f32 && VT != MVT::f64 &&
      (!Subtarget->has16BitInsts() || VT != MVT::f16))
    return SDValue();
  // ...
SDValue
SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;

  SDValue Src = N->getOperand(0);
  SDValue Shift = N->getOperand(0);

  if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
    // cvt_f32_ubyteN of a byte-aligned shift reads a different byte of the
    // unshifted source instead, e.g. cvt_f32_ubyte0 (srl x, 16) becomes
    // cvt_f32_ubyte2 x.
    if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
      SDValue Shifted = DAG.getZExtOrTrunc(
          Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);

      unsigned ShiftOffset = 8 * Offset;
      if (Shift.getOpcode() == ISD::SHL)
        ShiftOffset -= C->getZExtValue();
      else
        ShiftOffset += C->getZExtValue();

      if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
        return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
                           MVT::f32, Shifted);
      }
    }
  }

  // Only the selected byte of the source is demanded; try to simplify the
  // source based on that.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
  if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
    // Revisit this node so any simplification is folded properly.
    DCI.AddToWorklist(N);
    return SDValue(N, 0);
  }

  if (SDValue DemandedSrc =
          TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
    return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);

  return SDValue();
}
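// Worked example (input assumed): cvt_f32_ubyte1(srl x, 8) gives
// ShiftOffset = 8 * 1 + 8 = 16, so the node becomes cvt_f32_ubyte2(x),
// reading the same byte of x without the explicit shift.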
SDValue SITargetLowering::performClampCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
  if (!CSrc)
    return SDValue();

  const MachineFunction &MF = DCI.DAG.getMachineFunction();
  const APFloat &F = CSrc->getValueAPF();
  APFloat Zero = APFloat::getZero(F.getSemantics());
  if (F < Zero ||
      (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp))
    return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));

  APFloat One(F.getSemantics(), "1.0");
  if (F > One)
    return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));

  return SDValue(CSrc, 0);
}
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  switch (N->getOpcode()) {
  case ISD::ADD:
    return performAddCombine(N, DCI);
  case ISD::SUB:
    return performSubCombine(N, DCI);
  case ISD::UADDO_CARRY:
  case ISD::USUBO_CARRY:
    return performAddCarrySubCarryCombine(N, DCI);
  case ISD::FADD:
    return performFAddCombine(N, DCI);
  case ISD::FSUB:
    return performFSubCombine(N, DCI);
  case ISD::FDIV:
    return performFDivCombine(N, DCI);
  case ISD::SETCC:
    return performSetCCCombine(N, DCI);
  case ISD::FMAXNUM:
  case ISD::FMINNUM:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINNUM_IEEE:
  case ISD::SMAX:
  case ISD::SMIN:
  case ISD::UMAX:
  case ISD::UMIN:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
    return performMinMaxCombine(N, DCI);
  case ISD::FMA:
    return performFMACombine(N, DCI);
  case ISD::AND:
    return performAndCombine(N, DCI);
  case ISD::OR:
    return performOrCombine(N, DCI);
  case ISD::FSHR: {
    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
        TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
      return matchPERM(N, DCI);
    }
    break;
  }
  case ISD::XOR:
    return performXorCombine(N, DCI);
  case ISD::ZERO_EXTEND:
    return performZeroExtendCombine(N, DCI);
  case ISD::SIGN_EXTEND_INREG:
    return performSignExtendInRegCombine(N, DCI);
  case AMDGPUISD::FP_CLASS:
    return performClassCombine(N, DCI);
  case ISD::FCANONICALIZE:
    return performFCanonicalizeCombine(N, DCI);
  case AMDGPUISD::RCP:
    return performRcpCombine(N, DCI);
  // ... (several unary AMDGPU nodes fold away an undef operand here)
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
    return performUCharToFloatCombine(N, DCI);
  case ISD::FCOPYSIGN:
    return performFCopySignCombine(N, DCI);
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
    return performCvtF32UByteNCombine(N, DCI);
  case AMDGPUISD::FMED3:
    return performFMed3Combine(N, DCI);
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
    return performCvtPkRTZCombine(N, DCI);
  case AMDGPUISD::CLAMP:
    return performClampCombine(N, DCI);
  case ISD::SCALAR_TO_VECTOR: {
    SelectionDAG &DAG = DCI.DAG;
    EVT VT = N->getValueType(0);

    // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
    if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
      SDLoc SL(N);
      SDValue Src = N->getOperand(0);
      EVT EltVT = Src.getValueType();
      if (EltVT != MVT::i16)
        Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);

      SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
      return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
    }
    break;
  }
  case ISD::EXTRACT_VECTOR_ELT:
    return performExtractVectorEltCombine(N, DCI);
  case ISD::INSERT_VECTOR_ELT:
    return performInsertVectorEltCombine(N, DCI);
  case ISD::FP_ROUND:
    return performFPRoundCombine(N, DCI);
  case ISD::LOAD: {
    if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
      return Widened;
    [[fallthrough]];
  }
  default: {
    if (!DCI.isBeforeLegalize()) {
      if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
        return performMemSDNodeCombine(MemNode, DCI);
    }
    break;
  }
  }
  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
/// Helper function for adjustWritemask.
static unsigned SubIdx2Lane(unsigned Idx) {
  switch (Idx) {
  default:
    return ~0u;
  case AMDGPU::sub0:
    return 0;
  case AMDGPU::sub1:
    return 1;
  case AMDGPU::sub2:
    return 2;
  case AMDGPU::sub3:
    return 3;
  case AMDGPU::sub4:
    return 4; // Possible with TFE/LWE
  }
}
/// Adjust the writemask of MIMG instructions.
SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
                                          SelectionDAG &DAG) const {
  unsigned Opcode = Node->getMachineOpcode();

  // Subtract 1 because the vdata output is not a MachineSDNode operand.
  int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
  if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
    return Node; // not implemented for D16

  SDNode *Users[5] = {nullptr};
  unsigned Lane = 0;
  unsigned DmaskIdx =
      AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
  unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
  unsigned NewDmask = 0;
  unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
  unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
  bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
                  (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
                     ? true
                     : false;
  unsigned TFCLane = 0;
  bool HasChain = Node->getNumValues() > 1;

  if (OldDmask == 0) {
    // These are folded out, but on the chance it happens don't assert.
    return Node;
  }

  unsigned OldBitsSet = llvm::popcount(OldDmask);
  // Work out which is the TFE/LWE lane if that is enabled.
  if (UsesTFC) {
    TFCLane = OldBitsSet;
  }

  // Try to figure out the used register components.
  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); I != E;
       ++I) {
    // Don't look at users of the chain.
    if (I.getUse().getResNo() != 0)
      continue;

    // Abort if we can't understand the usage.
    if (!I->isMachineOpcode() ||
        I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
      return Node;

    Lane = SubIdx2Lane(I->getConstantOperandVal(1));
    if (Lane == ~0u)
      return Node;

    // Check if the use is for the TFE/LWE generated result at VGPRn+1.
    if (UsesTFC && Lane == TFCLane) {
      Users[Lane] = *I;
    } else {
      // Set which texture component corresponds to the lane.
      unsigned Comp;
      for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
        Comp = llvm::countr_zero(Dmask);
        Dmask &= ~(1 << Comp);
      }

      // Abort if we have more than one user per component.
      if (Users[Lane])
        return Node;

      Users[Lane] = *I;
      NewDmask |= 1 << Comp;
    }
  }

  // Don't allow 0 dmask, as hardware assumes one channel enabled.
  bool NoChannels = !NewDmask;
  if (NoChannels) {
    if (!UsesTFC)
      return Node;
    // If the original dmask has one channel, there is nothing to do.
    if (OldBitsSet == 1)
      return Node;
    // Use an arbitrary dmask so the instruction still works.
    NewDmask = 1;
  }
  // Abort if there's no change.
  if (NewDmask == OldDmask)
    return Node;

  unsigned BitsSet = llvm::popcount(NewDmask);

  // TFE/LWE produce an extra result channel.
  unsigned NewChannels = BitsSet + UsesTFC;

  int NewOpcode =
      AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
  assert(NewOpcode != -1 &&
         NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
         "failed to find equivalent MIMG op");
  // Adjust the writemask in the node (operand copying elided).
  MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();

  MVT ResultVT = NewChannels == 1
                     ? SVT
                     : MVT::getVectorVT(SVT, NewChannels == 3   ? 4
                                             : NewChannels == 5 ? 8
                                                                : NewChannels);
  // ... (create the replacement machine node)

  if (NewChannels == 1) {
    // ... (replace the single extract_subreg use directly)
    return Node;
  }

  // Update the users of the node with the new subregister indices.
  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
    SDNode *User = Users[i];
    if (!User) {
      // Special case for NoChannels: NewDmask was set to 1 above, but
      // channel 0 has no real use.
      if (i || !NoChannels)
        continue;
    } else {
      SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
      SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
      if (NewUser != User) {
        DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
        DAG.RemoveDeadNode(User);
      }
    }

    switch (Idx) {
    default:
      break;
    case AMDGPU::sub0:
      Idx = AMDGPU::sub1;
      break;
    case AMDGPU::sub1:
      Idx = AMDGPU::sub2;
      break;
    case AMDGPU::sub2:
      Idx = AMDGPU::sub3;
      break;
    case AMDGPU::sub3:
      Idx = AMDGPU::sub4;
      break;
    }
  }
  // ...
}

static bool isFrameIndexOp(SDValue Op) {
  if (Op.getOpcode() == ISD::AssertZext)
    Op = Op.getOperand(0);

  return isa<FrameIndexSDNode>(Op);
}
  // In legalizeTargetIndependentNode(): route an i1 CopyToReg through a
  // VReg_1 virtual register so i1-copy lowering never sees a physical
  // destination.
  RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
  SDValue SrcVal = Node->getOperand(2);
  if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
    SDValue VReg = DAG.getRegister(
        MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);

    SDNode *Glued = Node->getGluedNode();
    SDValue ToVReg = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
                                      SDValue(Glued, /* glue index elided */ 0));
    // ... (then copy from the VReg_1 into the requested physical register)
    return ToResultReg.getNode();
  }

  // Replace frame index operands with S_MOV_B32 of the frame index.
  for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
    if (!isFrameIndexOp(Node->getOperand(i))) {
      Ops.push_back(Node->getOperand(i));
      continue;
    }

    SDLoc DL(Node);
    Ops.push_back(SDValue(
        DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
                           Node->getOperand(i).getValueType(),
                           Node->getOperand(i)),
        0));
  }
  // ...
/// Fold the instructions after selecting them.
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
                                          SelectionDAG &DAG) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  unsigned Opcode = Node->getMachineOpcode();

  if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
      !TII->isGather4(Opcode) &&
      AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
    return adjustWritemask(Node, DAG);
  }

  if (Opcode == AMDGPU::INSERT_SUBREG ||
      Opcode == AMDGPU::REG_SEQUENCE) {
    legalizeTargetIndependentNode(Node, DAG);
    return Node;
  }

  switch (Opcode) {
  case AMDGPU::V_DIV_SCALE_F32_e64:
  case AMDGPU::V_DIV_SCALE_F64_e64: {
    // Satisfy the operand register constraint when one of the inputs is
    // undefined. Ordinarily each undef value will have its own
    // implicit_def of a vreg, so force these to use a single register.
    SDValue Src0 = Node->getOperand(1);
    SDValue Src1 = Node->getOperand(3);
    SDValue Src2 = Node->getOperand(5);

    // ... (skip when the sources already share the implicit_def)
    if ((Src0.isMachineOpcode() &&
         Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) &&
        (Src0 == Src1 || Src0 == Src2))
      break;
    // ...
  }
  // ...
  }
  // In AddIMGInit(): zero-initialize the extra result lanes that TFE/LWE
  // demand on image instructions.
  unsigned InitIdx = 0;

  if (TII->isImage(MI)) {
    MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
    MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
    MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);

    unsigned TFEVal = TFE ? TFE->getImm() : 0;
    unsigned LWEVal = LWE ? LWE->getImm() : 0;
    unsigned D16Val = D16 ? D16->getImm() : 0;

    if (!TFEVal && !LWEVal)
      return;

    // At least one of TFE or LWE is set: insert a suitable initialization
    // of the result value and tie it to the dest of the image instruction.
    const MachineOperand *MO_Dmask =
        TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
    assert(MO_Dmask && "Expected dmask operand in instruction");
    unsigned dmask = MO_Dmask->getImm();

    // Determine the number of active lanes, accounting for the gather4
    // special case and for packed D16, which halves the lane count.
    unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
    bool Packed = !Subtarget->hasUnpackedD16VMem();

    InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;

    // Abandon the attempt if the dst size isn't large enough; the error is
    // reported elsewhere.
    uint32_t DstSize =
        TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
    if (DstSize < InitIdx)
      return;
  } else {
    InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
  }

  // Create a register for the initialization value.
  Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
  unsigned NewDst = 0; // Final initialized value will be in here.

  // ... (build up the initialized value one dword at a time)
  for (; SizeLeft; SizeLeft--, CurrIdx++) {
    NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
    // ... (initialize this dword)
  }

  // ... then tie the just-added implicit operand to the dst.
  MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
  // In AdjustInstrPostInstrSelection():
  if (TII->isVOP3(MI.getOpcode())) {
    // Make sure constant bus requirements are respected.
    TII->legalizeOperandsVOP3(MRI, MI);

    // Prefer VGPRs over AGPRs in mAI instructions where possible.
    if (!MI.getDesc().operands().empty()) {
      unsigned Opc = MI.getOpcode();
      bool HasAGPRs = Info->mayNeedAGPRs();
      const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
      int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
      for (auto I :
           {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
        if (I == -1)
          break;
        if ((I == Src2Idx) && (HasAGPRs))
          break;
        MachineOperand &Op = MI.getOperand(I);
        if (!Op.isReg() || !Op.getReg().isVirtual())
          continue;
        auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
        if (!TRI->hasAGPRs(RC))
          continue;
        auto *Src = MRI.getUniqueVRegDef(Op.getReg());
        if (!Src || !Src->isCopy() ||
            !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
          continue;
        auto *NewRC = TRI->getEquivalentVGPRClass(RC);
        // All uses of agpr64 and agpr32 can also accept vgpr, except for
        // v_accvgpr_read; no agpr reads are produced during selection, so
        // no use checks are needed.
        MRI.setRegClass(Op.getReg(), NewRC);
      }

      if (!HasAGPRs)
        return;

      // Resolve the rest of the AV operands to AGPRs.
      if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
        if (Src2->isReg() && Src2->getReg().isVirtual()) {
          auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
          if (TRI->isVectorSuperClass(RC)) {
            auto *NewRC = TRI->getEquivalentAGPRClass(RC);
            MRI.setRegClass(Src2->getReg(), NewRC);
            if (Src2->isTied())
              MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
          }
        }
      }
    }

    return;
  }

  if (TII->isImage(MI))
    TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
  // From wrapAddr64Rsrc(): pack two dwords of the descriptor.
  SDValue Lo = SDValue(
      DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);

  // From buildRSRC(): materialize the low half of RsrcDword2And3.
  SDValue DataLo =
      buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
std::pair<unsigned, const TargetRegisterClass *>
SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
                                               StringRef Constraint,
                                               MVT VT) const {
  const TargetRegisterClass *RC = nullptr;
  if (Constraint.size() == 1) {
    const unsigned BitWidth = VT.getSizeInBits();
    switch (Constraint[0]) {
    default:
      return TargetLowering::getRegForInlineAsmConstraint(TRI_, Constraint, VT);
    case 's':
    case 'r':
      switch (BitWidth) {
      case 16:
        RC = &AMDGPU::SReg_32RegClass;
        break;
      case 64:
        RC = &AMDGPU::SGPR_64RegClass;
        break;
      default:
        // ... (wider widths use the SGPR class for the bit width)
        if (!RC)
          return std::pair(0U, nullptr);
        break;
      }
      break;
    case 'v':
      // Like 's', but over the VGPR classes, starting from VGPR_32.
      RC = &AMDGPU::VGPR_32RegClass;
      // ...
      if (!RC)
        return std::pair(0U, nullptr);
      break;
    case 'a':
      // Like 'v', but over the AGPR classes, starting from AGPR_32.
      RC = &AMDGPU::AGPR_32RegClass;
      // ...
      if (!RC)
        return std::pair(0U, nullptr);
      break;
    }
    // ...
    return std::pair(0U, RC);
  }
  // Parse explicit register references of the form {v5}, {s[0:1]}, {a2}.
  if (Constraint.starts_with("{") && Constraint.ends_with("}")) {
    StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
    if (RegName.consume_front("v")) {
      RC = &AMDGPU::VGPR_32RegClass;
    } else if (RegName.consume_front("s")) {
      RC = &AMDGPU::SGPR_32RegClass;
    } else if (RegName.consume_front("a")) {
      RC = &AMDGPU::AGPR_32RegClass;
    }

    if (RC) {
      uint32_t Idx;
      if (RegName.consume_front("[")) {
        // Register ranges: parse "Idx:End]" and pick the class matching the
        // total width.
        uint32_t End;
        bool Failed = RegName.consumeInteger(10, Idx);
        Failed |= !RegName.consume_front(":");
        Failed |= RegName.consumeInteger(10, End);
        Failed |= !RegName.consume_back("]");
        if (!Failed) {
          uint32_t Width = (End - Idx + 1) * 32;
          MCRegister Reg = RC->getRegister(Idx);
          if (SIRegisterInfo::isVGPRClass(RC))
            RC = TRI->getVGPRClassForBitWidth(Width);
          else if (SIRegisterInfo::isSGPRClass(RC))
            RC = TRI->getSGPRClassForBitWidth(Width);
          else if (SIRegisterInfo::isAGPRClass(RC))
            RC = TRI->getAGPRClassForBitWidth(Width);
          if (RC) {
            Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
            return std::pair(Reg, RC);
          }
        }
      } else {
        bool Failed = RegName.getAsInteger(10, Idx);
        if (!Failed && Idx < RC->getNumRegs())
          return std::pair(RC->getRegister(Idx), RC);
      }
    }
  }

  auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI_, Constraint, VT);
  if (Ret.first)
    Ret.second = TRI->getPhysRegBaseClass(Ret.first);

  return Ret;
}
static bool isImmConstraint(StringRef Constraint) {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default:
      break;
    case 'I':
    case 'J':
    case 'A':
    case 'B':
    case 'C':
      return true;
    }
  } else if (Constraint == "DA" ||
             Constraint == "DB") {
    return true;
  }
  return false;
}

SITargetLowering::ConstraintType
SITargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default:
      break;
    case 's':
    case 'v':
    case 'a':
      return C_RegisterClass;
    }
  }
  if (isImmConstraint(Constraint))
    return C_Other;
  return TargetLowering::getConstraintType(Constraint);
}
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
  if (Size == 64)
    return Val;
  return Val & maskTrailingOnes<uint64_t>(Size);
}

void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                    StringRef Constraint,
                                                    std::vector<SDValue> &Ops,
                                                    SelectionDAG &DAG) const {
  if (isImmConstraint(Constraint)) {
    uint64_t Val;
    if (getAsmOperandConstVal(Op, Val) &&
        checkAsmConstraintVal(Op, Constraint, Val)) {
      Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
      Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
    }
  } else {
    TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
  }
}

bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
  unsigned Size = Op.getScalarValueSizeInBits();
  if (Size > 64)
    return false;

  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
    Val = C->getSExtValue();
    return true;
  }
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
    Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
    return true;
  }
  if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
    if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
      return false;
    if (ConstantSDNode *C = V->getConstantSplatNode()) {
      Val = C->getSExtValue();
      return true;
    }
    if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
      Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
      return true;
    }
  }

  return false;
}

bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
                                             uint64_t Val) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'J':
      return isInt<16>(Val);
    case 'B':
      return isInt<32>(Val);
    // ... (the 'I', 'A' and 'C' cases check inline-literal forms)
    default:
      break;
    }
  } else if (Constraint.size() == 2) {
    if (Constraint == "DA") {
      // Both halves of the 64-bit value must be acceptable 32-bit operands.
      int64_t HiBits = static_cast<int32_t>(Val >> 32);
      int64_t LoBits = static_cast<int32_t>(Val);
      return checkAsmConstraintValA(Op, HiBits, 32) &&
             checkAsmConstraintValA(Op, LoBits, 32);
    }
    if (Constraint == "DB") {
      return true;
    }
  }
  // ...
}
bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
                                              unsigned MaxSize) const {
  unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
  if (Size == 16) {
    MVT VT = Op.getSimpleValueType();
    // ... (check the 16-bit inline-immediate forms by type)
  }
  // ... (32- and 64-bit inline-immediate checks)
  return false;
}
static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
  switch (UnalignedClassID) {
  case AMDGPU::VReg_64RegClassID:
    return AMDGPU::VReg_64_Align2RegClassID;
  case AMDGPU::VReg_96RegClassID:
    return AMDGPU::VReg_96_Align2RegClassID;
  case AMDGPU::VReg_128RegClassID:
    return AMDGPU::VReg_128_Align2RegClassID;
  case AMDGPU::VReg_160RegClassID:
    return AMDGPU::VReg_160_Align2RegClassID;
  case AMDGPU::VReg_192RegClassID:
    return AMDGPU::VReg_192_Align2RegClassID;
  case AMDGPU::VReg_224RegClassID:
    return AMDGPU::VReg_224_Align2RegClassID;
  case AMDGPU::VReg_256RegClassID:
    return AMDGPU::VReg_256_Align2RegClassID;
  case AMDGPU::VReg_288RegClassID:
    return AMDGPU::VReg_288_Align2RegClassID;
  case AMDGPU::VReg_320RegClassID:
    return AMDGPU::VReg_320_Align2RegClassID;
  case AMDGPU::VReg_352RegClassID:
    return AMDGPU::VReg_352_Align2RegClassID;
  case AMDGPU::VReg_384RegClassID:
    return AMDGPU::VReg_384_Align2RegClassID;
  case AMDGPU::VReg_512RegClassID:
    return AMDGPU::VReg_512_Align2RegClassID;
  case AMDGPU::VReg_1024RegClassID:
    return AMDGPU::VReg_1024_Align2RegClassID;
  case AMDGPU::AReg_64RegClassID:
    return AMDGPU::AReg_64_Align2RegClassID;
  case AMDGPU::AReg_96RegClassID:
    return AMDGPU::AReg_96_Align2RegClassID;
  case AMDGPU::AReg_128RegClassID:
    return AMDGPU::AReg_128_Align2RegClassID;
  case AMDGPU::AReg_160RegClassID:
    return AMDGPU::AReg_160_Align2RegClassID;
  case AMDGPU::AReg_192RegClassID:
    return AMDGPU::AReg_192_Align2RegClassID;
  case AMDGPU::AReg_256RegClassID:
    return AMDGPU::AReg_256_Align2RegClassID;
  case AMDGPU::AReg_512RegClassID:
    return AMDGPU::AReg_512_Align2RegClassID;
  case AMDGPU::AReg_1024RegClassID:
    return AMDGPU::AReg_1024_Align2RegClassID;
  default:
    return -1;
  }
}
  // In finalizeLowering():
  if (Info->isEntryFunction()) {
    // Callable functions have fixed registers used for stack access.
    reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
  }

  // Reserve the SGPR(s) used to save and restore EXEC for WWM handling.
  unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
  Register SReg = ST.isWave32()
                      ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
                      : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
                                                     &AMDGPU::SGPR_64RegClass);
  Info->setSGPRForEXECCopy(SReg);

  assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
                             Info->getStackPtrOffsetReg()));
  if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
    MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());

  // We need to worry about replacing the default register with itself in
  // case of MIR testcases missing the MFI.
  if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
    MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());

  if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
    MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());

  Info->limitOccupancy(MF);

  if (ST.isWave32() && !MF.empty()) {
    for (auto &MBB : MF) {
      for (auto &MI : MBB) {
        TII->fixImplicitOperands(MI);
      }
    }
  }

  // FIXME: This is a hack to fix up AGPR classes to use the properly aligned
  // classes if required. Ideally the register class constraints would differ
  // per-subtarget, but there is no easy way to achieve that right now.
  if (ST.needsAlignedVGPRs()) {
    for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
      const Register Reg = Register::index2VirtReg(I);
      const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
      if (!RC)
        continue;
      int NewClassID = getAlignedAGPRClassID(RC->getID());
      if (NewClassID != -1)
        MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
    }
  }
void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                     KnownBits &Known,
                                                     const APInt &DemandedElts,
                                                     const SelectionDAG &DAG,
                                                     unsigned Depth) const {
  unsigned Opc = Op.getOpcode();
  switch (Opc) {
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IID = Op.getConstantOperandVal(0);
    switch (IID) {
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi: {
      const GCNSubtarget &ST =
          DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
      // mbcnt returns at most (wavefront size - 1) plus src1, so bound the
      // result by the larger of src1's active bits and log2(wave size),
      // plus one bit for the potential carry. Src1ValBits is derived from
      // the known bits of the src1 operand (derivation elided here).
      unsigned MaxActiveBits = std::max(Src1ValBits, ST.getWavefrontSizeLog2());
      MaxActiveBits += Src1ValBits ? 1 : 0;
      unsigned Size = Op.getValueType().getSizeInBits();
      if (MaxActiveBits < Size)
        Known.Zero.setHighBits(Size - MaxActiveBits);
      return;
    }
    }
    break;
  }
  // ...
  }
  AMDGPUTargetLowering::computeKnownBitsForTargetNode(Op, Known, DemandedElts,
                                                      DAG, Depth);
}
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB,
                                   KnownBits &Known, unsigned Dim) {
  unsigned MaxValue =
      ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
  Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
}

void SITargetLowering::computeKnownBitsForTargetInstr(
    GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
    const MachineRegisterInfo &MRI, unsigned Depth) const {
  const MachineInstr *MI = MRI.getVRegDef(R);
  switch (MI->getOpcode()) {
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
    case Intrinsic::amdgcn_workitem_id_x:
      knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
      break;
    case Intrinsic::amdgcn_workitem_id_y:
      knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
      break;
    case Intrinsic::amdgcn_workitem_id_z:
      knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
      break;
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi: {
      // These return at most the wavefront size - 1.
      unsigned Size = MRI.getType(R).getSizeInBits();
      Known.Zero.setHighBits(Size - getSubtarget()->getWavefrontSizeLog2());
      break;
    }
    case Intrinsic::amdgcn_groupstaticsize: {
      // Everything above the maximum LDS size is known zero; the actual
      // value is not known to be accurate at this point.
      // ...
      break;
    }
    }
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
    Known.Zero.setHighBits(24);
    break;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
    Known.Zero.setHighBits(16);
    break;
  case AMDGPU::G_AMDGPU_SMED3:
  case AMDGPU::G_AMDGPU_UMED3: {
    auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
    // ... (combine the known bits of the three sources)
    break;
  }
  }
}
Align SITargetLowering::computeKnownAlignForTargetInstr(
    GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,
    unsigned Depth) const {
  const MachineInstr *MI = MRI.getVRegDef(R);
  if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
    // An intrinsic result's alignment comes from its declared return
    // attributes.
    AttributeList Attrs =
        Intrinsic::getAttributes(KB.getMachineFunction().getFunction().getContext(),
                                 GI->getIntrinsicID());
    if (MaybeAlign RetAlign = Attrs.getRetAlignment())
      return *RetAlign;
  }
  return Align(1);
}

Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
  const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
  const Align CacheLineAlign = Align(64);
  // ...
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  const MachineBasicBlock *Header = ML->getHeader();
  if (Header->getAlignment() != PrefAlign)
    return Header->getAlignment(); // Must have been set by a pragma.

  unsigned LoopSize = 0;
  for (const MachineBasicBlock *MBB : ML->blocks()) {
    // ...
    for (const MachineInstr &MI : *MBB) {
      LoopSize += TII->getInstSizeInBytes(MI);
      if (LoopSize > 192)
        return PrefAlign;
    }
  }

  if (LoopSize <= 64)
    return PrefAlign;
  if (LoopSize <= 128)
    return CacheLineAlign;
  // If any parent loop is already bracketed by prefetch instructions, do not
  // insert new ones for this inner loop; that would reset the parent's
  // settings.
  for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
    if (MachineBasicBlock *Exit = P->getExitBlock()) {
      auto I = Exit->getFirstNonDebugInstr();
      if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
        return CacheLineAlign;
    }
  }

  // Otherwise bracket the loop: switch the prefetch mode in the preheader
  // and restore it in the exit block (S_INST_PREFETCH insertion elided).
  MachineBasicBlock *Pre = ML->getLoopPreheader();
  MachineBasicBlock *Exit = ML->getExitBlock();
  if (Pre && Exit) {
    auto PreTerm = Pre->getFirstTerminator();
    if (PreTerm == Pre->begin() ||
        std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH) {
      // ...
    }

    auto ExitHead = Exit->getFirstNonDebugInstr();
    if (ExitHead == Exit->end() ||
        ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH) {
      // ...
    }
  }

  return CacheLineAlign;
}
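// Size thresholds above, in bytes of loop code: at most 64 needs no extra
// alignment; at most 128 gets cache-line (64-byte) alignment; at most 192
// additionally gets the S_INST_PREFETCH bracketing; anything larger keeps
// the default alignment.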
LLVM_ATTRIBUTE_UNUSED
static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
  assert(N->getOpcode() == ISD::CopyFromReg);
  do {
    // Follow the chain until we find an INLINEASM node.
    N = N->getOperand(0).getNode();
    if (N->getOpcode() == ISD::INLINEASM ||
        N->getOpcode() == ISD::INLINEASM_BR)
      return true;
  } while (N->getOpcode() == ISD::CopyFromReg);
  return false;
}

bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
                                                  FunctionLoweringInfo *FLI,
                                                  UniformityInfo *UA) const {
  switch (N->getOpcode()) {
  case ISD::CopyFromReg: {
    const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
    const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
    const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
    Register Reg = R->getReg();

    if (Reg.isPhysical() || MRI.isLiveIn(Reg))
      return !TRI->isSGPRReg(MRI, Reg);

    if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
      return UA->isDivergent(V);

    assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
    return !TRI->isSGPRReg(MRI, Reg);
  }
  case ISD::LOAD: {
    const LoadSDNode *L = cast<LoadSDNode>(N);
    unsigned AS = L->getAddressSpace();
    // A flat load may access private memory.
    return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
  }
  // ... (calls, intrinsics, and target atomic nodes are handled here)
  }
  if (auto *A = dyn_cast<AtomicSDNode>(N)) {
    // Generic read-modify-write atomics are sources of divergence.
    return A->readMem() && A->writeMem();
  }
  return false;
}
  // In isKnownNeverNaNForTargetNode(): a clamp is never NaN under DX10
  // clamp mode, since NaN inputs clamp to 0.
  if (Info->getMode().DX10Clamp)
    return true; // Clamped to 0.
  return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);

static bool unsafeFPAtomicsDisabled(Function *F) {
  return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
         "true";
}

// In emitAtomicRMWLegalRemark():
  return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
         << "Hardware instruction generated for atomic "
         << RMW->getOperationName(RMW->getOperation())
         << " operation at memory scope " << MemScope;
  // In shouldExpandAtomicRMWInIR(): system-scope atomics (including the
  // "one-as" scope) are treated more conservatively by the checks below.
  bool HasSystemScope =
      SSID == SyncScope::System ||
      SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
  // ...
  if (HasSystemScope) { /* ... */ }
  // ...
  if (HasSystemScope) { /* ... */ }
  // In getRegClassFor(): pick SGPR or VGPR classes based on divergence.
  if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
    return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
                                               : &AMDGPU::SReg_32RegClass;
  if (!TRI->isSGPRClass(RC) && !isDivergent)
    return TRI->getEquivalentSGPRClass(RC);
  else if (TRI->isSGPRClass(RC) && isDivergent)
    return TRI->getEquivalentVGPRClass(RC);

  return RC;
static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
                      unsigned WaveSize) {
  // FIXME: We assume we never cast the mask results of a control flow
  // intrinsic. Early exit if the type won't be consistent, as a compile
  // time hack.
  IntegerType *IT = dyn_cast<IntegerType>(V->getType());
  if (!IT || IT->getBitWidth() != WaveSize)
    return false;

  if (!isa<Instruction>(V))
    return false;
  if (!Visited.insert(V).second)
    return false;
  bool Result = false;
  for (const auto *U : V->users()) {
    if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
      if (V == U->getOperand(1)) {
        switch (Intrinsic->getIntrinsicID()) {
        default:
          break;
        case Intrinsic::amdgcn_if_break:
        case Intrinsic::amdgcn_if:
        case Intrinsic::amdgcn_else:
          Result = true;
          break;
        }
      }
      if (V == U->getOperand(0)) {
        switch (Intrinsic->getIntrinsicID()) {
        default:
          break;
        case Intrinsic::amdgcn_end_cf:
        case Intrinsic::amdgcn_loop:
          Result = true;
          break;
        }
      }
    } else {
      Result = hasCFUser(U, Visited, WaveSize);
    }
    if (Result)
      break;
  }
  return Result;
}
bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
                                               const Value *V) const {
  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
    if (CI->isInlineAsm()) {
      // FIXME: This cannot give a correct answer in general. If inline asm
      // returns mixed SGPR and VGPR results used outside the defining block,
      // assume the whole value needs an SGPR if any output is SGPR.
      const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
      // ... (parse the asm constraints, then)
      for (auto &TC : TargetConstraints) {
        // ... (for each output constraint)
        const TargetRegisterClass *RC =
            getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
                                         TC.ConstraintVT).second;
        if (RC && SIRI->isSGPRClass(RC))
          return true;
      }
    }
  }
  SmallPtrSet<const Value *, 16> Visited;
  return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
}

// hasMemSDNodeUser(): true if any user of N is a memory node consuming it.
static bool hasMemSDNodeUser(SDNode *N) {
  SDNode::use_iterator I = N->use_begin(), E = N->use_end();
  for (; I != E; ++I) {
    if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) {
      // ... (checks on M elided)
      return true;
    }
  }
  return false;
}

bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
                                           Register N0, Register N1) const {
  return MRI.hasOneNonDBGUse(N0);
}
  // In getTargetMMOFlags(): propagate AMDGPU-specific load metadata into
  // MMO flags so later combines can use it.
  if (I.getMetadata("amdgpu.noclobber"))
    Flags |= MONoClobber;
  if (I.getMetadata("amdgpu.last.use"))
    Flags |= MOLastUse;

  // In checkForPhysRegDependency(): an i1 result defined by a compare that
  // implicitly defines SCC creates a physical register dependency.
  if (!Def->isMachineOpcode())
    return false;
  // ...
  PhysReg = AMDGPU::SCC;
  const TargetRegisterClass *RC =
      TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
  // ...

  // In emitExpandAtomicRMW(): atomicrmw sub/or/xor of zero can simply
  // become add:
  assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
         "this cannot be replaced with add");

  assert(Subtarget->hasAtomicFaddInsts() &&
         "target should have atomic fadd instructions");
  assert(AI->getType()->isFloatTy() &&
         AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
         "generic atomicrmw expansion only supports FP32 operand in flat "
         "address space");
  // Copy metadata over to the new atomics:
  //   for (auto &P : MDs) ... (attach P to each replacement instruction)

  // Branch on the address space of the pointer: LDS, private, or global.
  Value *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
                                            {Addr}, nullptr, "is.shared");
  Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);

  Builder.SetInsertPoint(SharedBB);
  Value *CastToLocal = Builder.CreateAddrSpaceCast(
      Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
  Value *LoadedShared = CreateNewAtomicRMW(Builder, CastToLocal, Val);
  Builder.CreateBr(PhiBB);

  Builder.SetInsertPoint(CheckPrivateBB);
  Value *IsPrivate = Builder.CreateIntrinsic(
      Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
  Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);

  Builder.SetInsertPoint(PrivateBB);
  Value *CastToPrivate = Builder.CreateAddrSpaceCast(
      Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
  Value *LoadedPrivate =
      Builder.CreateLoad(ValTy, CastToPrivate, "loaded.private");
  Value *NewVal = Builder.CreateFAdd(LoadedPrivate, Val, "val.new");
  Builder.CreateStore(NewVal, CastToPrivate);
  Builder.CreateBr(PhiBB);

  Builder.SetInsertPoint(GlobalBB);
  Value *CastToGlobal = Builder.CreateAddrSpaceCast(
      Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
  Value *LoadedGlobal = CreateNewAtomicRMW(Builder, CastToGlobal, Val);
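// Control-flow sketch of the expansion above (block names from this code):
//   is.shared ? LDS atomicrmw
//             : is.private ? plain load + fadd + store
//                          : global atomicrmw
// with the three results merged by a phi in the continuation block.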
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
#define LLVM_ATTRIBUTE_UNUSED
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
iv Induction Variable Users
static const unsigned MaxDepth
Contains matchers for matching SSA Machine Instructions.
unsigned const TargetRegisterInfo * TRI
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
const char LLVMTargetMachineRef TM
const SmallVectorImpl< MachineOperand > & Cond
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes)
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
bool unsafeFPAtomicsDisabled(Function *F)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getIdxEn(SDValue VIndex)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
static constexpr int Concat[]
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
bool hasMadMacF32Insts() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
This class represents an incoming formal argument to a Function.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Min
*p = old <signed v ? old : v
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
StringRef getValueAsString() const
Return the attribute's value as a string.
LLVM Basic Block Representation.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
bool isFPPredicate() const
bool isIntPredicate() const
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Diagnostic information for unsupported feature in backend.
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowerinInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
bool hasD16Images() const
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
bool hasDot7Insts() const
bool hasApertureRegs() const
bool hasFlatInstOffsets() const
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasBCNT(unsigned Size) const
bool hasMultiDwordFlatScratchAddressing() const
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
bool hasUnalignedDSAccessEnabled() const
const SIInstrInfo * getInstrInfo() const override
bool hasDot1Insts() const
bool hasAtomicFaddRtnInsts() const
Align getStackAlignment() const
bool hasScalarSubwordLoads() const
bool enableFlatScratch() const
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
bool supportsGetDoorbellID() const
bool hasFlatAtomicFaddF32Inst() const
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
const SIFrameLowering * getFrameLowering() const override
bool hasUnalignedScratchAccess() const
bool hasLDSFPAtomicAdd() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
bool hasGFX940Insts() const
bool hasFullRate64Ops() const
bool isTrapHandlerEnabled() const
bool hasFlatGlobalInsts() const
bool getScalarizeGlobalBehavior() const
bool hasScalarSMulU64() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
bool hasNSAEncoding() const
bool hasSMemRealTime() const
bool usePRTStrictNull() const
bool hasUnalignedBufferAccessEnabled() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasImageGather4D16Bug() const
bool supportsMinMaxDenormModes() const
bool hasAtomicFaddInsts() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicFaddNoRtnInsts() const
bool hasScalarDwordx3Loads() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
bool hasDot8Insts() const
bool hasDS96AndDS128() const
bool useFlatForGlobal() const
Generation getGeneration() const
bool hasScalarAddSub64() const
bool hasUnpackedD16VMem() const
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
bool hasPackedTID() const
bool hasAddNoCarry() const
bool hasGWSAutoReplay() const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
int64_t getOffset() const
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Value * CreateFAdd(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
BasicBlock::iterator GetInsertPoint() const
BasicBlock * GetInsertBlock() const
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
LLVMContext & getContext() const
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
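A minimal sketch of how the IRBuilder calls above compose, rewriting an atomicrmw fadd as a plain load/fadd/store. Atomicity is deliberately ignored here; this is not the expansion this file actually performs.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static void expandFAddNonAtomically(AtomicRMWInst *AI) {
  IRBuilder<> Builder(AI);                        // insert before AI
  Value *Ptr = AI->getPointerOperand();
  LoadInst *Loaded = Builder.CreateAlignedLoad(AI->getType(), Ptr,
                                               AI->getAlign(), "loaded");
  Value *Sum = Builder.CreateFAdd(Loaded, AI->getValOperand(), "sum");
  Builder.CreateStore(Sum, Ptr);
  AI->replaceAllUsesWith(Loaded);                 // atomicrmw returns the old value
  AI->eraseFromParent();
}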
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
const BasicBlock * getParent() const
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
void getAllMetadata(SmallVectorImpl< std::pair< unsigned, MDNode * > > &MDs) const
Get all metadata attached to this Instruction.
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
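A small sketch of the metadata helpers above: when one instruction replaces another, its metadata usually needs to come along.
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

static void moveMetadata(Instruction &NewI, const Instruction &OldI) {
  SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
  OldI.getAllMetadata(MDs);                 // kind-ID / node pairs
  for (const auto &KV : MDs)
    NewI.setMetadata(KV.first, KV.second);  // re-attach on the replacement
  // Equivalent shortcut: NewI.copyMetadata(OldI);
}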
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
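A sketch of the LLT helpers above; the values are arbitrary, and the header path (which has moved between LLVM versions) is an assumption.
#include "llvm/CodeGenTypes/LowLevelType.h" // "llvm/Support/LowLevelTypeImpl.h" in older trees
using namespace llvm;

static void lltExamples() {
  LLT S32 = LLT::scalar(32);               // 32-bit "bag of bits"
  LLT P5 = LLT::pointer(5, 32);            // 32-bit pointer in address space 5
  LLT V2S16 = LLT::fixed_vector(2, 16);    // fixed_vector is assumed, not listed above
  LLT V2S32 = V2S16.changeElementSize(32); // same element count, 32-bit elements
  (void)S32.isScalar();                    // true
  (void)P5.getSizeInBits();                // TypeSize of 32 bits
  (void)V2S32;
}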
This is an important class for using LLVM in a threaded context.
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
void getSyncScopeNames(SmallVectorImpl< StringRef > &SSNs) const
getSyncScopeNames - Populates client supplied SmallVector with synchronization scope names registered in LLVMContext.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
bool isCompare() const
Return true if this instruction is a comparison.
bool hasImplicitDefOfPhysReg(unsigned Reg, const MCRegisterInfo *MRI=nullptr) const
Return true if this instruction implicitly defines the specified physical register.
Wrapper class representing physical registers. Should be passed by value.
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
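The MVT queries above in one place; a compile-time sketch with arbitrary types.
#include "llvm/CodeGenTypes/MachineValueType.h" // "llvm/Support/MachineValueType.h" in older trees
using namespace llvm;

static void mvtExamples() {
  MVT V4I32 = MVT::getVectorVT(MVT::i32, 4);
  (void)V4I32.isVector();               // true
  (void)V4I32.getVectorNumElements();   // 4
  (void)V4I32.getScalarType();          // MVT::i32
  (void)V4I32.getStoreSize();           // 16 bytes
  MVT I64 = MVT::getIntegerVT(64);
  (void)I64.isScalarInteger();          // true
  (void)I64.bitsLT(V4I32);              // 64 bits < 128 bits
}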
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor blocks which refer to FromMBB to refer to this.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before 'Where'.
Align getAlignment() const
Return alignment of the basic block.
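A simplified sketch (not this file's exact code) of the MachineBasicBlock APIs above in the shape of the usual custom-inserter pattern: carve a loop block and a remainder block out of an existing block.
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
using namespace llvm;

static void splitForLoop(MachineFunction &MF, MachineBasicBlock &MBB) {
  const BasicBlock *IRBB = MBB.getBasicBlock();
  MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock(IRBB);
  MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock(IRBB);
  MF.insert(std::next(MBB.getIterator()), LoopBB);
  MF.insert(std::next(LoopBB->getIterator()), RemainderBB);
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); // old successors move over
  MBB.addSuccessor(LoopBB);
  LoopBB->addSuccessor(LoopBB);      // back-edge
  LoopBB->addSuccessor(RemainderBB); // loop exit
}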
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do so.
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual register for it.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
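A sketch of the per-function accessors above as lowering code typically strings them together; the SGPR and register-class names are AMDGPU's, but the combination is illustrative.
#include "SIMachineFunctionInfo.h"
using namespace llvm;

static void perFunctionState(MachineFunction &MF) {
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  MachineRegisterInfo &MRI = MF.getRegInfo();   // virtual-register state
  MachineFrameInfo &MFI = MF.getFrameInfo();    // stack objects
  bool NeedsScratch = MFI.hasStackObjects();
  // Mark a physical SGPR live-in and get a virtual copy of it.
  Register VReg = MF.addLiveIn(AMDGPU::SGPR4, &AMDGPU::SGPR_32RegClass);
  (void)Info; (void)MRI; (void)NeedsScratch; (void)VReg;
}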
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
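The MachineInstrBuilder chain above, sketched with assumed context: BB, I, DL, TII, and the registers are all placeholders.
// DstReg = S_MOV_B32 0, then SumReg = S_ADD_U32 DstReg, 42.
BuildMI(*BB, I, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
    .addImm(0);
BuildMI(*BB, I, DL, TII->get(AMDGPU::S_ADD_U32), SumReg)
    .addReg(DstReg)
    .addImm(42);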
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
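A sketch of allocating a MachineMemOperand with or'd Flags values, per the enum above (MF is an assumed MachineFunction):
MachineMemOperand *MMO = MF.getMachineMemOperand(
    MachinePointerInfo(),                       // unknown memory location
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
        MachineMemOperand::MOInvariant,         // read-only, non-trapping
    LLT::scalar(32), Align(4));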
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
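Inspecting and rewriting an operand with the accessors above; MI, OldReg, and NewReg are assumed context.
MachineOperand &MO = MI.getOperand(0);
if (MO.isReg() && MO.getReg() == OldReg) {
  MO.setReg(NewReg);        // retarget the operand
  MO.setIsUndef(false);
}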
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
A Module instance is used to store all the information related to an LLVM module.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation functions.
const DebugLoc & getDebugLoc() const
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
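A sketch of the SDNode/SDValue accessors above in a combine-style pattern match; no transform is built, and the predicate itself is invented.
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

static bool isSingleUseFAddOfFMul(const SDNode *N) {
  if (N->getOpcode() != ISD::FADD || !N->hasOneUse())
    return false;
  SDValue LHS = N->getOperand(0); // candidate fmul feeding the fadd
  return LHS.getOpcode() == ISD::FMUL && LHS.hasOneUse();
}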
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
bool hasWorkGroupIDZ() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to use the type for the given node type.
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defining block.
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node N can be combined with a neighboring node to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g. INSERT_SUBREG).
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array, into the specified DAG.
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand flags to them.
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g. {edx}), return the register number and the register class for the register.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations, those with specific masks.
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was registered to use 'custom' lowering for that type.
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of the specified type.
bool isMemOpUniform(const SDNode *N) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context) const override
This hook should be implemented to check whether the return values described by the Outs array can fit into the return registers.
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in the KnownZero/KnownOne bitsets.
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can be turned into a fence followed by an atomic load.
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' flag.
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array, into the specified DAG.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target has registered to invoke it for.
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance, because half-precision floating-point numbers are implicitly extended to float-precision) for an FMA instruction.
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations and not for other operations.
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the stride value (bits [61:48] of the resource descriptor) and added to the base address.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to use 'custom' lowering, and whose defined values are all legal.
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp/select instructions.
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled explicitly via copies.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with this index.
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in the KnownZero/KnownOne bitsets.
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g. we are happy to sink it into basic blocks.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (touches memory).
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
bool getAddrModeArguments(IntrinsicInst *, SmallVectorImpl< Value * > &, Type *&) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the address.
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scalars in some contexts.
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
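The members above are mostly per-opcode workers; a heavily abridged sketch of how an override like LowerOperation fans out to them (the real switch covers many more opcodes):
SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  case ISD::GET_ROUNDING: return lowerGET_ROUNDING(Op, DAG);
  case ISD::GET_FPENV:    return lowerGET_FPENV(Op, DAG);
  case ISD::SET_FPENV:    return lowerSET_FPENV(Op, DAG);
  case ISD::PREFETCH:     return lowerPREFETCH(Op, DAG);
  default:                return SDValue(); // let generic handling take over
  }
}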
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representation.
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending or truncating it.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check for vector.
const Pass * getPass() const
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s), MachineInstr opcode, and operands.
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SDValue.
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the scalars and operating on each element individually.
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands, and they produce a value AND a token chain.
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not necessarily identical pieces.
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potential vector types) to a same-sized integer type and then any-extending or truncating it.
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
SDValue getRegister(unsigned Reg, EVT VT)
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or truncating it.
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
const TargetMachine & getTarget() const
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncating it.
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an SDValue.
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side, or if it is an ISD::OR with a ConstantSDNode that is guaranteed to have the same semantics as an ADD.
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or truncating it.
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
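Composing the SelectionDAG factories above; a generic illustration with Op and DAG assumed, not code from this file.
SDLoc DL(Op);
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitScalar(Op.getOperand(0), DL, MVT::i32, MVT::i32);
SDValue One  = DAG.getConstant(1, DL, MVT::i32);
SDValue Add  = DAG.getNode(ISD::ADD, DL, MVT::i32, Lo, One); // bump the low half
SDValue Pair = DAG.getBuildVector(MVT::v2i32, DL, {Add, Hi});
SDValue Back = DAG.getBitcast(MVT::i64, Pair);               // reassemble as i64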
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
iterator insert(iterator I, T &&Elt)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs with the length computed at compile time.
StringRef - Represent a constant reference to a string, i.e.
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
constexpr size_t size() const
size - Get the string size.
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
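The StringSwitch idiom, as used for things like inline-asm constraint parsing; the cases and values here are invented.
#include "llvm/ADT/StringSwitch.h"

static unsigned constraintKind(llvm::StringRef Constraint) {
  return llvm::StringSwitch<unsigned>(Constraint)
      .Case("s", 0)   // scalar register
      .Case("v", 1)   // vector register
      .Case("a", 2)   // accumulator register
      .Default(~0u);  // unknown
}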
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligned on entry to a function.
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider type.
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do about it.
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations and not for other operations.
LegalizeTypeAction
This enum indicates whether types are legal for a target and, if not, what action should be used to make them valid.
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scalars in some contexts.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save and restore.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a wider type.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layout.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what to do about it.
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save and restore.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/fp until it can find one that works.
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom DAG combiner for by implementing the PerformDAGCombine virtual method.
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unaligned access for the specified type.
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the data layout.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the target instruction selector can accept natively.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contribute to the DemandedBits/DemandedElts of the result.
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the calling function.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to use the type for the given node type.
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g. {edx}), return the register number and the register class for the register.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op. At this point, we know that only the DemandedBits bits of the result of Op are ever used downstream.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' flag.
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their prefixes, and also tie in the associated operand values.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo, setting OpInfo.ConstraintCode and OpInfo.ConstraintType.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command line.
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDesc objects that represent all of the machine registers that the target has.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
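A sketch of the Type queries above, e.g. classifying the element type of an operand:
#include "llvm/IR/Type.h"

static bool isF16OrF32(llvm::Type *Ty) {
  llvm::Type *Scalar = Ty->getScalarType(); // element type for vectors
  return Scalar->isHalfTy() || Scalar->isFloatTy();
}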
A Use represents the edge between a Value definition and its users.
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< use_iterator > uses()
void takeName(Value *V)
Transfer the name from V to this value.
constexpr bool isZero() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
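Address-space tests over the enumerators above; the helper and its policy are invented, while AMDGPUAS is the real namespace.
static bool isFlatLikeAddrSpace(unsigned AS) {
  return AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS ||
         AS == AMDGPUAS::PRIVATE_ADDRESS;
}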
@ BUFFER_ATOMIC_COND_SUB_U32
@ TBUFFER_LOAD_FORMAT_D16
@ BUFFER_ATOMIC_FADD_BF16
@ TBUFFER_STORE_FORMAT_D16
@ BUFFER_STORE_FORMAT_D16
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
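A sketch of the inline-literal predicates above; Imm32/Imm64 are assumed values, and ST.hasInv2PiInlineImm() is an assumed GCNSubtarget query gating the 1/(2*pi) inline constant.
bool CanInline32 = AMDGPU::isInlinableLiteral32(Imm32, ST.hasInv2PiInlineImm());
bool CanInline64 = AMDGPU::isInlinableLiteral64(Imm64, ST.hasInv2PiInlineImm());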
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
@ C
The default llvm calling convention, compatible with C.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual results.
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2*N], and return the full value as two results, each of type iN.
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
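A sketch of how such ISD opcodes are materialized during lowering (buildFmad is hypothetical; SelectionDAG::getNode is the standard construction API):

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Hypothetical snippet from a lowering routine: build a * b + c as FMAD,
// the intermediate-rounding form described above.
static SDValue buildFmad(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
                         SDValue A, SDValue B, SDValue C) {
  return DAG.getNode(ISD::FMAD, DL, VT, A, B, C);
}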
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic function with side effects that does not return a result.
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b. this is still a strong cmpxchg operation; the extra Success result reports whether the swap actually occurred.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter) to floating point.
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length and element type, this produces a concatenated vector result value, with length equal to the sum of the lengths of the input vectors.
@ FADD
Simple binary floating point operators.
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-precision (16-bit) floating-point numbers.
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to memory with one type and loaded from the same address with the other type.
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the resultant vector type.
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the llvm.readsteadycounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width (W).
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant, which is required to be operand #1) half of the integer or float value specified as operand #0.
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defined outside of the scope of this SelectionDAG.
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined; 0 Round to 0; 1 Round to nearest, ties to even; 2 Round to +inf; 3 Round to -inf; 4 Round to nearest, ties to zero.
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of type i[2*N], and return the upper part.
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially variable) element number IDX.
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value, and a value.
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) based on a comparison of its first two operands (ops #0 and #1), using the condition code in op #4.
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo, ValHi, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmpLo, cmpHi, swapLo, swapHi).
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
@ SMULO
Same as the overflow-detecting add/sub nodes, but for multiplication.
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in a large integer register (e.g. sign extending the low 8 bits of a 32-bit register).
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the destination VT.
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0.0.
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic function with no side effects.
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN, ptr, amt).
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the destination VT.
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero or sign extended from a narrower type.
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W).
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target intrinsic function with side effects that returns a result.
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified scalar operands.
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
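For example (a short sketch; swapExample is hypothetical):

#include "llvm/CodeGen/ISDOpcodes.h"
using namespace llvm;

// (x < y) holds exactly when (y > x) does, so swapping the operands of a
// SETLT comparison yields SETGT.
ISD::CondCode swapExample() { // hypothetical
  return ISD::getSetCCSwappedOperands(ISD::SETLT); // ISD::SETGT
}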
CondCode
ISD::CondCode enum - These are ordered carefully so that condition codes can be meaningfully and'ed or or'ed together.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
StringRef getName(ID id)
Return the LLVM name for an intrinsic, such as "llvm.ppc.altivec.lvx".
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
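A short sketch of both queries (intrinsicInfoExample is hypothetical; amdgcn_s_barrier serves as a non-overloaded example intrinsic):

#include "llvm/IR/Attributes.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

void intrinsicInfoExample(LLVMContext &Ctx) { // hypothetical
  // Name lookup for a non-overloaded intrinsic.
  StringRef Name = Intrinsic::getName(Intrinsic::amdgcn_s_barrier);
  (void)Name; // "llvm.amdgcn.s.barrier"
  // Attribute list (memory effects, nounwind, ...) for the same intrinsic.
  AttributeList AL = Intrinsic::getAttributes(Ctx, Intrinsic::amdgcn_s_barrier);
  (void)AL;
}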
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ System
Synchronized with respect to all concurrently executing threads.
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition code.
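A sketch of both condition-code mappings (condCodeExamples is hypothetical; getFCmpCondCode is listed further down in this index):

#include "llvm/CodeGen/Analysis.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

void condCodeExamples() { // hypothetical
  // IR signed less-than maps to the signed DAG condition SETLT.
  ISD::CondCode C1 = getICmpCondCode(ICmpInst::ICMP_SLT); // ISD::SETLT
  // IR ordered less-than maps to SETOLT.
  ISD::CondCode C2 = getFCmpCondCode(FCmpInst::FCMP_OLT); // ISD::SETOLT
  (void)C1;
  (void)C2;
}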
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions starting from FirstMI to LastMI (exclusive).
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
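For instance (a short sketch; intRangeExample is hypothetical, and minIntN is listed further down):

#include "llvm/Support/MathExtras.h"
#include <cassert>

void intRangeExample() { // hypothetical
  assert(llvm::maxIntN(8) == 127);    // 8-bit signed maximum
  assert(llvm::minIntN(8) == -128);   // 8-bit signed minimum
  assert(llvm::maxIntN(16) == 32767); // 16-bit signed maximum
}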
int popcount(T Value) noexcept
Count the number of set bits in a value.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
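A hedged sketch of how these SelectionDAG helpers combine (combineOrWithZero and the simplification it performs are hypothetical):

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Hypothetical combine: (or x, 0) --> x, looking through bitcasts around
// the zero operand.
static SDValue combineOrWithZero(SDNode *N) {
  SDValue LHS = N->getOperand(0);
  SDValue RHS = peekThroughBitcasts(N->getOperand(1));
  if (isNullConstant(RHS)) // constant integer zero?
    return LHS;
  return SDValue(); // no simplification
}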
testing::Matcher< const detail::ErrorHolder & > Failed()
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit version).
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the start of the kernel to the load.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant, stopping at the first 1.
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
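A minimal sketch exercising the bit-manipulation helpers listed here (bitMathExamples is hypothetical; each asserted value follows from the definitions above):

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
using namespace llvm;

void bitMathExamples() { // hypothetical
  assert(popcount(0xF0u) == 4);     // four set bits
  assert(countr_zero(0xF0u) == 4);  // low four bits are zero
  assert(countl_zero(1u) == 31);    // 32-bit value with only bit 0 set
  assert(bit_width(16u) == 5);      // 16 = 0b10000 needs 5 bits
  assert(Log2_32(32) == 5);         // floor log base 2
  assert(isPowerOf2_32(64u));
  assert(isShiftedMask_64(0x0FF0)); // contiguous run of ones
  assert(Hi_32(0x123456789ABCDEF0ULL) == 0x12345678u);
  assert(Lo_32(0x123456789ABCDEF0ULL) == 0x9ABCDEF0u);
}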
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point condition code.
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
DWARFExpression::Operation Op
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
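A short sketch of the range helpers referenced in this index (rangeHelperExamples is hypothetical):

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>
using namespace llvm;

void rangeHelperExamples() { // hypothetical
  SmallVector<int, 8> V = {1, 2, 3, 4};
  assert(any_of(V, [](int X) { return X > 3; })); // 4 matches
  assert(is_contained(V, 2));
  auto It = find_if(V, [](int X) { return X % 2 == 0; });
  assert(It != V.end() && *It == 2);
  int Sum = 0;
  for (int X : drop_begin(V)) // skips the first element
    Sum += X;
  assert(Sum == 9);
  SmallVector<int, 8> W;
  append_range(W, V); // append V's elements to W
  assert(W.size() == 4);
}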
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value that is equal to Skew modulo Align.
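A short sketch of the alignment and division helpers (alignmentExamples is hypothetical):

#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
using namespace llvm;

void alignmentExamples() { // hypothetical
  assert(divideCeil(10, 4) == 3);      // ceil(10 / 4)
  assert(alignTo(10, Align(8)) == 16); // round up to a multiple of 8
  assert(alignDown(10, 8) == 8);       // round down to a multiple of 8
  assert(PowerOf2Ceil(10) == 16);      // next power of two
  // Alignment known at a byte offset from an aligned base:
  assert(commonAlignment(Align(16), 8) == Align(8));
}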
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static const fltSemantics & IEEEsingle() LLVM_READNONE
static constexpr roundingMode rmNearestTiesToEven
static const fltSemantics & IEEEhalf() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the vector's number of elements is a power of 2.
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
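A minimal sketch of the EVT interface described by the entries above (evtExamples is hypothetical):

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
#include <cassert>
using namespace llvm;

void evtExamples(LLVMContext &Ctx) { // hypothetical
  EVT F32 = EVT::getEVT(Type::getFloatTy(Ctx)); // f32
  EVT V4F32 = EVT::getVectorVT(Ctx, F32, 4);    // v4f32
  assert(V4F32.isVector() && V4F32.getVectorNumElements() == 4);
  assert(V4F32.getSizeInBits() == 128);
  assert(V4F32.getScalarType().isFloatingPoint());
  EVT V4I32 = V4F32.changeTypeToInteger();      // v4i32
  assert(V4I32.getVectorElementType() == EVT::getIntegerVT(Ctx, 32));
}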
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
bool isUnknown() const
Returns true if we don't know any bits.
void resetAll()
Resets the known state of all bits.
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known bits.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
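A minimal sketch of the KnownBits queries above (knownBitsExample is hypothetical):

#include "llvm/Support/KnownBits.h"
#include <cassert>
using namespace llvm;

void knownBitsExample() { // hypothetical
  KnownBits Known(32);
  assert(Known.isUnknown());  // nothing known yet
  Known.Zero.setHighBits(24); // prove the top 24 bits are zero
  assert(Known.countMinLeadingZeros() == 24);
  assert(Known.countMaxActiveBits() == 8); // the value fits in 8 bits
  Known.resetAll();           // back to fully unknown
  assert(Known.isUnknown());
}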
This class contains a discriminated union of information about pointers in memory operands, relating them back to LLVM IR or to virtual locations (such as frame indices) that are exclusive of other pointers.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise, pass NaN through.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const