#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "si-lower"
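// Command-line knobs used for debugging and performance experiments; they
// override the default lowering decisions made below.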
54 "amdgpu-disable-loop-alignment",
55 cl::desc(
"Do not align and prefetch loops"),
59 "amdgpu-use-divergent-register-indexing",
61 cl::desc(
"Use indirect register addressing for divergent indexes"),
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg))
      return AMDGPU::SGPR0 + Reg;
  }
  setOperationAction(ISD::LOAD,
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},
                     Custom);

  setOperationAction(ISD::STORE,
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},
                     Custom);

  setOperationAction(ISD::SELECT_CC,
                     {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
  setOperationAction(ISD::TRUNCATE,
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
                     Expand);
  setOperationAction(ISD::FP_ROUND,
                     {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
                      MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
                      MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
                     Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG,
                     {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
                      MVT::v3i16, MVT::v4i16, MVT::Other},
                     Custom);

  setOperationAction(ISD::BR_CC,
                     {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
  for (MVT VT :
       {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
        MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
        MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
        MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
        MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
        MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
        MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
        MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {

  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {

  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {

  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {

  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
                     {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},

                     {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},

                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);

                     {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},

                     {MVT::f32, MVT::f64}, Legal);
  for (MVT VT :
       {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
        MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
        MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {

                     {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);

                     {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

                     {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

  for (MVT VT :
       {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
        MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {

                     {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
                      MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},

  for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})

  for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
                     {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},

                     {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
                      MVT::v32f16, MVT::v32bf16},

                     {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);

                     {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
                      MVT::v2i16, MVT::v2f16, MVT::i128, MVT::i8},

                     {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16,
                      MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16,
                      MVT::i16, MVT::i8, MVT::i128},

                     {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16,
                      MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16,
                                          EVT DestVT, EVT SrcVT) const {

                                          LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&

    return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);

  return VT.isInteger() ? MVT::i32 : MVT::f32;

    return (NumElts + 1) / 2;

  return NumElts * ((Size + 31) / 32);
unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {

    if (ScalarVT == MVT::bf16) {
      RegisterVT = MVT::i32;
      IntermediateVT = MVT::v2bf16;
    } else {
      RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
      IntermediateVT = RegisterVT;
    }
    NumIntermediates = (NumElts + 1) / 2;
    return NumIntermediates;

      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

    if (Size < 16 && Subtarget->has16BitInsts()) {
      RegisterVT = MVT::i16;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

      RegisterVT = MVT::i32;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

    RegisterVT = MVT::i32;
    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts * ((Size + 31) / 32);
    return NumIntermediates;

  return TargetLowering::getVectorTypeBreakdownForCallingConv(
      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
  assert(MaxNumLanes != 0);

  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());

  auto *ST = dyn_cast<StructType>(Ty);

  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));

         DL.getPointerSizeInBits(AS) == 192)

          DL.getPointerSizeInBits(AS) == 160) ||
          DL.getPointerSizeInBits(AS) == 192))
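// getTgtMemIntrinsic: describe the memory behavior (memVT, pointer operand,
// and flags) of each AMDGPU memory intrinsic so SelectionDAG can build a
// correct MachineMemOperand for it.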
                                          unsigned IntrID) const {

  if (CI.hasMetadata(LLVMContext::MD_invariant_load))

    if (RsrcIntr->IsImage)

    if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {

      Info.ptrVal = RsrcArg;

      unsigned MaxNumLanes = 4;

      if (RsrcIntr->IsImage) {

      if (RsrcIntr->IsImage) {
        unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();

  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();

  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {

  case Intrinsic::amdgcn_buffer_atomic_fadd: {

    if (!Vol || !Vol->isZero())

  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.ptrVal = nullptr;

  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
  case Intrinsic::amdgcn_global_atomic_csub: {

  case Intrinsic::amdgcn_image_bvh_intersect_ray: {

  case Intrinsic::amdgcn_global_atomic_fadd:
  case Intrinsic::amdgcn_global_atomic_fmin:
  case Intrinsic::amdgcn_global_atomic_fmax:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {

  case Intrinsic::amdgcn_global_load_tr: {

  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {

    Info.memVT = MVT::i32;

    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)

  case Intrinsic::amdgcn_global_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();

  case Intrinsic::amdgcn_ds_bvh_stack_rtn: {

    Info.memVT = MVT::i32;

  case Intrinsic::amdgcn_addrspacecast_nonnull: {
    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();
                                            Type *&AccessTy) const {

  case Intrinsic::amdgcn_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmax:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_csub:
  case Intrinsic::amdgcn_global_atomic_fadd:
  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_global_atomic_fmax:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fmin:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_global_load_tr:

  case Intrinsic::amdgcn_global_load_lds:
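// Addressing-mode legality checks: FLAT addressing never supports a scale and
// only supports an immediate offset when the subtarget's FLAT offset range
// allows it; MUBUF legality is checked separately below.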
bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,

    return AM.BaseOffs == 0 && AM.Scale == 0;

  return AM.Scale == 0 &&
         (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
                                  AM.BaseOffs, AddrSpace, FlatVariant));

    return isLegalMUBUFAddressingMode(AM);

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {

  if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))

  if (AM.HasBaseReg) {

    return isLegalMUBUFAddressingMode(AM);

  if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)

               : isLegalMUBUFAddressingMode(AM);
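// allowsMisalignedMemoryAccesses: decide whether an unaligned access of the
// given size and address space is legal, reporting a relative speed through
// *IsFast (larger means faster; 0 means slow).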
    unsigned Size, unsigned AddrSpace, Align Alignment,

        Alignment < RequiredAlignment)

      RequiredAlignment = Align(4);

        *IsFast = (Alignment >= RequiredAlignment) ? 64
                  : (Alignment < Align(4))         ? 32
                                                   : 1;

        *IsFast = (Alignment >= RequiredAlignment) ? 96
                  : (Alignment < Align(4))         ? 32
                                                   : 1;

      RequiredAlignment = Align(8);

        *IsFast = (Alignment >= RequiredAlignment) ? 128
                  : (Alignment < Align(4))         ? 32
                                                   : 1;

      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;

    return Alignment >= RequiredAlignment ||

    bool AlignedBy4 = Alignment >= Align(4);
    if (IsFast)
      *IsFast = AlignedBy4;

    return AlignedBy4 ||

    bool AlignedBy4 = Alignment >= Align(4);
    if (IsFast)
      *IsFast = AlignedBy4;

  return Alignment >= Align(4) ||

  return Size >= 32 && Alignment >= Align(4);

                                                    unsigned *IsFast) const {

      Alignment, Flags, IsFast);
  if (Op.size() >= 16 &&

  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))

  const MemSDNode *MemNode = cast<MemSDNode>(N);

                                           unsigned DestAS) const {

  return TM.isNoopAddrSpaceCast(SrcAS, DestAS);

  const MemSDNode *MemNode = cast<MemSDNode>(N);

                                      unsigned Index) const {

  std::tie(InputPtrReg, RC, ArgTy) =

      MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

                                         const SDLoc &SL) const {

                                         const SDLoc &SL) const {

  std::optional<uint32_t> KnownSize =

  if (KnownSize.has_value())

    Val = getFPExtOrFPRound(DAG, Val, SL, VT);
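// lowerKernargMemParameter: load a kernel argument from the kernarg segment
// (aligning sub-dword arguments down to a dword load plus a shift) and
// convert the loaded value to the expected argument type.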
SDValue SITargetLowering::lowerKernargMemParameter(

  int64_t OffsetDiff = Offset - AlignDownOffset;

    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);

    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);

  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);

      ExtType, SL, VA.getLocVT(), Chain, FIN,

    Reg = &WorkGroupIDX;
    RC = &AMDGPU::SReg_32RegClass;

    Reg = &WorkGroupIDY;
    RC = &AMDGPU::SReg_32RegClass;

    Reg = &WorkGroupIDZ;
    RC = &AMDGPU::SReg_32RegClass;

  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {

           "vector type argument should have been split");

    bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);

             "unexpected vector split in ps argument type");

      Info->markPSInputAllocated(PSInputNum);

        Info->markPSInputEnabled(PSInputNum);
  if (Info.hasWorkItemIDX()) {
    unsigned Mask = (Subtarget->hasPackedTID() &&
                     Info.hasWorkItemIDY()) ? 0x3ff : ~0u;

  if (Info.hasWorkItemIDY()) {
    unsigned Reg = AMDGPU::VGPR1;

  if (Info.hasWorkItemIDZ()) {
    unsigned Reg = AMDGPU::VGPR2;

  if (RegIdx == ArgVGPRs.size()) {

  unsigned Reg = ArgVGPRs[RegIdx];
  assert(Reg != AMDGPU::NoRegister);

                               unsigned NumArgRegs) {

  if (RegIdx == ArgSGPRs.size())

  unsigned Reg = ArgSGPRs[RegIdx];
  assert(Reg != AMDGPU::NoRegister);

  assert(Reg != AMDGPU::NoRegister);

  const unsigned Mask = 0x3ff;

  if (Info.hasWorkItemIDX()) {
    Info.setWorkItemIDX(Arg);

  if (Info.hasWorkItemIDY()) {
    Info.setWorkItemIDY(Arg);

  if (Info.hasWorkItemIDZ())

  const unsigned Mask = 0x3ff;
  if (Info.hasImplicitArgPtr())

  if (Info.hasWorkGroupIDX())

  if (Info.hasWorkGroupIDY())

  if (Info.hasWorkGroupIDZ())

  if (Info.hasLDSKernelId())

    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);

    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
  unsigned LastExplicitArgOffset =

  bool InPreloadSequence = true;

  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())

    int ArgIdx = Arg.getArgNo();

    if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
                               (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))

    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];

      unsigned ArgOffset = ArgLoc.getLocMemOffset();

      unsigned NumAllocSGPRs =
          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;

      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);

      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;

      if (PaddingSGPRs + NumAllocSGPRs + 1 >
        InPreloadSequence = false;

          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);

          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);

      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {

      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;

  if (Info.hasLDSKernelId()) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

                                          bool IsShader) const {
  assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");

  unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();

  unsigned NumRequiredSystemSGPRs =
      Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
      Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
  for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDY()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDZ()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupInfo()) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    unsigned PrivateSegmentWaveByteOffsetReg;

      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();

      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);

      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);

         Info.getNumPreloadedSGPRs() >= 16);
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);

    HasStackObjects = true;

  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {

      Info.setScratchRSrcReg(PrivateSegmentBufferReg);

      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);

      Info.setScratchRSrcReg(ReservedBufferReg);

    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);

      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
        if (!MRI.isLiveIn(Reg)) {
          Info.setStackPtrOffsetReg(Reg);

  if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)

  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);

  return !Info->isEntryFunction();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());

    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;

    Entry->addLiveIn(*I);

  for (auto *Exit : Exits)
            TII->get(TargetOpcode::COPY), *I)

        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());

           !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
           !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());

            !Info->hasWorkGroupIDZ());

    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);

      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))

  } else if (IsKernel) {

    Splits.append(Ins.begin(), Ins.end());

  } else if (!IsGraphics) {
  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {

    if (IsEntryFunc && VA.isMemLoc()) {

      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {

        int64_t OffsetDiff = Offset - AlignDownOffset;

            Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];

          NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;

          if (PreloadRegs.size() == 1) {
            Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);

                                   TRI->getRegSizeInBits(*RC)));

            for (auto Reg : PreloadRegs) {

                                         PreloadRegs.size()),

          NewArg = convertArgType(DAG, VT, MemVT, DL, CMemVT,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

            lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
                                     Alignment, Ins[i].Flags.isSExt(), &Ins[i]);

          dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));

    } else if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);

    if (AMDGPU::VGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::VGPR_32RegClass;
    else if (AMDGPU::SGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::SGPR_32RegClass;

  auto &ArgUsageInfo =

  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain :
  unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
    if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))

  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {

    SDValue Arg = OutVals[RealRVLocIdx];

  if (!Info->isEntryFunction()) {

      if (AMDGPU::SReg_64RegClass.contains(*I))

      else if (AMDGPU::SReg_32RegClass.contains(*I))

  return DAG.getNode(Opc, DL, MVT::Other, RetOps);

  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    auto &ArgUsageInfo =

    CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);

    std::tie(OutgoingArg, ArgRC, ArgTy) =

    std::tie(IncomingArg, IncomingArgRC, Ty) =

    assert(IncomingArgRC == ArgRC);

    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;

      InputReg = getImplicitArgPtr(DAG, DL);

      std::optional<uint32_t> Id =

      if (Id.has_value()) {

      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);

      unsigned SpecialArgOffset =

  std::tie(OutgoingArg, ArgRC, Ty) =

    std::tie(OutgoingArg, ArgRC, Ty) =

    std::tie(OutgoingArg, ArgRC, Ty) =

  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");

    InputReg = InputReg.getNode() ?

    InputReg = InputReg.getNode() ?

  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {

          IncomingArgX ? *IncomingArgX :
          IncomingArgY ? *IncomingArgY : *IncomingArgZ, ~0u);

    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
  if (Callee->isDivergent())

  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);

  if (!CallerPreserved)

  bool CCMatch = CallerCC == CalleeCC;

    if (Arg.hasByValAttr())

    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))

  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

  if (IsChainCallConv) {

    RequestedExec = CLI.Args.back();
    assert(RequestedExec.Node && "No node for EXEC");

    assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
    CLI.Outs.pop_back();

      assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
      CLI.Outs.pop_back();

           "Haven't popped all the pieces of the EXEC mask");

  bool IsSibCall = false;

    for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)

                              "unsupported call to variadic function ");

          "unsupported required tail call to function ");

        Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);

           "site marked musttail or on llvm.amdgcn.cs.chain");

  if (!TailCallOpt && IsTailCall)
  if (!IsSibCall || IsChainCallConv) {

    RegsToPass.emplace_back(IsChainCallConv
                                ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                                : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,

  MVT PtrVT = MVT::i32;

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {

      RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));

      int32_t Offset = LocMemOffset;

      unsigned OpSize = Flags.isByVal() ?

                              ? Flags.getNonZeroByValAlign()

      if (Outs[i].Flags.isByVal()) {

            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);

                      Outs[i].Flags.getNonZeroByValAlign(),

            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);

  if (!MemOpChains.empty())

  for (auto &RegToPass : RegsToPass) {
    Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
                             RegToPass.second, InGlue);

  if (IsTailCall && !IsSibCall) {

  std::vector<SDValue> Ops;
  Ops.push_back(Chain);
  Ops.push_back(Callee);

  if (IsChainCallConv)
    Ops.push_back(RequestedExec.Node);

  for (auto &RegToPass : RegsToPass) {
    Ops.push_back(DAG.getRegister(RegToPass.first,
                                  RegToPass.second.getValueType()));

  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");

  Ops.push_back(InGlue);

    return DAG.getNode(OPC, DL, NodeTys, Ops);

  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);

  uint64_t CalleePopBytes = NumBytes;
  EVT VT = Op.getValueType();

  MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();

  Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize);
  if (Alignment && *Alignment > StackAlign) {

  if (isa<ConstantSDNode>(Size))

  if (Op.getValueType() != MVT::i32)

  assert(Op.getValueType() == MVT::i32);

                            Op.getOperand(0), IntrinID, GetRoundBothImm);

  SDValue RoundModeTimesNumBits =

                                  TableEntry, EnumOffset);

  if (Op->isDivergent())

  switch (cast<MemSDNode>(Op)->getAddressSpace()) {

  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = Src.getValueType();

  EVT DstVT = Op.getValueType();

  if (Op.getValueType() != MVT::i64)

                                  Op.getOperand(0), IntrinID, ModeHwRegImm);

                                  Op.getOperand(0), IntrinID, TrapHwRegImm);

  if (Op.getOperand(1).getValueType() != MVT::i64)

                                   ReadFirstLaneID, NewModeReg);

                                   ReadFirstLaneID, NewTrapReg);

  unsigned ModeHwReg =

  unsigned TrapHwReg =

                                  IntrinID, ModeHwRegImm, NewModeReg);

                                  IntrinID, TrapHwRegImm, NewTrapReg);
          .Case("m0", AMDGPU::M0)
          .Case("exec", AMDGPU::EXEC)
          .Case("exec_lo", AMDGPU::EXEC_LO)
          .Case("exec_hi", AMDGPU::EXEC_HI)
          .Case("flat_scratch", AMDGPU::FLAT_SCR)
          .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
          .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)

  if (Reg == AMDGPU::NoRegister) {

  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:

  case AMDGPU::FLAT_SCR:

  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
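// The helpers below implement indirect (dynamically indexed) vector register
// access: when the index is divergent, the block is split and a loop is
// emitted that readfirstlanes one index per iteration, compares it against
// each lane's index, and masks EXEC down to the matching lanes until every
// lane has been serviced.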
static std::pair<MachineBasicBlock *, MachineBasicBlock *>

  auto Next = std::next(I);

  return std::pair(LoopBB, RemainderBB);

  auto I = MI.getIterator();
  auto E = std::next(I);

  Src->setIsKill(false);

  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)

    unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
    unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,

  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
  Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);
  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)

  BuildMI(LoopBB, I, DL,
          TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
                                 : AMDGPU::S_AND_SAVEEXEC_B64),

  MRI.setSimpleHint(NewExec, CondReg);

  if (UseGPRIdxMode) {
      SGPRIdxReg = CurrentIdxReg;

      SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)

      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)

      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)

  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

  BuildMI(LoopBB, I, DL,
          TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
                                 : AMDGPU::S_XOR_B64_term),
          Exec)
    unsigned InitResultReg, unsigned PhiReg, int Offset,
    bool UseGPRIdxMode, Register &SGPRIdxReg) {

  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);

  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode, SGPRIdxReg);

static std::pair<unsigned, int>

  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;

    return std::pair(AMDGPU::sub0, Offset);

  assert(Idx->getReg() != AMDGPU::NoRegister);

    return Idx->getReg();

  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {

    if (UseGPRIdxMode) {

          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

                              UseGPRIdxMode, SGPRIdxReg);
  if (UseGPRIdxMode) {

        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);

    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)

    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)

  MI.eraseFromParent();

  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (Idx->getReg() == AMDGPU::NoRegister) {

    MI.eraseFromParent();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {

    if (UseGPRIdxMode) {

          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);

      const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
          TRI.getRegSizeInBits(*VecRC), 32, false);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(VecRC);

                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {

        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);

    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)

    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
        TRI.getRegSizeInBits(*VecRC), 32, false);
    BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)

  MI.eraseFromParent();
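// Wave-reduction expansion: a WAVE_REDUCE pseudo becomes a scalar loop that
// walks the set bits of EXEC, reads each active lane's value with
// V_READLANE_B32, and folds it into an SGPR accumulator until no active bits
// remain. Illustrative shape of the emitted loop (wave32, min-reduction):
//
//   s_mov_b32      s_acc, <identity>
// loop:
//   s_ff1_i32_b32  s_idx, s_active      ; index of first active lane
//   v_readlane_b32 s_val, v_src, s_idx  ; fetch that lane's value
//   s_min_u32      s_acc, s_acc, s_val  ; accumulate
//   s_bitset0_b32  s_active, s_idx      ; retire the lane
//   s_cmp_lg_u32   s_active, 0
//   s_cbranch_scc1 loop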
  bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));

    Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
    Register InitalValReg = MRI.createVirtualRegister(DstRegClass);

    Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
    Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
    Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);

    Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
    Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);

    bool IsWave32 = ST.isWave32();
    unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

        (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;

    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)

    I = ComputeLoop->end();

        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)

        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
            .addReg(TmpSReg->getOperand(0).getReg())

    unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
    auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
                   .addReg(ActiveBits->getOperand(0).getReg());
    auto LaneValue = BuildMI(*ComputeLoop, I, DL,
                             TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
                         .addReg(SrcReg)
                         .addReg(FF1->getOperand(0).getReg());
    auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
                              .addReg(Accumulator->getOperand(0).getReg())
                              .addReg(LaneValue->getOperand(0).getReg());

    unsigned BITSETOpc =
        IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
    auto NewActiveBits =
        BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
            .addReg(FF1->getOperand(0).getReg())
            .addReg(ActiveBits->getOperand(0).getReg());

    Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
        .addMBB(ComputeLoop);
    ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
        .addMBB(ComputeLoop);

    unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
        .addReg(NewActiveBits->getOperand(0).getReg())
    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))

  MI.eraseFromParent();
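// EmitInstrWithCustomInserter: expand pseudo instructions that need custom
// MachineIR, e.g. 64-bit add/sub pairs with carry, indirect vector access,
// and the SI_CALL / kill pseudos handled in the switch below.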
  switch (MI.getOpcode()) {
  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:

  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:

  case AMDGPU::S_UADDO_PSEUDO:
  case AMDGPU::S_USUBO_PSEUDO: {

    unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
                       ? AMDGPU::S_ADD_I32
                       : AMDGPU::S_SUB_I32;

    MI.eraseFromParent();

  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO: {

    bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);

      unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

    MI.eraseFromParent();
  case AMDGPU::V_ADD_U64_PSEUDO:
  case AMDGPU::V_SUB_U64_PSEUDO: {

    bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);

    if (IsAdd && ST.hasLshlAddB64()) {

      TII->legalizeOperands(*Add);
      MI.eraseFromParent();

    const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    Register CarryReg = MRI.createVirtualRegister(CarryRC);
    Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);

                                   : &AMDGPU::VReg_64RegClass;
                                   : &AMDGPU::VReg_64RegClass;

        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);

        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);

    unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;

    unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;

    TII->legalizeOperands(*LoHalf);
    TII->legalizeOperands(*HiHalf);
    MI.eraseFromParent();
  case AMDGPU::S_ADD_CO_PSEUDO:
  case AMDGPU::S_SUB_CO_PSEUDO: {

    unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
                       ? AMDGPU::S_ADDC_U32
                       : AMDGPU::S_SUBB_U32;

      Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)

      Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)

    Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)

    unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
    assert(WaveSize == 64 || WaveSize == 32);

    if (WaveSize == 64) {
      if (ST.hasScalarCompareEq64()) {

            TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);

            MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
            MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
        Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)

        (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;

    MI.eraseFromParent();
  case AMDGPU::SI_INIT_M0: {
            TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .add(MI.getOperand(0));
    MI.eraseFromParent();

  case AMDGPU::GET_GROUPSTATICSIZE: {
        .add(MI.getOperand(0))
    MI.eraseFromParent();

  case AMDGPU::GET_SHADERCYCLESHILO: {

    using namespace AMDGPU::Hwreg;
    Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
    Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));

    Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        .add(MI.getOperand(0))

    MI.eraseFromParent();
  case AMDGPU::SI_INDIRECT_SRC_V1:
  case AMDGPU::SI_INDIRECT_SRC_V2:
  case AMDGPU::SI_INDIRECT_SRC_V4:
  case AMDGPU::SI_INDIRECT_SRC_V8:
  case AMDGPU::SI_INDIRECT_SRC_V9:
  case AMDGPU::SI_INDIRECT_SRC_V10:
  case AMDGPU::SI_INDIRECT_SRC_V11:
  case AMDGPU::SI_INDIRECT_SRC_V12:
  case AMDGPU::SI_INDIRECT_SRC_V16:
  case AMDGPU::SI_INDIRECT_SRC_V32:

  case AMDGPU::SI_INDIRECT_DST_V1:
  case AMDGPU::SI_INDIRECT_DST_V2:
  case AMDGPU::SI_INDIRECT_DST_V4:
  case AMDGPU::SI_INDIRECT_DST_V8:
  case AMDGPU::SI_INDIRECT_DST_V9:
  case AMDGPU::SI_INDIRECT_DST_V10:
  case AMDGPU::SI_INDIRECT_DST_V11:
  case AMDGPU::SI_INDIRECT_DST_V12:
  case AMDGPU::SI_INDIRECT_DST_V16:
  case AMDGPU::SI_INDIRECT_DST_V32:

  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
  case AMDGPU::SI_KILL_I1_PSEUDO:
  case AMDGPU::V_CNDMASK_B64_PSEUDO: {

    Register SrcCond = MI.getOperand(3).getReg();

    Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
    Register SrcCondCopy = MRI.createVirtualRegister(CondRC);

                                   : &AMDGPU::VReg_64RegClass;
                                   : &AMDGPU::VReg_64RegClass;

        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);

        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);

    MI.eraseFromParent();
  case AMDGPU::SI_BR_UNDEF: {
        .add(MI.getOperand(0));
    MI.eraseFromParent();

  case AMDGPU::ADJCALLSTACKUP:
  case AMDGPU::ADJCALLSTACKDOWN: {

  case AMDGPU::SI_CALL_ISEL: {

    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);

    MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);

    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {

    if (Def->getOpcode() != TargetOpcode::IMPLICIT_DEF) {

    MI.eraseFromParent();
  case AMDGPU::V_ADD_CO_U32_e32:
  case AMDGPU::V_SUB_CO_U32_e32:
  case AMDGPU::V_SUBREV_CO_U32_e32: {

    unsigned Opc = MI.getOpcode();

    bool NeedClampOperand = false;
    if (TII->pseudoToMCOpcode(Opc) == -1) {
      NeedClampOperand = true;

    if (TII->isVOP3(*I)) {

    I.add(MI.getOperand(1))
        .add(MI.getOperand(2));
    if (NeedClampOperand)

    TII->legalizeOperands(*I);

    MI.eraseFromParent();

  case AMDGPU::V_ADDC_U32_e32:
  case AMDGPU::V_SUBB_U32_e32:
  case AMDGPU::V_SUBBREV_U32_e32:

    TII->legalizeOperands(MI);

  case AMDGPU::DS_GWS_INIT:
  case AMDGPU::DS_GWS_SEMA_BR:
  case AMDGPU::DS_GWS_BARRIER:
    TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);

  case AMDGPU::DS_GWS_SEMA_V:
  case AMDGPU::DS_GWS_SEMA_P:
  case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
  case AMDGPU::S_SETREG_B32: {

    const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
    const unsigned SetMask = WidthMask << Offset;

    unsigned SetDenormOp = 0;
    unsigned SetRoundOp = 0;

        SetRoundOp = AMDGPU::S_ROUND_MODE;
        SetDenormOp = AMDGPU::S_DENORM_MODE;

        SetRoundOp = AMDGPU::S_ROUND_MODE;
        SetDenormOp = AMDGPU::S_DENORM_MODE;

    if (SetRoundOp || SetDenormOp) {

      if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
        unsigned ImmVal = Def->getOperand(1).getImm();

          MI.eraseFromParent();

    MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));

  case AMDGPU::S_INVERSE_BALLOT_U32:
  case AMDGPU::S_INVERSE_BALLOT_U64: {

    const Register DstReg = MI.getOperand(0).getReg();
    Register MaskReg = MI.getOperand(1).getReg();

    const bool IsVALU = TRI->isVectorRegister(MRI, MaskReg);

      MaskReg = TII->readlaneVGPRToSGPR(MaskReg, MI, MRI);

    MI.eraseFromParent();

  case AMDGPU::ENDPGM_TRAP: {

      MI.setDesc(TII->get(AMDGPU::S_ENDPGM));

    MI.eraseFromParent();
  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;

  EVT VT = N->getValueType(0);

  if (VT == MVT::f16) {

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
         VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
         VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
         VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
         VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
         VT == MVT::v32bf16);

                      : std::pair(Op0, Op0);
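// LowerOperation: central dispatch for custom-lowered SDNodes; each case
// forwards to the dedicated lower* helper for that opcode.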
  switch (Op.getOpcode()) {

    assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");

    EVT VT = Op.getValueType();

      return lowerFSQRTF32(Op, DAG);

      return lowerFSQRTF64(Op, DAG);

    return LowerTrig(Op, DAG);

    return LowerGlobalAddress(MFI, Op, DAG);

    return lowerINSERT_SUBVECTOR(Op, DAG);

    return lowerINSERT_VECTOR_ELT(Op, DAG);

    return lowerEXTRACT_VECTOR_ELT(Op, DAG);

    return lowerVECTOR_SHUFFLE(Op, DAG);

    return lowerSCALAR_TO_VECTOR(Op, DAG);

    return lowerBUILD_VECTOR(Op, DAG);

    return lowerFP_ROUND(Op, DAG);

    if (Op.getOperand(0)->getValueType(0) != MVT::f32)

    int RoundMode = Op.getConstantOperandVal(1);

    return DAG.getNode(Opc, DL, Op.getNode()->getVTList(), Op->getOperand(0));

    return lowerTRAP(Op, DAG);

    return lowerDEBUGTRAP(Op, DAG);

    return lowerFMINNUM_FMAXNUM(Op, DAG);

    return lowerFLDEXP(Op, DAG);

    return lowerMUL(Op, DAG);

    return lowerXMULO(Op, DAG);

    return lowerXMUL_LOHI(Op, DAG);
  EVT FittingLoadVT = LoadVT;
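// adjustLoadValueType: widen or repack an intrinsic load's result type (the
// d16 and sub-dword cases) so the memory node carries a type the hardware
// actually returns, then convert the loaded value back to the original VT.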
SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,

                                              bool IsIntrinsic) const {

  EVT LoadVT = M->getValueType(0);

  EVT EquivLoadVT = LoadVT;

      VTList, Ops, M->getMemoryVT(), M->getMemOperand());

  EVT LoadVT = M->getValueType(0);

  assert(M->getNumValues() == 2 || M->getNumValues() == 3);
  bool IsTFE = M->getNumValues() == 3;

    return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand());

    return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
                               M->getMemOperand(), DAG);

  SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
                                        M->getMemOperand(), DAG);
  EVT VT = N->getValueType(0);
  unsigned CondCode = N->getConstantOperandVal(3);

  EVT CmpVT = LHS.getValueType();
  if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {

  EVT VT = N->getValueType(0);

  unsigned CondCode = N->getConstantOperandVal(3);

  if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {

  EVT VT = N->getValueType(0);

                     Src.getOperand(1), Src.getOperand(2));

    Exec = AMDGPU::EXEC_LO;

    Exec = AMDGPU::EXEC;
  switch (N->getOpcode()) {

    unsigned IID = N->getConstantOperandVal(0);

    case Intrinsic::amdgcn_make_buffer_rsrc:
      Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));

    case Intrinsic::amdgcn_cvt_pkrtz: {

    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16: {

      if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)

      else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)

      else if (IID == Intrinsic::amdgcn_cvt_pk_i16)

      EVT VT = N->getValueType(0);

    case Intrinsic::amdgcn_s_buffer_load: {

      EVT VT = Op.getValueType();
      assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");

      if (!Offset->isDivergent()) {

      LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);

      for (unsigned I = 0; I < Res.getNumOperands(); I++) {
        Results.push_back(Res.getOperand(I));

      Results.push_back(Res.getValue(1));
    EVT VT = N->getValueType(0);

    EVT SelectVT = NewVT;
    if (NewVT.bitsLT(MVT::i32)) {

      SelectVT = MVT::i32;

    if (NewVT != SelectVT)

    if (N->getValueType(0) != MVT::v2f16)

    if (N->getValueType(0) != MVT::v2f16)

    if (N->getValueType(0) != MVT::f16)

    if (I.getUse().get() != Value)

    if (I->getOpcode() == Opcode)
unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {

  switch (Intr->getConstantOperandVal(1)) {
  case Intrinsic::amdgcn_if:

  case Intrinsic::amdgcn_else:

  case Intrinsic::amdgcn_loop:

  case Intrinsic::amdgcn_end_cf:

  SDNode *Intr = BRCOND.getOperand(1).getNode();

    assert(BR && "brcond missing unconditional branch user");
    Target = BR->getOperand(1);

  unsigned CFNode = isCFIntrinsic(Intr);

  Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());

  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {

                    Intr->getOperand(0));
  MVT VT = Op.getSimpleValueType();

  if (Op.getConstantOperandVal(0) != 0)

  if (Info->isEntryFunction())

  return Op.getValueType().bitsLE(VT) ?

  assert(Op.getValueType() == MVT::f16 &&
         "Do not know how to custom lower FP_ROUND for non-f16 type");

  EVT SrcVT = Src.getValueType();
  if (SrcVT != MVT::f64)

  EVT VT = Op.getValueType();

  bool IsIEEEMode = Info->getMode().IEEE;

  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||

  EVT VT = Op.getValueType();

  EVT ExpVT = Exp.getValueType();
  if (ExpVT == MVT::i16)

                     {Op.getOperand(0), Op.getOperand(1), TruncExp});

  EVT VT = Op.getValueType();

  assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");

  if (Op->isDivergent())

  if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
    return SDValue(
        DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);

  if (Op0SignBits >= 33 && Op1SignBits >= 33)
    return SDValue(
        DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
  EVT VT = Op.getValueType();

    const APInt &C = RHSC->getAPIntValue();

    if (C.isPowerOf2()) {

      bool UseArithShift = isSigned && !C.isMinSignedValue();

                                  SL, VT, Result, ShiftAmt),

  if (Op->isDivergent()) {

    return lowerTrapEndpgm(Op, DAG);

             lowerTrapHsaQueuePtr(Op, DAG);

SDValue SITargetLowering::lowerTrapEndpgm(

    const SDLoc &DL, Align Alignment, ImplicitParameter Param) const {

SDValue SITargetLowering::lowerTrapHsaQueuePtr(

        loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);

    if (UserSGPR == AMDGPU::NoRegister) {

SDValue SITargetLowering::lowerTrapHsa(

                                     "debugtrap handler not supported",
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,

                            ? AMDGPU::SRC_SHARED_BASE
                            : AMDGPU::SRC_PRIVATE_BASE;

        {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));

    return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);

  if (UserSGPR == AMDGPU::NoRegister) {

      DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);

  if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
      isa<BasicBlockSDNode>(Val))

  if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
    return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);

  unsigned DestAS, SrcAS;

  bool IsNonNull = false;
  if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
    SrcAS = ASC->getSrcAddressSpace();
    Src = ASC->getOperand(0);
    DestAS = ASC->getDestAddressSpace();

           Op.getConstantOperandVal(0) ==
               Intrinsic::amdgcn_addrspacecast_nonnull);
    Src = Op->getOperand(1);
    SrcAS = Op->getConstantOperandVal(2);
    DestAS = Op->getConstantOperandVal(3);

    unsigned NullVal = TM.getNullPointerValue(DestAS);

    SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);

    unsigned NullVal = TM.getNullPointerValue(SrcAS);

      Op.getValueType() == MVT::i64) {

      Src.getValueType() == MVT::i64)
  EVT InsVT = Ins.getValueType();

  unsigned IdxVal = Idx->getAsZExtVal();

    assert(InsNumElts % 2 == 0 && "expect legal vector types");

    EVT NewInsVT = InsNumElts == 2 ? MVT::i32
                                   : EVT::getVectorVT(*DAG.getContext(),
                                                      MVT::i32, InsNumElts / 2);

    for (unsigned I = 0; I != InsNumElts / 2; ++I) {

      if (InsNumElts == 2) {

  for (unsigned I = 0; I != InsNumElts; ++I) {

  auto KIdx = dyn_cast<ConstantSDNode>(Idx);
  if (NumElts == 4 && EltSize == 16 && KIdx) {

    unsigned Idx = KIdx->getZExtValue();
    bool InsertLo = Idx < 2;

                                 InsertLo ? LoVec : HiVec,

  if (isa<ConstantSDNode>(Idx))

  assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");

  const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);

                            DAG.getNOT(SL, BFM, IntVT), BCVec);
  EVT ResultVT = Op.getValueType();

  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))

  if (VecSize == 128 || VecSize == 256 || VecSize == 512) {

    if (VecSize == 128) {

    } else if (VecSize == 256) {

      for (unsigned P = 0; P < 4; ++P) {

                                Parts[0], Parts[1]));
                                Parts[2], Parts[3]));

      for (unsigned P = 0; P < 8; ++P) {

                                Parts[0], Parts[1], Parts[2], Parts[3]));
                                Parts[4], Parts[5], Parts[6], Parts[7]));

  EVT IdxVT = Idx.getValueType();

  Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);

  if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {

  return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
  EVT ResultVT = Op.getValueType();

  EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;

  int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();

      int VecIdx = Idx < SrcNumElts ? 0 : 1;
      int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;

      int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
      int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
      int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
      int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;

  EVT ResultVT = Op.getValueType();

  EVT VT = Op.getValueType();

  if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
      VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {

                       {CastLo, CastHi});
  if (VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16) {

      for (unsigned P = 0; P < 4; ++P)
        Parts[P].push_back(Op.getOperand(I + P * E));

    for (unsigned P = 0; P < 4; ++P) {

  if (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v32bf16) {

      for (unsigned P = 0; P < 8; ++P)
        Parts[P].push_back(Op.getOperand(I + P * E));

    for (unsigned P = 0; P < 8; ++P) {

  assert(VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16);

  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");

  EVT PtrVT = Op.getValueType();

  assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
    SDValue Param = lowerKernargMemParameter(

                              "non-hsa intrinsic with hsa target",

                              "intrinsic not supported on subtarget",

  unsigned NumElts = Elts.size();

  if (NumElts <= 12) {

  for (unsigned i = 0; i < Elts.size(); ++i) {

  for (unsigned i = Elts.size(); i < NumElts; ++i)
    VecElts[i] = DAG.getUNDEF(MVT::f32);

  EVT SrcVT = Src.getValueType();

                               bool Unpacked, bool IsD16, int DMaskPop,
                               int NumVDataDwords, bool IsAtomicPacked16Bit,
  EVT ReqRetVT = ResultTypes[0];

  int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
                          ? (ReqRetNumElts + 1) / 2
                          : ReqRetNumElts;

  int MaskPopDwords = (!IsD16 || (IsD16 && Unpacked)) ? DMaskPop
                                                      : (DMaskPop + 1) / 2;

  MVT DataDwordVT = NumDataDwords == 1 ?

  MVT MaskPopVT = MaskPopDwords == 1 ?

  if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {

  if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
                          NumDataDwords - MaskPopDwords);

  EVT LegalReqRetVT = ReqRetVT;

    if (!Data.getValueType().isInteger())
                         Data.getValueType().changeTypeToInteger(), Data);

  if (Result->getNumValues() == 1)

                         SDValue *LWE, bool &IsTexFail) {

  auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());

                                unsigned DimIdx, unsigned EndIdx,
                                unsigned NumGradients) {

  for (unsigned I = DimIdx; I < EndIdx; I++) {

    if (((I + 1) >= EndIdx) ||
        ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
                                         I == DimIdx + NumGradients - 1))) {
      if (Addr.getValueType() != MVT::i16)
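// lowerImage: translate an image intrinsic into a MIMG machine node. Most of
// the work is packing the dmask, the (possibly 16-bit A16/G16) addresses, the
// texfail controls, and the return types into the operand layout the chosen
// MIMG encoding expects.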
  unsigned IntrOpcode = Intr->BaseOpcode;

  bool AdjustRetType = false;
  bool IsAtomicPacked16Bit = false;

  const unsigned ArgOffset = WithChain ? 2 : 1;

  unsigned DMaskLanes = 0;

  if (BaseOpcode->Atomic) {
    VData = Op.getOperand(2);

    IsAtomicPacked16Bit =
        (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
         Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);

    if (BaseOpcode->AtomicX2) {

      ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
      DMask = Is64Bit ? 0xf : 0x3;
      NumVDataDwords = Is64Bit ? 4 : 2;

      DMask = Is64Bit ? 0x3 : 0x1;
      NumVDataDwords = Is64Bit ? 2 : 1;

    DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);

    if (BaseOpcode->Store) {
      VData = Op.getOperand(2);

        VData = handleD16VData(VData, DAG, true);

        (!LoadVT.isVector() && DMaskLanes > 1))

          NumVDataDwords = (DMaskLanes + 1) / 2;

          NumVDataDwords = DMaskLanes;

        AdjustRetType = true;

  unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;

        Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();

    MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
    IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;

    VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();

    MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
    IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
  for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
    if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
      assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");

          {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});

             "Bias needs to be converted to 16 bit in A16 mode");

  if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {

        dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
                  "require 16 bit args for both gradients and addresses");

    if (!ST->hasA16()) {
      LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
                           "support 16 bit addresses\n");

  if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {

    IntrOpcode = G16MappingInfo->G16;

                                 ArgOffset + Intr->GradientStart,
                                 ArgOffset + Intr->CoordStart,
                                 Intr->NumGradients);
      for (unsigned I = ArgOffset + Intr->GradientStart;
           I < ArgOffset + Intr->CoordStart; I++)

                          ArgOffset + Intr->CoordStart, VAddrEnd,

    for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)

  const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
  const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
  const bool UseNSA = ST->hasNSAEncoding() &&
                      VAddrs.size() >= ST->getNSAThreshold(MF) &&
                      (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
  const bool UsePartialNSA =
      UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;

  if (UsePartialNSA) {

                ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));

  if (!BaseOpcode->Sampler) {

        Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);

    Unorm = UnormConst ? True : False;
7815 SDValue TexFail =
Op.getOperand(ArgOffset +
Intr->TexFailCtrlIndex);
7816 bool IsTexFail =
false;
7817 if (!
parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
7828 NumVDataDwords += 1;
7829 AdjustRetType =
true;
7834 if (AdjustRetType) {
7836 if (DMaskLanes == 0 && !BaseOpcode->Store) {
7839 if (isa<MemSDNode>(
Op))
7844 EVT NewVT = NumVDataDwords > 1 ?
7848 ResultTypes[0] = NewVT;
7849 if (ResultTypes.size() == 3) {
7853 ResultTypes.erase(&ResultTypes[1]);
  unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
  if (BaseOpcode->Atomic)
    CPol |= AMDGPU::CPol::GLC; // TODO: no-return optimization
  // ...

  SmallVector<SDValue, 26> Ops;
  if (BaseOpcode->Store || BaseOpcode->Atomic)
    Ops.push_back(VData); // vdata
  if (UsePartialNSA) {
    append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
    Ops.push_back(VAddr);
  } else if (UseNSA)
    append_range(Ops, VAddrs);
  else
    Ops.push_back(VAddr);
  // ...
  if (BaseOpcode->Sampler)
    Ops.push_back(Op.getOperand(ArgOffset + Intr->SampIndex));
  // ...
  if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
    Ops.push_back(Unorm);
  // ...
  Ops.push_back(IsA16 && // r128, a16 for gfx9
                        ST->hasFeature(AMDGPU::FeatureR128A16)
                    ? True
                    : False);
  // ...
  if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
    Ops.push_back(LWE); // lwe
  // ...
  if (BaseOpcode->HasD16)
    Ops.push_back(IsD16 ? True : False);
  if (isa<MemSDNode>(Op))
    Ops.push_back(Op.getOperand(0)); // chain

  int NumVAddrDwords =
      UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
  int Opcode = -1;

  if (IsGFX12Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX11Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx11NSA
                                          : AMDGPU::MIMGEncGfx11Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX10Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx10NSA
                                          : AMDGPU::MIMGEncGfx10Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    // Pre-gfx10 encodings.
    // ...
    if (Opcode == -1)
      report_fatal_error(
          "requested image instruction is not supported on this GPU");
    // ...
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
                                   NumVDataDwords, NumVAddrDwords);
    // ...
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
                                   NumVDataDwords, NumVAddrDwords);
  }
  if (Opcode == -1)
    return Op;

  MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
  if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
    MachineMemOperand *MemRef = MemOp->getMemOperand();
    DAG.setNodeMemRefs(NewNode, {MemRef});
  }

  if (BaseOpcode->AtomicX2) {
    SmallVector<SDValue, 1> Elt;
    DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
    return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
  }
  if (BaseOpcode->Store)
    return SDValue(NewNode, 0);
  return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
                           Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
                           NumVDataDwords, IsAtomicPacked16Bit, DL);
}
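// [Editor's illustrative sketch, not part of the original source.] The dword
// count for returned image data follows from the dmask popcount, halved when
// D16 values are packed two per dword, plus one extra dword when TFE/LWE
// request the texfail status word:
static unsigned numVDataDwordsSketch(unsigned DMask, bool IsD16,
                                     bool PackedD16, bool IsTexFail) {
  unsigned Lanes = 0;
  for (unsigned I = 0; I < 4; ++I) // popcount of the 4-bit dmask
    Lanes += (DMask >> I) & 1;
  unsigned Dwords = (IsD16 && PackedD16) ? (Lanes + 1) / 2 : Lanes;
  return Dwords + (IsTexFail ? 1 : 0);
}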
SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
                                       SDValue Offset, SDValue CachePolicy,
                                       SelectionDAG &DAG) const {
  // ...
  if (!Offset->isDivergent()) {
    // ...
    return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
  }

  // We have a divergent offset. Emit a MUBUF buffer load instead. We can
  // assume that the buffer is unswizzled.
  unsigned NumLoads = 1;
  // ...
  if (NumElts == 8 || NumElts == 16) {
    NumLoads = NumElts / 4;
    // ...
  }
  // ...
  setBufferOffsets(Offset, DAG, &Ops[3],
                   NumLoads > 1 ? Align(16 * NumLoads) : Align(4));

  for (unsigned i = 0; i < NumLoads; ++i) {
    // ...
  }

  if (NumElts == 8 || NumElts == 16)
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
  // ...
}

SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                  SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  auto *MFI = MF.getInfo<SIMachineFunctionInfo>();

  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  unsigned IntrinsicID = Op.getConstantOperandVal(0);

  switch (IntrinsicID) {
  case Intrinsic::amdgcn_implicit_buffer_ptr: {
    // ...
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  }
  case Intrinsic::amdgcn_dispatch_ptr:
  case Intrinsic::amdgcn_queue_ptr: {
    if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
      DiagnosticInfoUnsupported BadIntrin(
          MF.getFunction(), "unsupported hsa intrinsic without hsa target",
          DL.getDebugLoc());
      DAG.getContext()->diagnose(BadIntrin);
      return DAG.getUNDEF(VT);
    }

    auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
                     ? AMDGPUFunctionArgInfo::DISPATCH_PTR
                     : AMDGPUFunctionArgInfo::QUEUE_PTR;
    return getPreloadedValue(DAG, *MFI, VT, RegID);
  }
  case Intrinsic::amdgcn_implicitarg_ptr: {
    if (MFI->isEntryFunction())
      return getImplicitArgPtr(DAG, DL);
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr: {
    if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      return DAG.getConstant(0, DL, VT);
    }
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  }
  case Intrinsic::amdgcn_dispatch_id: {
    return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
  }
  case Intrinsic::amdgcn_rcp:
    return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_rsq:
    return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_rsq_legacy:
    if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return emitRemovedIntrinsicError(DAG, DL, VT);
    return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_rcp_legacy:
    if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return emitRemovedIntrinsicError(DAG, DL, VT);
    return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_rsq_clamp: {
    // ...
  }
  case Intrinsic::r600_read_ngroups_x:
    // ...
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::NGROUPS_X,
                                    Align(4), false);
  case Intrinsic::r600_read_ngroups_y:
    // ...
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::NGROUPS_Y,
                                    Align(4), false);
  case Intrinsic::r600_read_ngroups_z:
    // ...
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::NGROUPS_Z,
                                    Align(4), false);
  case Intrinsic::r600_read_global_size_x:
    // ...
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::GLOBAL_SIZE_X,
                                    Align(4), false);
  case Intrinsic::r600_read_global_size_y:
    // ...
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::GLOBAL_SIZE_Y,
                                    Align(4), false);
  case Intrinsic::r600_read_global_size_z:
    // ...
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    SI::KernelInputOffsets::GLOBAL_SIZE_Z,
                                    Align(4), false);
  case Intrinsic::r600_read_local_size_x:
    // ...
    return lowerImplicitZextParam(DAG, Op, MVT::i16,
                                  SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    // ...
    return lowerImplicitZextParam(DAG, Op, MVT::i16,
                                  SI::KernelInputOffsets::LOCAL_SIZE_Y);
  case Intrinsic::r600_read_local_size_z:
    // ...
    return lowerImplicitZextParam(DAG, Op, MVT::i16,
                                  SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_wave_id:
    return lowerWaveID(DAG, Op);
  case Intrinsic::amdgcn_lds_kernel_id: {
    if (MFI->isEntryFunction())
      return getLDSKernelId(DAG, DL);
    return getPreloadedValue(DAG, *MFI, VT,
                             AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
  }
  case Intrinsic::amdgcn_workitem_id_x:
    return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
  case Intrinsic::amdgcn_workitem_id_y:
    return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
  case Intrinsic::amdgcn_workitem_id_z:
    return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
  case Intrinsic::amdgcn_wavefrontsize:
    // ...
  case Intrinsic::amdgcn_s_buffer_load: {
    unsigned CPol = Op.getConstantOperandVal(3);
    // ...
    return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
                        Op.getOperand(3), DAG);
  }
  case Intrinsic::amdgcn_fdiv_fast:
    return lowerFDIV_FAST(Op, DAG);
  case Intrinsic::amdgcn_sin:
    return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_cos:
    return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_mul_u24:
    return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
                       Op.getOperand(2));
  case Intrinsic::amdgcn_mul_i24:
    return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
                       Op.getOperand(2));
  case Intrinsic::amdgcn_log_clamp: {
    // ...
  }
  case Intrinsic::amdgcn_fract:
    return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_class:
    return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
                       Op.getOperand(2));
  case Intrinsic::amdgcn_div_fmas:
    return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
  case Intrinsic::amdgcn_div_fixup:
    return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_div_scale: {
    const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));

    // Translate to the operands expected by the machine instruction. The
    // first parameter must be the same as the first instruction.
    SDValue Numerator = Op.getOperand(1);
    SDValue Denominator = Op.getOperand(2);

    SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;

    return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
                       Denominator, Numerator);
  }
  case Intrinsic::amdgcn_icmp: {
    // There is a Pat that handles this variant, so return it as-is.
    if (Op.getOperand(1).getValueType() == MVT::i1 &&
        Op.getConstantOperandVal(2) == 0 &&
        Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
      return Op;
    return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
  }
  case Intrinsic::amdgcn_fcmp: {
    return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
  }
  case Intrinsic::amdgcn_ballot:
    return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
  case Intrinsic::amdgcn_fmed3:
    return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_fdot2:
    return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
  case Intrinsic::amdgcn_fmul_legacy:
    return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
                       Op.getOperand(2));
  case Intrinsic::amdgcn_sffbh:
    return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_sbfe:
    return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_ubfe:
    return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_cvt_pkrtz:
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    EVT VT = Op.getValueType();
    unsigned Opcode;

    if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
      Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
      Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
      Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
      Opcode = AMDGPUISD::CVT_PK_I16_I32;
    else
      Opcode = AMDGPUISD::CVT_PK_U16_U32;

    if (isTypeLegal(VT))
      return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));

    SDValue Node = DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1),
                               Op.getOperand(2));
    return DAG.getNode(ISD::BITCAST, DL, VT, Node);
  }
  case Intrinsic::amdgcn_fmad_ftz:
    return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_if_break:
    return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
                                      Op->getOperand(1), Op->getOperand(2)),
                   0);
  case Intrinsic::amdgcn_groupstaticsize: {
    // ...
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    SDLoc SL(Op);
    unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
                      ? AMDGPUAS::LOCAL_ADDRESS
                      : AMDGPUAS::PRIVATE_ADDRESS;
    SDValue Aperture = getSegmentAperture(AS, SL, DAG);
    // ...
  }
  case Intrinsic::amdgcn_perm:
    return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_reloc_constant: {
    // ...
    auto RelocSymbol = cast<GlobalVariable>(
        M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
    // ...
  }
  case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
    if (Op.getOperand(4).getValueType() == MVT::i32)
      return SDValue();

    SDLoc SL(Op);
    auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
                       {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
                        Op.getOperand(3), IndexKeyi32});
  }
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
    if (Op.getOperand(6).getValueType() == MVT::i32)
      return SDValue();

    SDLoc SL(Op);
    auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
                       {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
                        Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
                        IndexKeyi32, Op.getOperand(7)});
  }
  case Intrinsic::amdgcn_addrspacecast_nonnull:
    return lowerADDRSPACECAST(Op, DAG);
  default:
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
      return lowerImage(Op, ImageDimIntr, DAG, false);
    return Op;
  }
}

// On targets with a restricted SOffset field, a known-zero SOffset can be
// encoded as the null SGPR.
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
                             const GCNSubtarget *Subtarget) {
  if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
    return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
  return SOffset;
}

SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
                                                     SelectionDAG &DAG,
                                                     unsigned NewOpcode) const {
  SDLoc DL(Op);

  SDValue VData = Op.getOperand(2);
  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
  auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
  // ...
  auto *M = cast<MemSDNode>(Op);
  // ...
  return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops,
                                 M->getMemoryVT(), M->getMemOperand());
}

SDValue
SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
                                                unsigned NewOpcode) const {
  SDLoc DL(Op);

  SDValue VData = Op.getOperand(2);
  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
  auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
  // ...
  auto *M = cast<MemSDNode>(Op);
  // ...
  return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops,
                                 M->getMemoryVT(), M->getMemOperand());
}
SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
                                                 SelectionDAG &DAG) const {
  unsigned IntrID = Op.getConstantOperandVal(1);
  SDLoc DL(Op);

  switch (IntrID) {
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    MemSDNode *M = cast<MemSDNode>(Op);
    // ...
    unsigned IndexOperand = M->getConstantOperandVal(7);
    unsigned WaveRelease = M->getConstantOperandVal(8);
    unsigned WaveDone = M->getConstantOperandVal(9);

    unsigned OrderedCountIndex = IndexOperand & 0x3f;
    IndexOperand &= ~0x3f;
    unsigned CountDw = 0;

    if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
      CountDw = (IndexOperand >> 24) & 0xf;
      IndexOperand &= ~(0xf << 24);

      if (CountDw < 1 || CountDw > 4) {
        report_fatal_error(
            "ds_ordered_count: dword count must be between 1 and 4");
      }
    }
    // ...
    if (WaveDone && !WaveRelease)
      report_fatal_error("ds_ordered_count: wave_done requires wave_release");

    unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
    unsigned ShaderType =
        SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
    unsigned Offset0 = OrderedCountIndex << 2;
    unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);

    if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
      Offset1 |= (CountDw - 1) << 6;

    if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
      Offset1 |= ShaderType << 2;

    unsigned Offset = Offset0 | (Offset1 << 8);
    // ...
    return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
                                   M->getVTList(), Ops, M->getMemoryVT(),
                                   M->getMemOperand());
  }
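// [Editor's illustrative sketch, not part of the original source.] The
// ds_ordered_count offset immediate packs several controls into one field,
// mirroring the Offset0/Offset1 arithmetic above (dword count is gfx10+,
// shader type is pre-gfx11):
#include <cstdint>
static unsigned dsOrderedOffsetSketch(unsigned OrderedCountIndex,
                                      bool WaveRelease, bool WaveDone,
                                      bool IsSwap, unsigned ShaderType,
                                      unsigned CountDw /* 1..4 */) {
  unsigned Offset0 = OrderedCountIndex << 2;
  unsigned Offset1 = (WaveRelease ? 1 : 0) | ((WaveDone ? 1 : 0) << 1) |
                     (ShaderType << 2) | ((IsSwap ? 1 : 0) << 4) |
                     ((CountDw - 1) << 6);
  return Offset0 | (Offset1 << 8);
}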
  case Intrinsic::amdgcn_ds_fadd: {
    MemSDNode *M = cast<MemSDNode>(Op);
    unsigned Opc;
    switch (IntrID) {
    case Intrinsic::amdgcn_ds_fadd:
      Opc = ISD::ATOMIC_LOAD_FADD;
      break;
    }

    return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(), M->getOperand(0),
                         M->getOperand(2), M->getOperand(3),
                         M->getMemOperand());
  }
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    MemSDNode *M = cast<MemSDNode>(Op);
    unsigned Opc;
    switch (IntrID) {
    case Intrinsic::amdgcn_ds_fmin:
      Opc = ISD::ATOMIC_LOAD_FMIN;
      break;
    case Intrinsic::amdgcn_ds_fmax:
      Opc = ISD::ATOMIC_LOAD_FMAX;
      break;
    default:
      llvm_unreachable("Unknown intrinsic!");
    }
    // ...
    return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_buffer_load:
  case Intrinsic::amdgcn_buffer_load_format: {
    unsigned Glc = Op.getConstantOperandVal(5);
    unsigned Slc = Op.getConstantOperandVal(6);
    // ...
    setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
    // ...
    unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load)
                       ? AMDGPUISD::BUFFER_LOAD
                       : AMDGPUISD::BUFFER_LOAD_FORMAT;

    EVT VT = Op.getValueType();
    EVT IntVT = VT.changeTypeToInteger();
    auto *M = cast<MemSDNode>(Op);
    EVT LoadVT = Op.getValueType();

    if (LoadVT.getScalarType() == MVT::i8 ||
        LoadVT.getScalarType() == MVT::i16)
      return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops,
                                        M->getMemOperand());

    return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
                               M->getMemOperand(), DAG);
  }
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
    const bool IsFormat =
        IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
        IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;

    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
    // ...
    auto *M = cast<MemSDNode>(Op);
    return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
  }
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format: {
    const bool IsFormat =
        IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
        IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;

    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
    // ...
    return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
  }
  case Intrinsic::amdgcn_tbuffer_load: {
    MemSDNode *M = cast<MemSDNode>(Op);
    EVT LoadVT = Op.getValueType();

    unsigned Dfmt = Op.getConstantOperandVal(7);
    unsigned Nfmt = Op.getConstantOperandVal(8);
    unsigned Glc = Op.getConstantOperandVal(9);
    unsigned Slc = Op.getConstantOperandVal(10);
    // ...
    return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, LoadVT,
                               M->getMemOperand(), DAG);
  }
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
    MemSDNode *M = cast<MemSDNode>(Op);
    EVT LoadVT = Op.getValueType();
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
    // ...
    return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, LoadVT,
                               M->getMemOperand(), DAG);
  }
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
    MemSDNode *M = cast<MemSDNode>(Op);
    EVT LoadVT = Op.getValueType();
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
    // ...
    return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, LoadVT,
                               M->getMemOperand(), DAG);
  }
  case Intrinsic::amdgcn_buffer_atomic_swap:
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_buffer_atomic_csub:
  case Intrinsic::amdgcn_buffer_atomic_smin:
  case Intrinsic::amdgcn_buffer_atomic_umin:
  case Intrinsic::amdgcn_buffer_atomic_smax:
  case Intrinsic::amdgcn_buffer_atomic_umax:
  case Intrinsic::amdgcn_buffer_atomic_and:
  case Intrinsic::amdgcn_buffer_atomic_or:
  case Intrinsic::amdgcn_buffer_atomic_xor:
  case Intrinsic::amdgcn_buffer_atomic_fadd: {
    unsigned Slc = Op.getConstantOperandVal(6);
    // ...
    setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
    // ...
    EVT VT = Op.getValueType();

    auto *M = cast<MemSDNode>(Op);
    unsigned Opcode = 0;

    switch (IntrID) {
    case Intrinsic::amdgcn_buffer_atomic_swap:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
      break;
    case Intrinsic::amdgcn_buffer_atomic_add:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
      break;
    case Intrinsic::amdgcn_buffer_atomic_sub:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
      break;
    case Intrinsic::amdgcn_buffer_atomic_csub:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_CSUB;
      break;
    case Intrinsic::amdgcn_buffer_atomic_smin:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
      break;
    case Intrinsic::amdgcn_buffer_atomic_umin:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
      break;
    case Intrinsic::amdgcn_buffer_atomic_smax:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
      break;
    case Intrinsic::amdgcn_buffer_atomic_umax:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
      break;
    case Intrinsic::amdgcn_buffer_atomic_and:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
      break;
    case Intrinsic::amdgcn_buffer_atomic_or:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
      break;
    case Intrinsic::amdgcn_buffer_atomic_xor:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
      break;
    case Intrinsic::amdgcn_buffer_atomic_fadd:
      Opcode = AMDGPUISD::BUFFER_ATOMIC_FADD;
      break;
    default:
      llvm_unreachable("unhandled atomic opcode");
    }

    return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
    return lowerRawBufferAtomicIntrin(Op, DAG,
                                      AMDGPUISD::BUFFER_ATOMIC_FADD_BF16);
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_FADD);
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_FADD_BF16);
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_FMIN);
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_FMAX);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
    return lowerRawBufferAtomicIntrin(Op, DAG,
                                      AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_SWAP);
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_ADD);
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_SUB);
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_SMIN);
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_UMIN);
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_SMAX);
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_UMAX);
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_AND);
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_XOR);
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_INC);
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_DEC);
  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
    return lowerStructBufferAtomicIntrin(
        Op, DAG, AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
  case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
    unsigned Slc = Op.getConstantOperandVal(7);
    // ...
    setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
    // ...
    EVT VT = Op.getValueType();
    auto *M = cast<MemSDNode>(Op);

    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
                                   Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
    // ...
    EVT VT = Op.getValueType();
    auto *M = cast<MemSDNode>(Op);

    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
                                   Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
    SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
    // ...
    EVT VT = Op.getValueType();
    auto *M = cast<MemSDNode>(Op);

    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
                                   Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_image_bvh_intersect_ray: {
    MemSDNode *M = cast<MemSDNode>(Op);
    SDValue NodePtr = M->getOperand(2);
    SDValue RayExtent = M->getOperand(3);
    SDValue RayOrigin = M->getOperand(4);
    SDValue RayDir = M->getOperand(5);
    SDValue RayInvDir = M->getOperand(6);
    // ...
    const unsigned NumVDataDwords = 4;
    const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
    const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
    // ...
    const unsigned BaseOpcodes[2][2] = {
        {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
        {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
         AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
    int Opcode;
    if (UseNSA) {
      Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                     IsGFX12Plus ? AMDGPU::MIMGEncGfx12
                                     : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
                                                 : AMDGPU::MIMGEncGfx10NSA,
                                     NumVDataDwords, NumVAddrDwords);
    } else {
      Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                     IsGFX11 ? AMDGPU::MIMGEncGfx11Default
                                             : AMDGPU::MIMGEncGfx10Default,
                                     NumVDataDwords, NumVAddrDwords);
    }
    // ...
    auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
      // ...
      if (Lanes[0].getValueSizeInBits() == 32) {
        for (unsigned I = 0; I < 3; ++I)
          Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
      } else if (IsAligned) {
        Ops.push_back(DAG.getBitcast(
            MVT::i32,
            DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
        // ...
      } else {
        // Re-pair across the previously pushed element.
        Ops.push_back(DAG.getBitcast(
            MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
        Ops.push_back(DAG.getBitcast(
            MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
      }
    };

    if (UseNSA && IsGFX11Plus) {
      // ...
      for (unsigned I = 0; I < 3; ++I) {
        MergedLanes.push_back(DAG.getBitcast(
            MVT::i32,
            DAG.getBuildVector(MVT::v2f16, DL, {DirLanes[I], InvDirLanes[I]})));
      }
      // ...
    } else {
      // ...
      packLanes(RayOrigin, true);
      packLanes(RayDir, true);
      packLanes(RayInvDir, false);
      // ...
      if (NumVAddrDwords > 12) {
        // ...
      }
    }
    // ...
  }
  case Intrinsic::amdgcn_global_atomic_fmin:
  case Intrinsic::amdgcn_global_atomic_fmax:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num: {
    MemSDNode *M = cast<MemSDNode>(Op);
    SDValue Ops[] = {M->getOperand(0), M->getOperand(2), M->getOperand(3)};
    unsigned Opcode = 0;
    switch (IntrID) {
    case Intrinsic::amdgcn_global_atomic_fmin:
    case Intrinsic::amdgcn_global_atomic_fmin_num:
    case Intrinsic::amdgcn_flat_atomic_fmin:
    case Intrinsic::amdgcn_flat_atomic_fmin_num: {
      Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN;
      break;
    }
    case Intrinsic::amdgcn_global_atomic_fmax:
    case Intrinsic::amdgcn_global_atomic_fmax_num:
    case Intrinsic::amdgcn_flat_atomic_fmax:
    case Intrinsic::amdgcn_flat_atomic_fmax_num: {
      Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX;
      break;
    }
    default:
      llvm_unreachable("unhandled atomic opcode");
    }
    return DAG.getMemIntrinsicNode(Opcode, SDLoc(Op), M->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_s_get_barrier_state: {
    SDValue Chain = Op->getOperand(0);
    SmallVector<SDValue, 2> Ops;
    unsigned Opc;
    bool IsInlinableBarID = false;
    int64_t BarID;

    if (isa<ConstantSDNode>(Op->getOperand(2))) {
      BarID = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue();
      IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarID);
    }

    if (IsInlinableBarID) {
      Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
      // ...
    } else {
      Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
      // ...
    }
    // ...
  }
  default:
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return lowerImage(Op, ImageDimIntr, DAG, true);
    return SDValue();
  }
}
SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
                                              SDVTList VTList,
                                              ArrayRef<SDValue> Ops, EVT MemVT,
                                              MachineMemOperand *MMO,
                                              SelectionDAG &DAG) const {
  // ...
  bool IsTFE = VTList.NumVTs == 3;
  if (IsTFE) {
    // The TFE status word rides along as one extra result dword.
    unsigned NumOpDWords = NumValueDWords + 1;
    // ...
    SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
                                     OpDWordsVT, OpDWordsMMO, DAG);
    // ...
  }

  if (!Subtarget->hasDwordx3LoadStores() &&
      (VT == MVT::v3i32 || VT == MVT::v3f32)) {
    // Widen a 3-dword result to 4 dwords on targets without x3 loads, then
    // trim the extra element.
    // ...
    SDValue MemNode = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
                                              WidenedMemVT, WidenedMMO);
    // ...
  }
  // ...
}
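// [Editor's illustrative sketch, not part of the original source.] Buffer
// intrinsic results are legalized in whole dwords; the dword count is just a
// round-up, which is why a 96-bit (v3) result gets widened to four dwords on
// targets without x3 loads:
static unsigned numValueDWordsSketch(unsigned SizeInBits) {
  return (SizeInBits + 31) / 32; // e.g. 96 bits -> 3 dwords
}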
SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
                                         bool ImageStore) const {
  // ...
  for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
    // Pack pairs of 16-bit elements into dwords.
    // ...
  }

  if ((NumElements % 2) == 1) {
    // Handle the leftover element after pairing.
    unsigned I = Elts.size() / 2;
    // ...
  }
  // ...
  if (NumElements == 3) {
    // ...
  }
  // ...
}
SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  unsigned IntrinsicID = Op.getConstantOperandVal(1);
  MachineFunction &MF = DAG.getMachineFunction();

  switch (IntrinsicID) {
  case Intrinsic::amdgcn_exp_compr: {
    if (!Subtarget->hasCompressedExport()) {
      DiagnosticInfoUnsupported BadIntrin(
          DAG.getMachineFunction().getFunction(),
          "intrinsic not supported on subtarget", DL.getDebugLoc());
      DAG.getContext()->diagnose(BadIntrin);
    }
    // ...
    unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
    // ...
  }
  case Intrinsic::amdgcn_s_barrier: {
    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
    if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
      unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
      if (WGSize <= ST.getWavefrontSize())
        // A single-wave workgroup does not need the barrier at all.
        return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
                                          Op.getOperand(0)),
                       0);
    }

    // On GFX12+, split s_barrier into a signal and a wait.
    if (ST.hasSplitBarriers()) {
      // ...
      SDValue BarSignal =
          SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
                                     MVT::Other, K, Op.getOperand(0)),
                  0);
      // ...
    }

    return SDValue();
  }
  case Intrinsic::amdgcn_tbuffer_store: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    unsigned Dfmt = Op.getConstantOperandVal(8);
    unsigned Nfmt = Op.getConstantOperandVal(9);
    unsigned Glc = Op.getConstantOperandVal(10);
    unsigned Slc = Op.getConstantOperandVal(11);
    // ...
    MemSDNode *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
    // ...
    MemSDNode *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
    // ...
    MemSDNode *M = cast<MemSDNode>(Op);
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_buffer_store:
  case Intrinsic::amdgcn_buffer_store_format: {
    SDValue VData = Op.getOperand(2);
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    unsigned Glc = Op.getConstantOperandVal(6);
    unsigned Slc = Op.getConstantOperandVal(7);
    // ...
    setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
    // ...
    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store
                       ? AMDGPUISD::BUFFER_STORE
                       : AMDGPUISD::BUFFER_STORE_FORMAT;
    // ...
    if (VDataType == MVT::i8 || VDataType == MVT::i16)
      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);

    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
        IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
    // ...
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    // ...
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
    // ...
    if (!IsFormat && (VDataVT == MVT::i8 || VDataVT == MVT::i16))
      return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);

    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
        IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
    // ...
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    // ...
    auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
    // ...
    if (!IsFormat && (VDataType == MVT::i8 || VDataType == MVT::i16))
      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);

    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
    unsigned Opc;
    bool HasVIndex =
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
        IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
    unsigned OpOffset = HasVIndex ? 1 : 0;
    SDValue VOffset = Op.getOperand(5 + OpOffset);
    bool HasVOffset = !isNullConstant(VOffset);
    unsigned Size = Op->getConstantOperandVal(4);

    switch (Size) {
    default:
      return SDValue();
    case 1:
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
      break;
    case 2:
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
      break;
    case 4:
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
      break;
    }

    SmallVector<SDValue, 8> Ops;
    // ...
    if (HasVIndex && HasVOffset)
      Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL, {VIndex, VOffset}));
    else if (HasVIndex)
      Ops.push_back(VIndex);
    else if (HasVOffset)
      Ops.push_back(VOffset);

    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    Ops.push_back(Rsrc);
    // ...
    unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
    // ...
    auto *M = cast<MemSDNode>(Op);
    // ...
  }
  case Intrinsic::amdgcn_global_load_lds: {
    unsigned Opc;
    unsigned Size = Op->getConstantOperandVal(4);
    switch (Size) {
    default:
      return SDValue();
    case 1:
      Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
      break;
    case 2:
      Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
      break;
    case 4:
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
      break;
    }

    auto *M = cast<MemSDNode>(Op);
    // ...
    // Try to split the address into a scalar base and a vector offset.
    if (Addr.getOpcode() == ISD::ADD) {
      SDValue LHS = Addr.getOperand(0);
      SDValue RHS = Addr.getOperand(1);

      if (LHS->isDivergent())
        std::swap(LHS, RHS);

      if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
          RHS.getOperand(0).getValueType() == MVT::i32) {
        // add (i64 sgpr), (zero_extend (i32 vgpr))
        Addr = LHS;
        VOffset = RHS.getOperand(0);
      }
    }
    // ...
    if (!Addr->isDivergent()) {
      // ...
    }
    // ...
    MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
    LoadPtrI.Offset = Op->getConstantOperandVal(5);
    // ...
  }
  case Intrinsic::amdgcn_end_cf:
    return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
                                      Op->getOperand(2), Chain),
                   0);
  case Intrinsic::amdgcn_s_barrier_init:
  case Intrinsic::amdgcn_s_barrier_join:
  case Intrinsic::amdgcn_s_wakeup_barrier: {
    SDValue Chain = Op->getOperand(0);
    SmallVector<SDValue, 2> Ops;
    SDValue BarOp = Op->getOperand(2);
    unsigned Opc;
    bool IsInlinableBarID = false;
    int64_t BarVal;

    if (isa<ConstantSDNode>(BarOp)) {
      BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue();
      IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarVal);
    }

    if (IsInlinableBarID) {
      switch (IntrinsicID) {
      default:
        return SDValue();
      case Intrinsic::amdgcn_s_barrier_init:
        Opc = AMDGPU::S_BARRIER_INIT_IMM;
        break;
      case Intrinsic::amdgcn_s_barrier_join:
        Opc = AMDGPU::S_BARRIER_JOIN_IMM;
        break;
      case Intrinsic::amdgcn_s_wakeup_barrier:
        Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
        break;
      }
    } else {
      switch (IntrinsicID) {
      default:
        return SDValue();
      case Intrinsic::amdgcn_s_barrier_init:
        Opc = AMDGPU::S_BARRIER_INIT_M0;
        break;
      case Intrinsic::amdgcn_s_barrier_join:
        Opc = AMDGPU::S_BARRIER_JOIN_M0;
        break;
      case Intrinsic::amdgcn_s_wakeup_barrier:
        Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
        break;
      }
    }

    if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) {
      // ...
      if (!IsInlinableBarID) {
        // Combine the barrier id and the member count into M0.
        M0Val = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32,
                                           Op.getOperand(2), M0Val),
                        0);
        // ...
      }
    } else if (!IsInlinableBarID) {
      // Move the barrier id into M0.
      // ...
    }

    auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
    return SDValue(NewMI, 0);
  }
  default:
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
      return lowerImage(Op, ImageDimIntr, DAG, true);
    return Op;
  }
}
// Split a buffer offset into the part that fits the MUBUF immediate-offset
// field and a remainder carried in a register.
std::pair<SDValue, SDValue>
SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
  SDLoc DL(Offset);
  // ...
  SDValue N0 = Offset;
  ConstantSDNode *C1 = nullptr;

  if ((C1 = dyn_cast<ConstantSDNode>(N0)))
    N0 = SDValue();
  else if (DAG.isBaseWithConstantOffset(N0)) {
    C1 = cast<ConstantSDNode>(N0.getOperand(1));
    N0 = N0.getOperand(0);
  }

  if (C1) {
    unsigned ImmOffset = C1->getZExtValue();
    // If the immediate value is too big for the immoffset field, keep only
    // the bits that fit and add the rest ("Overflow") to the voffset
    // register; back off entirely if the rounding would leave a negative
    // value in the register.
    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      Overflow += ImmOffset;
      ImmOffset = 0;
    }
    // ...
    if (Overflow) {
      auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
      if (!N0)
        N0 = OverflowVal;
      else {
        SDValue Ops[] = {N0, OverflowVal};
        N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
      }
    }
  }
  // ...
}
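// [Editor's illustrative sketch, not part of the original source.] The
// immediate/overflow split above as plain integer arithmetic; MaxImm is the
// all-ones immediate-field limit (e.g. 4095):
#include <cstdint>
#include <utility>
static std::pair<uint32_t, uint32_t> splitImmSketch(uint32_t ImmOffset,
                                                    uint32_t MaxImm) {
  uint32_t Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) { // would leave a negative register part
    Overflow += ImmOffset;
    ImmOffset = 0;
  }
  return {ImmOffset, Overflow}; // {immediate field, register add}
}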
// Analyze a combined offset from a buffer intrinsic and split it into the
// voffset/soffset/immoffset operand array pointed to by Offsets.
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                        SelectionDAG &DAG, SDValue *Offsets,
                                        Align Alignment) const {
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  SDLoc DL(CombinedOffset);
  if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
    uint32_t Imm = C->getZExtValue();
    uint32_t SOffset, ImmOffset;
    if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
      // ...
      return;
    }
  }
  if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
    SDValue N0 = CombinedOffset.getOperand(0);
    SDValue N1 = CombinedOffset.getOperand(1);
    uint32_t SOffset, ImmOffset;
    int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
    if (Offset >= 0 &&
        TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
      // ...
      return;
    }
  }
  // ...
}

SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
                                                SelectionDAG &DAG) const {
  if (!MaybePointer.getValueType().isScalarInteger())
    return MaybePointer;
  // ...
}

// Wrap a 64-bit pointer into a v4i32 buffer resource, merging the stride into
// the high half of the pointer.
SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
                                                   SelectionDAG &DAG) const {
  SDLoc Loc(Op);

  SDValue Pointer = Op->getOperand(1);
  SDValue Stride = Op->getOperand(2);
  SDValue NumRecords = Op->getOperand(3);
  SDValue Flags = Op->getOperand(4);

  auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
  SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
  SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);

  std::optional<uint32_t> ConstStride = std::nullopt;
  if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
    ConstStride = ConstNode->getZExtValue();

  SDValue NewHighHalf = Masked;
  if (!ConstStride || *ConstStride != 0) {
    SDValue ShiftedStride;
    if (ConstStride) {
      ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
    } else {
      // ...
    }
    NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
  }

  SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
                             NewHighHalf, NumRecords, Flags);
  // ...
}
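// [Editor's illustrative sketch, not part of the original source.] The second
// resource dword built above keeps only the low 16 pointer bits and carries
// the buffer stride in the upper bits:
#include <cstdint>
static uint32_t rsrcWord1Sketch(uint32_t PtrHigh, uint32_t Stride) {
  return (PtrHigh & 0x0000ffffu) | (Stride << 16);
}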
// Handle 8-bit and 16-bit buffer loads.
SDValue SITargetLowering::handleByteShortBufferLoads(
    SelectionDAG &DAG, EVT LoadVT, SDLoc DL, ArrayRef<SDValue> Ops,
    MachineMemOperand *MMO, bool IsTFE) const {
  // ...
}

// Handle 8-bit and 16-bit buffer stores.
SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
                                                      EVT VDataType, SDLoc DL,
                                                      SDValue Ops[],
                                                      MemSDNode *M) const {
  if (VDataType == MVT::f16)
    Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);

  SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
  Ops[1] = BufferStoreExt;
  // ...
  return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
                                 M->getMemOperand());
}

SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
                                    DAGCombinerInfo &DCI) const {
  // ...
  if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
      // ...
      )
    return SDValue();

  assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
         "unexpected vector extload");
  // ...
  assert(!MemVT.isFloatingPoint() && "unexpected fp extload");
  // ...
  DCI.AddToWorklist(Cvt.getNode());
  // ...
  DCI.AddToWorklist(Cvt.getNode());
  // ...
}

// (Helper) Whether flat-scratch has been initialized for this function.
// ...
  if (Info.isEntryFunction())
    return Info.getUserSGPRInfo().hasFlatScratchInit();
  // ...

SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT MemVT = Load->getMemoryVT();
  // ...
  // Load into 32 bits, then truncate.
  EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
  SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
                                 BasePtr, RealMemVT, MMO);
  // ...
  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
         "Custom lowering for non-i32 vectors hasn't been implemented.");
  // ...
  unsigned AS = Load->getAddressSpace();
  // ...
  if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
    // ...
  }
  // ...
  if (Alignment >= Align(4) && NumElements < 32) {
    // ...
  }
  // ...
  if (NumElements > 4)
    return SplitVectorLoad(Op, DAG);
  // ...
  if (NumElements > 2)
    return SplitVectorLoad(Op, DAG);

  if (NumElements > 4)
    return SplitVectorLoad(Op, DAG);
  // ...
  auto Flags = Load->getMemOperand()->getFlags();
  if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
                                         Load->getAlign(), Flags, &Fast) &&
      // ...
      )
  // ...
  if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                      MemVT, *Load->getMemOperand())) {
    // ...
  }
  // ...
}

SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  // ...
}
SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
                                              SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();

  bool AllowInaccurateRcp =
      Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;

  if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
    // Without !fpmath accuracy information, we can't do more because we don't
    // know whether rcp is accurate enough; f16 is always accurate enough.
    if (!AllowInaccurateRcp && VT != MVT::f16)
      return SDValue();

    if (CLHS->isExactlyValue(1.0)) {
      // 1.0 / x -> rcp(x)
      return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
    }

    if (CLHS->isExactlyValue(-1.0)) {
      // -1.0 / x -> rcp(fneg(x))
      SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
      return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
    }
  }

  // For f16 require afn or arcp; for f32 require afn.
  if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
    return SDValue();

  // Turn into multiply by the reciprocal: x / y -> x * (1.0 / y)
  SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
  return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
}

SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc SL(Op);
  // ...
  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();

  bool AllowInaccurateDiv =
      Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
  if (!AllowInaccurateDiv)
    return SDValue();
  // ...
}

static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
                          EVT VT, SDValue A, SDValue B, SDValue GlueChain,
                          SDNodeFlags Flags) {
  if (GlueChain->getNumValues() <= 1) {
    return DAG.getNode(Opcode, SL, VT, A, B, Flags);
  }
  // ...
  return DAG.getNode(Opcode, SL, VTList,
                     {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
                     Flags);
}

static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
                           EVT VT, SDValue A, SDValue B, SDValue C,
                           SDValue GlueChain, SDNodeFlags Flags) {
  if (GlueChain->getNumValues() <= 1) {
    return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
  }
  // ...
  return DAG.getNode(Opcode, SL, VTList,
                     {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
                     Flags);
}
SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;
  // ...
  // Scale huge denominators towards 1.0: if |den| > 2^96, pre-multiply it by
  // 2^-32 and correct the quotient afterwards.
  const APFloat K0Val(0x1p+96f);
  // ...
  const APFloat K1Val(0x1p-32f);
  // ...
}

// Returns the immediate value for setting the F32 denorm mode when using the
// S_DENORM_MODE instruction.
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
                                    const SIMachineFunctionInfo *Info,
                                    const GCNSubtarget *ST) {
  assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
  uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
  uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
  return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
}

SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  // The selection matcher assumes anything with a chain selects to a
  // mayRaiseFPException machine instruction, so explicitly report nofpexcept
  // for the regular fdiv lowering.
  SDNodeFlags Flags = Op->getFlags();
  Flags.setNoFPExcept(true);
  // ...
  SDValue DenominatorScaled =
      DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
  SDValue NumeratorScaled =
      DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
  // ...
  using namespace AMDGPU::Hwreg;
  const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
  // ...
  const bool HasDynamicDenormals =
      (DenormMode.Input == DenormalMode::Dynamic) ||
      (DenormMode.Output == DenormalMode::Dynamic);

  SDValue SavedDenormMode;

  if (!PreservesDenormals) {
    // ...
    if (HasDynamicDenormals) {
      // Save the current denorm mode so it can be restored afterwards.
      // ...
      SavedDenormMode = SDValue(GetReg, 0);
    }

    const SDValue EnableDenormValue =
        getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
    // ...
    EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
                                      {EnableDenormValue, BitField, Glue});
  }
  // ...
  SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
                             ApproxRcp, One, NegDivScale0, Flags);
  SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
                             ApproxRcp, Fma0, Flags);
  SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
                           Fma1, Fma1, Flags);
  SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
                             NumeratorScaled, Mul, Flags);
  SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul,
                             Fma2, Flags);
  SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
                             NumeratorScaled, Fma3, Flags);

  if (!PreservesDenormals) {
    if (HasDenormModeInst) {
      // Restore via S_DENORM_MODE, chained behind Fma4.
      // ... Fma4.getValue(1), DisableDenormValue ...
    } else {
      assert(HasDynamicDenormals == (bool)SavedDenormMode);
      const SDValue DisableDenormValue =
          HasDynamicDenormals
              ? SavedDenormMode
              : getSPDenormModeValue(FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info,
                                     Subtarget);
      DisableDenorm = DAG.getMachineNode(
          AMDGPU::S_SETREG_B32, SL, MVT::Other,
          {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
    }
  }
  // ...
  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
                             {Fma4, Fma1, Fma3, Scale}, Flags);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS,
                     Flags);
}
SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
    return FastLowered;
  // ...
  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul,
                             DivScale1);
  // ...
  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3,
                             Mul, Scale);
  // ...
}

SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  if (VT == MVT::f32)
    return LowerFDIV32(Op, DAG);

  if (VT == MVT::f64)
    return LowerFDIV64(Op, DAG);

  if (VT == MVT::f16)
    return LowerFDIV16(Op, DAG);

  llvm_unreachable("Unexpected type for fdiv");
}

SDValue SITargetLowering::lowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  // ...
  EVT ResultExpVT = Op->getValueType(1);
  EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
  // ...
}

SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT VT = Store->getMemoryVT();

  if (VT == MVT::i1) {
    return DAG.getTruncStore(
        Store->getChain(), DL,
        DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
        Store->getBasePtr(), MVT::i1, Store->getMemOperand());
  }

  assert(VT.isVector() &&
         Store->getValue().getValueType().getScalarType() == MVT::i32);

  unsigned AS = Store->getAddressSpace();
  // ...
  if (NumElements > 4)
    return SplitVectorStore(Op, DAG);
  // ...
  if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                      VT, *Store->getMemOperand()))
    return expandUnalignedStore(Store, DAG);
  // ...
  if (NumElements > 2)
    return SplitVectorStore(Op, DAG);

  if (NumElements > 4 ||
      // ...
      )
    return SplitVectorStore(Op, DAG);
  // ...
  auto Flags = Store->getMemOperand()->getFlags();
  // ...
}

// ...
  MVT VT = Op.getValueType().getSimpleVT();
  // ...

  EVT VT = Op.getValueType();
  // ...
  switch (Op.getOpcode()) {
  // ...
  }
  // ...

  EVT VT = Op.getValueType();
  // ...

SDValue
SITargetLowering::performUCharToFloatCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  EVT ScalarVT = VT.getScalarType();
  if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue Src = N->getOperand(0);
  EVT SrcVT = Src.getValueType();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors.
  if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
    // ...
    SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
    DCI.AddToWorklist(Cvt.getNode());

    // For the f16 case, fold to a cast to f32 and then cast back to f16.
    if (ScalarVT != MVT::f32) {
      Cvt = DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Cvt,
                        DAG.getTargetConstant(0, DL, MVT::i32));
    }
    return Cvt;
  }

  return SDValue();
}

SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  SDValue MagnitudeOp = N->getOperand(0);
  SDValue SignOp = N->getOperand(1);
  // ...
}
SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
                                               EVT MemVT,
                                               DAGCombinerInfo &DCI) const {
  // ...
  // We only fold the offset into the base pointer when it is legal for the
  // resulting addressing mode.
  AM.HasBaseReg = true;
  AM.BaseOffs = Offset.getSExtValue();
  // ...
  EVT VT = N->getValueType(0);
  // ...
  SDNodeFlags Flags;
  Flags.setNoUnsignedWrap(
      N->getFlags().hasNoUnsignedWrap() &&
      (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
  // ...
}

SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  SDValue Ptr = N->getBasePtr();
  // ...
  // TODO: We could also do this for multiplies.
  if (Ptr.getOpcode() == ISD::SHL) {
    SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
                                          N->getMemoryVT(), DCI);
    if (NewPtr) {
      SmallVector<SDValue, 8> NewOps(N->ops());
      NewOps[PtrIdx] = NewPtr;
      // ...
    }
  }
  return SDValue();
}

static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
  return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
         (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
         (Opc == ISD::XOR && Val == 0);
}
SDValue SITargetLowering::splitBinaryBitConstantOp(
    DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
    const ConstantSDNode *CRHS) const {
  // ...
}

static bool isBoolSGPR(SDValue V) {
  if (V.getValueType() != MVT::i1)
    return false;
  switch (V.getOpcode()) {
  // ...
  }
  return false;
}

// If a constant has all zeroes or all ones within each byte return it.
// Otherwise return 0.
static uint32_t getConstantPermuteMask(uint32_t C) {
  // 0xff for any zero byte in the mask
  uint32_t ZeroByteMask = 0;
  if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
  if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
  if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
  if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
  uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
  if ((NonZeroByteMask & C) != NonZeroByteMask)
    return 0; // Partial bytes selected.
  return C;
}

// Check if a node selects whole bytes from its operand 0, returning the
// equivalent v_perm_b32 selector (0x0c selects a constant-zero byte), or ~0
// if the node cannot be expressed that way.
static uint32_t getPermuteMask(SDValue V) {
  assert(V.getValueSizeInBits() == 32);

  if (V.getNumOperands() != 2)
    return ~0;

  ConstantSDNode *ConstMask = dyn_cast<ConstantSDNode>(V.getOperand(1));
  if (!ConstMask)
    return ~0;

  uint32_t C = ConstMask->getZExtValue();

  switch (V.getOpcode()) {
  default:
    break;
  case ISD::AND:
    if (uint32_t ConstMask = getConstantPermuteMask(C))
      return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
    break;

  case ISD::OR:
    if (uint32_t ConstMask = getConstantPermuteMask(C))
      return (0x03020100 & ~ConstMask) | ConstMask;
    break;

  case ISD::SHL:
    if (C % 8)
      return ~0;
    return uint32_t((0x030201000c0c0c0cull << C) >> 32);

  case ISD::SRL:
    if (C % 8)
      return ~0;
    return uint32_t(0x0c0c0c0c03020100ull >> C);
  }

  return ~0;
}
SDValue SITargetLowering::performAndCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (DCI.isBeforeLegalize())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
  if (VT == MVT::i64 && CRHS) {
    if (SDValue Split =
            splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
      return Split;
  }

  if (CRHS && VT == MVT::i32) {
    // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
    // nb = number of trailing zeroes in mask
    // ...
    if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
      unsigned Shift = CShift->getZExtValue();
      // ...
      unsigned Offset = NB + Shift;
      if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
        SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
                                  LHS->getOperand(0),
                                  // ...
                                  );
        // ...
      }
    }

    // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
    if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
        isa<ConstantSDNode>(LHS.getOperand(2))) {
      uint32_t Sel = getConstantPermuteMask(Mask);
      if (!Sel)
        return SDValue();

      // Select 0xc for all zero bytes.
      Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
      // ...
    }
  }

  // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
  // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
  // ...
  if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
      // ...
      )
    return SDValue();
  // ...
  if (X != LHS.getOperand(1))
    return SDValue();
  // ...

  // and (fp_class x, mask1), (fp_class x, mask2) -> fp_class x, (mask1 & mask2)
  if (// ...
      (RHS.getOperand(0) == LHS.getOperand(0) &&
       LHS.getOperand(0) == LHS.getOperand(1))) {
    // Keep or drop the ordered-NaN bits depending on the comparison kind.
    unsigned NewMask = IsUnordered ? Mask->getZExtValue() & ~OrdMask
                                   : Mask->getZExtValue() & OrdMask;
    // ...
  }

  // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
  if (VT == MVT::i32 &&
      // ...
      N->isDivergent() &&
      TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
    uint32_t LHSMask = getPermuteMask(LHS);
    uint32_t RHSMask = getPermuteMask(RHS);
    if (LHSMask != ~0u && RHSMask != ~0u) {
      // Canonicalize the expression in an attempt to have fewer unique masks.
      if (LHSMask > RHSMask) {
        std::swap(LHSMask, RHSMask);
        std::swap(LHS, RHS);
      }

      // 0x0c marks each lane used from a source operand.
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;

      // Check whether we need to combine values from two sources in a byte.
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we select high and low words, keep it for SDWA.
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // ...
        for (unsigned I = 0; I < 32; I += 8) {
          // ...
          if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
            Mask &= (0x0c << I) & 0xffffffff;
        }
        // ...
        return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
                           RHS.getOperand(0),
                           DAG.getConstant(Sel, DL, MVT::i32));
      }
    }
  }
  // ...
}
static const std::optional<ByteProvider<SDValue>>
calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
                 unsigned Depth = 0) {
  // We may need to recursively traverse a series of SRLs.
  if (Depth >= 6)
    return std::nullopt;

  if (Op.getValueSizeInBits() < 8)
    return std::nullopt;

  if (Op.getValueType().isVector())
    return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);

  switch (Op->getOpcode()) {
  case ISD::TRUNCATE: {
    return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
  }

  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND_INREG: {
    SDValue NarrowOp = Op->getOperand(0);
    auto NarrowVT = NarrowOp.getValueType();
    if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
      auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
      NarrowVT = VTSign->getVT();
    }
    if (!NarrowVT.isByteSized())
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowVT.getStoreSize();

    if (SrcIndex >= NarrowByteWidth)
      return std::nullopt;
    return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
  }

  case ISD::SRA:
  case ISD::SRL: {
    auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    if (!ShiftOp)
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getZExtValue();

    if (BitShift % 8 != 0)
      return std::nullopt;

    SrcIndex += BitShift / 8;

    return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
  }

  default: {
    return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
  }
  }
}
// For a byte position in the result of an OR, returns the source that
// provides it (the operand and the byte in that operand), or std::nullopt if
// it cannot be determined.
static const std::optional<ByteProvider<SDValue>>
calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
                      unsigned StartingIndex = 0) {
  // We may need to recursively traverse a series of SRLs.
  if (Depth >= 6)
    return std::nullopt;

  unsigned BitWidth = Op.getScalarValueSizeInBits();
  if (BitWidth % 8 != 0)
    return std::nullopt;
  if (Index > BitWidth / 8 - 1)
    return std::nullopt;

  bool IsVec = Op.getValueType().isVector();
  switch (Op.getOpcode()) {
  case ISD::OR: {
    if (IsVec)
      return std::nullopt;

    auto LHS =
        calculateByteProvider(Op.getOperand(0), Index, Depth + 1, StartingIndex);
    if (!LHS)
      return std::nullopt;
    auto RHS =
        calculateByteProvider(Op.getOperand(1), Index, Depth + 1, StartingIndex);
    if (!RHS)
      return std::nullopt;

    // A well-formed OR has at most one side providing each byte.
    if (!LHS->isConstantZero() && !RHS->isConstantZero())
      return std::nullopt;
    if (!LHS || LHS->isConstantZero())
      return RHS;
    if (!RHS || RHS->isConstantZero())
      return LHS;
    return std::nullopt;
  }

  case ISD::AND: {
    if (IsVec)
      return std::nullopt;

    auto BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    if (!BitMaskOp)
      return std::nullopt;

    uint32_t BitMask = BitMaskOp->getZExtValue();
    // Bits we expect for our StartingIndex.
    uint32_t IndexMask = 0xFF << (Index * 8);

    if ((IndexMask & BitMask) != IndexMask) {
      // If the result of the AND partially provides the byte, then it is not
      // well formatted.
      if (IndexMask & BitMask)
        return std::nullopt;
      return ByteProvider<SDValue>::getConstantZero();
    }

    return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
  }

  case ISD::FSHR: {
    if (IsVec)
      return std::nullopt;

    auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
    if (!ShiftOp || Op.getValueType().isVector())
      return std::nullopt;

    uint64_t BitsProvided = Op.getValueSizeInBits();
    if (BitsProvided % 8 != 0)
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
    if (BitShift % 8)
      return std::nullopt;

    uint64_t ConcatSizeInBytes = BitsProvided / 4;
    uint64_t ByteShift = BitShift / 8;

    uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
    uint64_t BytesProvided = BitsProvided / 8;
    SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
    NewIndex %= BytesProvided;
    return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
  }

  case ISD::SRA:
  case ISD::SRL: {
    if (IsVec)
      return std::nullopt;

    auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    if (!ShiftOp)
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getZExtValue();
    if (BitShift % 8)
      return std::nullopt;

    auto BitsProvided = Op.getScalarValueSizeInBits();
    if (BitsProvided % 8 != 0)
      return std::nullopt;

    uint64_t BytesProvided = BitsProvided / 8;
    uint64_t ByteShift = BitShift / 8;
    // If the byte we are trying to provide falls in the good range of the
    // shift result, the source byte of interest is Index + ByteShift;
    // otherwise the shift provides zero.
    return BytesProvided - ByteShift > Index
               ? calculateSrcByte(Op->getOperand(0), StartingIndex,
                                  Index + ByteShift)
               : ByteProvider<SDValue>::getConstantZero();
  }

  case ISD::SHL: {
    if (IsVec)
      return std::nullopt;

    auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    if (!ShiftOp)
      return std::nullopt;

    uint64_t BitShift = ShiftOp->getZExtValue();
    if (BitShift % 8 != 0)
      return std::nullopt;
    uint64_t ByteShift = BitShift / 8;

    // Shifting by at least the index provides zeros; otherwise the byte of
    // interest is Index - ByteShift of the source.
    return Index < ByteShift
               ? ByteProvider<SDValue>::getConstantZero()
               : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
                                       Depth + 1, StartingIndex);
  }
  case ISD::ANY_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND_INREG: {
    if (IsVec)
      return std::nullopt;

    SDValue NarrowOp = Op->getOperand(0);
    unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
    if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
      auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
      NarrowBitWidth = VTSign->getVT().getSizeInBits();
    }
    if (NarrowBitWidth % 8 != 0)
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowBitWidth / 8;

    if (Index >= NarrowByteWidth)
      return Op.getOpcode() == ISD::ZERO_EXTEND
                 ? std::optional<ByteProvider<SDValue>>(
                       ByteProvider<SDValue>::getConstantZero())
                 : std::nullopt;
    return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
  }

  case ISD::TRUNCATE: {
    if (IsVec)
      return std::nullopt;

    uint64_t NarrowByteWidth = BitWidth / 8;

    if (NarrowByteWidth >= Index) {
      return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
                                   StartingIndex);
    }

    return std::nullopt;
  }

  case ISD::CopyFromReg: {
    if (BitWidth / 8 > Index)
      return calculateSrcByte(Op, StartingIndex, Index);

    return std::nullopt;
  }

  case ISD::LOAD: {
    auto L = cast<LoadSDNode>(Op.getNode());

    unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
    if (NarrowBitWidth % 8 != 0)
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowBitWidth / 8;

    // If the width of the load does not reach the byte we are trying to
    // provide and it is not a ZEXTLOAD, the load does not provide this byte.
    if (Index >= NarrowByteWidth) {
      return L->getExtensionType() == ISD::ZEXTLOAD
                 ? std::optional<ByteProvider<SDValue>>(
                       ByteProvider<SDValue>::getConstantZero())
                 : std::nullopt;
    }

    if (NarrowByteWidth > Index) {
      return calculateSrcByte(Op, StartingIndex, Index);
    }

    return std::nullopt;
  }

  case ISD::BSWAP: {
    if (IsVec)
      return std::nullopt;

    return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
                                 Depth + 1, StartingIndex);
  }

  case ISD::EXTRACT_VECTOR_ELT: {
    auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
    if (!IdxOp)
      return std::nullopt;
    auto VecIdx = IdxOp->getZExtValue();
    auto ScalarSize = Op.getScalarValueSizeInBits();
    if (ScalarSize != 32) {
      Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
    }

    return calculateSrcByte(ScalarSize == 32 ? Op : Op.getOperand(0),
                            StartingIndex, Index);
  }

  case AMDGPUISD::PERM: {
    if (IsVec)
      return std::nullopt;

    auto PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
    if (!PermMask)
      return std::nullopt;

    auto IdxMask =
        (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
    if (IdxMask > 0x07 && IdxMask != 0x0c)
      return std::nullopt;

    auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
    auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;

    return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
                           : ByteProvider<SDValue>(
                                 ByteProvider<SDValue>::getConstantZero());
  }

  default: {
    return std::nullopt;
  }
  }
}
static bool isExtendedFrom16Bits(SDValue &Operand) {
  switch (Operand.getOpcode()) {
  case ISD::ANY_EXTEND:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND: {
    auto OpVT = Operand.getOperand(0).getValueType();
    return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
  }
  case ISD::LOAD: {
    LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
    auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
    if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
        ExtType == ISD::EXTLOAD) {
      auto MemVT = L->getMemoryVT();
      return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
    }
    return L->getMemoryVT().getSizeInBits() == 16;
  }
  default:
    return false;
  }
}

// Returns true if the mask matches consecutive bytes and the first byte
// begins at an even byte offset.
static bool addresses16Bits(int Mask) {
  int Low8 = Mask & 0xff;
  int Hi8 = (Mask & 0xff00) >> 8;

  assert(Low8 < 8 && Hi8 < 8);
  // Are the bytes contiguous in order of increasing addresses?
  bool IsConsecutive = (Hi8 - Low8 == 1);
  // Is the first byte at a location aligned for 16-bit instructions?
  bool Is16Aligned = !(Low8 % 2);

  return IsConsecutive && Is16Aligned;
}
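// [Editor's note, not part of the original source.] Worked examples of the
// predicate above:
//   addresses16Bits(0x0100) -> true   (bytes 0,1 of the low source)
//   addresses16Bits(0x0504) -> true   (bytes 0,1 of the high source)
//   addresses16Bits(0x0201) -> false  (consecutive, but starts on an odd byte)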
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
                                SDValue &OtherOp) {
  int Low16 = PermMask & 0xffff;
  int Hi16 = (PermMask & 0xffff0000) >> 16;
  // ...
  auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
                        isExtendedFrom16Bits(TempOtherOp);
  if (!OtherOpIs16Bit)
    return true;
  // ...
}

// Pull out the dword at DWordOffset (counted in 32-bit units) of Src.
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
                                  unsigned DWordOffset) {
  SDValue Ret;

  auto TypeSize = Src.getValueSizeInBits().getFixedValue();
  // The ByteProvider must be at least 8 bits.
  assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
  // ...
  if (Src.getValueType().isVector()) {
    auto ScalarTySize = Src.getScalarValueSizeInBits();
    auto ScalarTy = Src.getValueType().getScalarType();
    if (ScalarTySize == 32) {
      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
                         DAG.getConstant(DWordOffset, SL, MVT::i32));
    }
    if (ScalarTySize > 32) {
      Ret = DAG.getNode(
          ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
          DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
      auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
      if (ShiftVal)
        Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
                          DAG.getConstant(ShiftVal, SL, MVT::i32));
      return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
    }

    assert(ScalarTySize < 32);
    auto NumElements = TypeSize / ScalarTySize;
    auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
    auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
    auto NumElementsIn32 = 32 / ScalarTySize;
    auto NumAvailElements = DWordOffset < Trunc32Elements
                                ? NumElementsIn32
                                : NumElements - NormalizedTrunc;
    // ...
  }

  // Scalar source: shift the wanted dword down and truncate.
  auto ShiftVal = 32 * DWordOffset;
  Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
                    DAG.getConstant(ShiftVal, SL, MVT::i32));
  return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
}
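// [Editor's illustrative sketch, not part of the original source.] For scalar
// sources, the helper above amounts to shifting the requested dword down and
// truncating:
#include <cstdint>
static uint32_t dwordFromOffsetSketch(uint64_t Src, unsigned DWordOffset) {
  return (uint32_t)(Src >> (32 * DWordOffset)); // dword 0 = low 32 bits
}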
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  [[maybe_unused]] EVT VT = N->getValueType(0);
  SmallVector<ByteProvider<SDValue>, 8> PermNodes;

  // VT is known to be MVT::i32, so we need to provide 4 bytes.
  assert(VT == MVT::i32);
  for (int i = 0; i < 4; i++) {
    // Find the ByteProvider that provides the ith byte of the result of OR.
    std::optional<ByteProvider<SDValue>> P =
        calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
    // TODO: support constantZero
    if (!P || P->isConstantZero())
      return SDValue();

    PermNodes.push_back(*P);
  }
  if (PermNodes.size() != 4)
    return SDValue();

  std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
  std::optional<std::pair<unsigned, unsigned>> SecondSrc;
  uint64_t PermMask = 0x00000000;
  for (size_t i = 0; i < PermNodes.size(); i++) {
    auto PermOp = PermNodes[i];
    // Since the mask is applied to Src1:Src2, Src1 bytes must be offset by
    // sizeof(Src2) = 4.
    int SrcByteAdjust = 4;

    // If the source uses a byte from a different dword, it belongs to the
    // second source.
    if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
        ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
      if (SecondSrc)
        if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
            ((PermOp.SrcOffset / 4) != SecondSrc->second))
          return SDValue();

      // Set the index of the second distinct source node.
      SecondSrc = {i, PermNodes[i].SrcOffset / 4};
      assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
      SrcByteAdjust = 0;
    }
    assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
    PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
  }
  SDLoc DL(N);
  SDValue Op = *PermNodes[FirstSrc.first].Src;
  Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
  assert(Op.getValueSizeInBits() == 32);

  // Check that we are not just extracting the bytes in order from one source.
  if (!SecondSrc) {
    int Low16 = PermMask & 0xffff;
    int Hi16 = (PermMask & 0xffff0000) >> 16;

    bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
    bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);

    // The perm op would really just produce Op, so combine into Op.
    if (WellFormedLow && WellFormedHi)
      return DAG.getBitcast(MVT::getIntegerVT(32), Op);
  }

  SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
  // ...
  assert(Op.getValueType().isByteSized() &&
         OtherOp.getValueType().isByteSized());
  // ...
}
SDValue SITargetLowering::performOrCombine(SDNode *N,
                                           DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  EVT VT = N->getValueType(0);
  if (VT == MVT::i1) {
    // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
    // ...
    SDValue Src = LHS.getOperand(0);
    if (Src != RHS.getOperand(0))
      return SDValue();

    const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
    if (!CLHS || !CRHS)
      return SDValue();

    // Only 10 bits are used.
    static const uint32_t MaxMask = 0x3ff;
    // ...
  }

  // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
  if (// ...
      isa<ConstantSDNode>(LHS.getOperand(2))) {
    uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
    if (!Sel)
      return SDValue();

    Sel |= LHS.getConstantOperandVal(2);
    // ...
  }

  // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
      N->isDivergent() &&
      TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {

    // If all uses of the OR only extract individual elements, do not lower
    // into v_perm.
    auto usesCombinedOperand = [](SDNode *OrUse) {
      // A non-vectorized use is a candidate for v_perm.
      if (OrUse->getOpcode() != ISD::BITCAST ||
          !OrUse->getValueType(0).isVector())
        return true;

      for (auto VUse : OrUse->uses()) {
        if (!VUse->getValueType(0).isVector())
          return true;

        // If the use of a vector is a store, combining via v_perm is
        // beneficial. TODO: whitelist more uses.
        for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
          if (VUse->getOpcode() == VectorwiseOp)
            return true;
      }
      return false;
    };

    if (!any_of(N->uses(), usesCombinedOperand))
      return SDValue();

    uint32_t LHSMask = getPermuteMask(LHS);
    uint32_t RHSMask = getPermuteMask(RHS);

    if (LHSMask != ~0u && RHSMask != ~0u) {
      // Canonicalize the expression in an attempt to have fewer unique masks.
      if (LHSMask > RHSMask) {
        std::swap(LHSMask, RHSMask);
        std::swap(LHS, RHS);
      }

      // 0x0c marks each lane used from a source operand.
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;

      // Check whether we need to combine values from two sources in a byte.
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we select high and low words, keep it for SDWA.
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // Kill zero bytes selected by the other mask; zero value is 0x0c.
        LHSMask &= ~RHSUsedLanes;
        RHSMask &= ~LHSUsedLanes;
        // Add 4 to each active LHS lane.
        LHSMask |= LHSUsedLanes & 0x04040404;
        // Combine masks.
        uint32_t Sel = LHSMask | RHSMask;
        SDLoc DL(N);

        return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
                           RHS.getOperand(0),
                           DAG.getConstant(Sel, DL, MVT::i32));
      }
    }
    if (LHSMask == ~0u || RHSMask == ~0u) {
      if (SDValue Perm = matchPERM(N, DCI))
        return Perm;
    }
  }

  if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
    return SDValue();

  // or x, (zero_extend y) -> build the halves separately.
  // ...
  if (SrcVT == MVT::i32) {
    SDLoc SL(N);
    // ...
    DCI.AddToWorklist(LowOr.getNode());
    DCI.AddToWorklist(HiBits.getNode());
    // ...
  }

  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (CRHS) {
    if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
                                                 N->getOperand(0), CRHS))
      return Split;
  }

  return SDValue();
}
SDValue SITargetLowering::performXorCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
    return RV;

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
  SelectionDAG &DAG = DCI.DAG;

  EVT VT = N->getValueType(0);
  if (CRHS && VT == MVT::i64) {
    if (SDValue Split =
            splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
      return Split;
  }

  // Push the fneg produced by a sign-bit xor into a select's operands.
  // ...
    return DAG.getNode(ISD::SELECT, SDLoc(N), VT, LHS->getOperand(0), FNegLHS,
                       FNegRHS);
  // ...
  return SDValue();
}

SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
                                                   DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if (VT != MVT::i32)
    return SDValue();

  SDValue Src = N->getOperand(0);
  if (Src.getValueType() != MVT::i16)
    return SDValue();

  return SDValue();
}
SDValue
SITargetLowering::performSignExtendInRegCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SDValue Src = N->getOperand(0);
  auto *VTSign = cast<VTSDNode>(N->getOperand(1));

  // Fold sext_inreg of an s_buffer_load into the signed load variant.
  if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
        VTSign->getVT() == MVT::i8) ||
       (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
        VTSign->getVT() == MVT::i16))) {
    assert(Subtarget->hasScalarSubwordLoads() &&
           "s_buffer_load_{u8, i8} are supported "
           "in GFX12 (or newer) architectures.");
    EVT VT = Src.getValueType();
    unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
                       ? AMDGPUISD::SBUFFER_LOAD_BYTE
                       : AMDGPUISD::SBUFFER_LOAD_SHORT;
    SDLoc DL(N);
    SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
    SDValue Ops[] = {Src.getOperand(0), Src.getOperand(1), Src.getOperand(2)};
    auto *M = cast<MemSDNode>(Src);
    SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
        Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
    return DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
  }

  // Likewise for VMEM buffer loads.
  if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
        VTSign->getVT() == MVT::i8) ||
       (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
        VTSign->getVT() == MVT::i16)) &&
      Src.hasOneUse()) {
    auto *M = cast<MemSDNode>(Src);
    // ...
    unsigned Opc = Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE
                       ? AMDGPUISD::BUFFER_LOAD_BYTE
                       : AMDGPUISD::BUFFER_LOAD_SHORT;
    SDVTList ResList =
        DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
    SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
        Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
    return DCI.DAG.getMergeValues({BufferLoadSignExt,
                                   BufferLoadSignExt.getValue(1)},
                                  SDLoc(N));
  }
  return SDValue();
}
SDValue SITargetLowering::performClassCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  // ...
  if (N->getOperand(0).isUndef())
    return DCI.DAG.getUNDEF(MVT::i1);
  // ...
}

SDValue SITargetLowering::performRcpCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);

  if (N0.isUndef()) {
    // rcp(undef) -> qnan
    return DCI.DAG.getConstantFP(
        APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT)), SDLoc(N),
        VT);
  }
  // ...
}
bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
                                       unsigned MaxDepth) const {
  unsigned Opcode = Op.getOpcode();
  // ...
  if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
    const auto &F = CFP->getValueAPF();
    if (F.isNaN() && F.isSignaling())
      return false;
    if (!F.isDenormal())
      return true;

    DenormalMode Mode =
        DAG.getMachineFunction().getDenormalMode(F.getSemantics());
    return Mode == DenormalMode::getIEEE();
  }
  // ...
  switch (Opcode) {
  // ...
  case ISD::BITCAST: {
    // Hack around the mess we make when legalizing extract_vector_elt.
    if (Op.getValueType() == MVT::i32) {
      // ...
      if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
        if (RHS->getZExtValue() == 0xffff0000) {
          return isCanonicalized(DAG, Op.getOperand(0).getOperand(0),
                                 MaxDepth - 1);
        }
      }
    }
    return false;
  }
  // ...
  case ISD::FP_ROUND:
    return Op.getValueType().getScalarType() != MVT::f16 ||
           // ...
  // ...
  case ISD::FP16_TO_FP:
  case ISD::FP_TO_FP16: {
    if (Op.getValueType() == MVT::i16) {
      // ...
    }
    return false;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID = Op.getConstantOperandVal(0);
    // TODO: Handle more intrinsics.
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_sqrt:
      return true;
    default:
      break;
    }
    // ...
  }
  // ...
  }
  // ...
}
bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
                                       unsigned MaxDepth) const {
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineInstr *MI = MRI.getVRegDef(Reg);
  unsigned Opcode = MI->getOpcode();

  if (Opcode == AMDGPU::G_FCANONICALIZE)
    return true;

  std::optional<FPValueAndVReg> FCR;
  // Constant splat (can be padded with undef) or scalar constant.
  if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
    if (FCR->Value.isSignaling())
      return false;
    if (!FCR->Value.isDenormal())
      return true;

    DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
    return Mode == DenormalMode::getIEEE();
  }

  if (MaxDepth == 0)
    return false;

  switch (Opcode) {
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FCEIL:
  case AMDGPU::G_FFLOOR:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FSQRT:
  case AMDGPU::G_FDIV:
  case AMDGPU::G_FREM:
  case AMDGPU::G_FPOW:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_FLOG:
  case AMDGPU::G_FLOG2:
  case AMDGPU::G_FLOG10:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
    return true;
  case AMDGPU::G_FNEG:
  case AMDGPU::G_FABS:
  case AMDGPU::G_FCOPYSIGN:
    return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM: {
    // ...
  }
  case AMDGPU::G_BUILD_VECTOR:
    for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
      if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
        return false;
    return true;
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT:
    switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmad_ftz:
    case Intrinsic::amdgcn_sqrt:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_cos:
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_div_scale:
    case Intrinsic::amdgcn_div_fmas:
    case Intrinsic::amdgcn_div_fixup:
    case Intrinsic::amdgcn_fract:
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cubema:
    case Intrinsic::amdgcn_cubesc:
    case Intrinsic::amdgcn_cubetc:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_trig_preop:
      return true;
    default:
      break;
    }
    // ...
  default:
    return false;
  }
}
SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
                                                 const SDLoc &SL, EVT VT,
                                                 const APFloat &C) const {
  // Flush denormals to 0 if not enabled.
  if (C.isDenormal()) {
    // ...
  }

  if (C.isSignaling()) {
    // Quiet a signaling NaN.
    // ...
  }

  return DAG.getConstantFP(C, SL, VT);
}

static bool vectorEltWillFoldAway(SDValue Op) {
  return Op.isUndef() || isa<ConstantFPSDNode>(Op);
}

SDValue
SITargetLowering::performFCanonicalizeCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  // ...
  if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
    EVT VT = N->getValueType(0);
    return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
  }
  // ...
  // fcanonicalize (build_vector x, k) ->
  //   build_vector (fcanonicalize x), (fcanonicalize k)
  if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16) {
    SDLoc SL(N);
    SDValue NewElts[2];
    SDValue Lo = N0.getOperand(0);
    SDValue Hi = N0.getOperand(1);
    EVT EltVT = Lo.getValueType();

    if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
      for (unsigned I = 0; I != 2; ++I) {
        SDValue Op = N0.getOperand(I);
        if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
          NewElts[I] =
              getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
        } else if (Op.isUndef()) {
          // Handled below based on what the other operand is.
          NewElts[I] = Op;
        } else {
          NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
        }
      }

      // If one half is undef and the other is constant, prefer a splat
      // vector rather than the normal qNaN; for a register, prefer 0.0.
      if (NewElts[0].isUndef()) {
        if (isa<ConstantFPSDNode>(NewElts[1]))
          NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
                           ? NewElts[1]
                           : DAG.getConstantFP(0.0f, SL, EltVT);
        // ...
      }

      if (NewElts[1].isUndef()) {
        NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
                         ? NewElts[0]
                         : DAG.getConstantFP(0.0f, SL, EltVT);
      }

      return DAG.getBuildVector(VT, SL, NewElts);
    }
  }
  // ...
}
SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
                                                   const SDLoc &SL,
                                                   SDValue Src, SDValue MinVal,
                                                   SDValue MaxVal,
                                                   bool Signed) const {
  // ...
  if (!MinK || !MaxK)
    return SDValue();
  // ...
  unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
  EVT VT = MinVal.getValueType();
  if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
    return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
  // ...
  return SDValue();
}

// ...

SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
                                                  const SDLoc &SL, SDValue Op0,
                                                  SDValue Op1) const {
  // ...
  // Ordered >= (although NaN inputs should have folded away by now).
  if (Info->getMode().DX10Clamp) {
    // With dx10_clamp, NaNs clamp to 0.0, matching hardware fmed3 behavior.
    // ...
  }

  // med3 for f16 is only available on gfx9+, and not available for v2f16.
  if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
    // ...
  }

  return SDValue();
}

SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  EVT VT = N->getValueType(0);
  unsigned Opc = N->getOpcode();
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // min3/max3 folds: only when the inner op has one use, and for types with
  // min3/max3 instructions.
  if (// ...
      (VT == MVT::i32 || VT == MVT::f32 ||
       ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) {
    // max(max(a, b), c) -> max3(a, b, c) / min(min(a, b), c) -> min3(a, b, c)
    if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), SDLoc(N),
                         N->getValueType(0), Op0.getOperand(0),
                         Op0.getOperand(1), Op1);
    }

    // Try the commuted form.
    if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), SDLoc(N),
                         N->getValueType(0), Op0, Op1.getOperand(0),
                         Op1.getOperand(1));
    }
  }

  // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
  // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
  if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(
            DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
      return Med3;
  }
  if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(
            DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
      return Med3;
  }
  if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(
            DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
      return Med3;
  }
  if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(
            DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
      return Med3;
  }

  // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
  if (// ...
      (VT == MVT::f32 || VT == MVT::f64 ||
       // ...
       ) &&
      Op0.hasOneUse()) {
    if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
      return Res;
  }

  return SDValue();
}

static bool isClampZeroToOne(SDValue A, SDValue B) {
  if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
    if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
      // FIXME: Should this be allowing -0.0?
      return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
             (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
    }
  }
  return false;
}
SDValue SITargetLowering::performFMed3Combine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  // ...
  // Only do this if NaNs won't be an issue: with dx10_clamp enabled, a med3
  // with a NaN input acts like fclamp.
  if (Info->getMode().DX10Clamp) {
    // Canonicalize constants to the RHS.
    if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
      std::swap(Src0, Src1);
    if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
      std::swap(Src1, Src2);
    if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
      std::swap(Src0, Src1);
    // ...
  }

  return SDValue();
}

SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  // ...
  // cvt_pkrtz(undef, undef) -> undef
  if (Src0.isUndef() && Src1.isUndef())
    return DCI.DAG.getUNDEF(N->getValueType(0));
  // ...
}
bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
                                                unsigned NumElem,
                                                bool IsDivergentIdx,
                                                const GCNSubtarget *Subtarget) {
  // ...
  unsigned VecSize = EltSize * NumElem;

  // Sub-dword vectors of size 2 dwords or less have a better implementation.
  if (VecSize <= 64 && EltSize < 32)
    return false;
  // ...
  // Always do this if the variable index is divergent; otherwise it would
  // become a waterfall loop.
  if (IsDivergentIdx)
    return true;

  // Large vectors would yield too many compares and v_cndmask_b32
  // instructions: one compare plus one cndmask per 32-bit chunk per element.
  unsigned NumInsts = NumElem +                      // compares
                      ((EltSize + 31) / 32) * NumElem; // cndmasks
  // ...
  return NumInsts <= 16;
  // ...
  return NumInsts <= 15;
}

bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
  SDValue Idx = N->getOperand(N->getNumOperands() - 1);
  if (isa<ConstantSDNode>(Idx))
    return false;
  // ...
}
SDValue
SITargetLowering::performExtractVectorEltCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  SDValue Vec = N->getOperand(0);
  SelectionDAG &DAG = DCI.DAG;

  EVT VecVT = Vec.getValueType();
  EVT VecEltVT = VecVT.getVectorElementType();
  EVT ResVT = N->getValueType(0);
  // ...
  if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
    // ...
    DCI.AddToWorklist(Elt0.getNode());
    DCI.AddToWorklist(Elt1.getNode());
    // ...
  }
  // ...
  if (!DCI.isBeforeLegalize())
    return SDValue();

  // Try to turn sub-dword accesses of vectors into accesses of the same
  // 32-bit elements.
  auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
      VecSize > 32 && VecSize % 32 == 0 && Idx) {
    EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);

    unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
    unsigned EltIdx = BitIndex / 32;
    unsigned LeftoverBitIdx = BitIndex % 32;
    SDLoc SL(N);

    SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
    DCI.AddToWorklist(Cast.getNode());

    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
                              DAG.getConstant(EltIdx, SL, MVT::i32));
    DCI.AddToWorklist(Elt.getNode());
    SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
                              DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
    DCI.AddToWorklist(Srl.getNode());

    EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
    DCI.AddToWorklist(Trunc.getNode());

    if (VecEltVT == ResVT) {
      return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
    }
    // ...
  }

  return SDValue();
}
SDValue
SITargetLowering::performInsertVectorEltCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SDValue Vec = N->getOperand(0);
  SDValue Idx = N->getOperand(2);
  EVT VecVT = Vec.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  // ...
  SDLoc SL(N);
  EVT IdxVT = Idx.getValueType();
  // ...
}

static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
  // ...
  if (Src.getOpcode() == ISD::FP_EXTEND &&
      Src.getOperand(0).getValueType() == MVT::f16) {
    return Src.getOperand(0);
  }

  if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
    APFloat Val = CFP->getValueAPF();
    bool LosesInfo = true;
    Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
    if (!LosesInfo)
      return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
  }

  return SDValue();
}

SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
         "combine only useful on gfx8");

  SDValue TruncSrc = N->getOperand(0);
  EVT VT = N->getValueType(0);
  if (VT != MVT::f16)
    return SDValue();
  // ...
}
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
                                          const SDNode *N0,
                                          const SDNode *N1) const {
  EVT VT = N0->getValueType(0);

  // Only do this if we are not trying to support denormals: v_mad_f32 does
  // not support denormals ever.
  if (((VT == MVT::f32 &&
        denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
       (VT == MVT::f16 && Subtarget->hasMadF16() &&
        denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
      isOperationLegal(ISD::FMAD, VT))
    return ISD::FMAD;
  // ...
  return 0;
}

// For a reassociatable opcode perform:
//   op x, (op y, z) -> op (op x, z), y, if x and z are uniform
SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
                                               SelectionDAG &DAG) const {
  EVT VT = N->getValueType(0);
  if (VT != MVT::i32 && VT != MVT::i64)
    return SDValue();
  // ...
  unsigned Opc = N->getOpcode();
  // ...
  return DAG.getNode(Opc, SL, VT, Add1, Op2);
}
// tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const:
EVT VT = N->getValueType(0);
// ...
if (!N->isDivergent() && Subtarget->hasSMulHi())
// ...
if (NumBits <= 32 || NumBits > 64)
// ...
unsigned NumUsers = 0;
// ...
bool MulSignedLo = false;
if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
// ...
if (VT != MVT::i64) {
// ...
    getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
// ...
if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
  // ...
  std::tie(AccumLo, AccumHi) = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
  // ...
  if (!MulLHSUnsigned32) {
  // ...
  if (!MulRHSUnsigned32) {
// ...
if (VT != MVT::i64)
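The fold above turns a 64-bit multiply-add into a mad64_32 when both multiplicands are known to have only 32 significant bits, patching up the high half otherwise. A numeric model of the core identity (plain C++; names are illustrative):

#include <cassert>
#include <cstdint>

// When both operands fit in 32 bits, a 64-bit multiply-add is exactly one
// 32x32->64 multiply plus a 64-bit accumulate (v_mad_u64_u32 semantics).
static uint64_t madU64U32(uint32_t A, uint32_t B, uint64_t C) {
  return static_cast<uint64_t>(A) * B + C;
}

int main() {
  uint64_t X = 0x12345678, Y = 0x9ABCDEF0, Z = 0x1122334455667788ull;
  // The combine fires only because the i64 operands are known to have at
  // most 32 significant bits, so the narrowing casts below lose nothing.
  assert(madU64U32(static_cast<uint32_t>(X), static_cast<uint32_t>(Y), Z) ==
         X * Y + Z);
  return 0;
}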
static std::optional<ByteProvider<SDValue>>
handleMulOperand(const SDValue &MulOperand) {
  // ...
  if (!Byte0 || Byte0->isConstantZero()) {
    return std::nullopt;
  // ...
  if (Byte1 && !Byte1->isConstantZero()) {
    return std::nullopt;

// addPermMasks(unsigned First, unsigned Second) — merge two v_perm masks;
// byte value 0x0c selects a constant zero:
unsigned FirstCs = First & 0x0c0c0c0c;
unsigned SecondCs = Second & 0x0c0c0c0c;
unsigned FirstNoCs = First & ~0x0c0c0c0c;
unsigned SecondNoCs = Second & ~0x0c0c0c0c;
// Each byte must be a constant zero in at least one of the two masks:
assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
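addPermMasks merges two byte-select masks for v_perm_b32: 0x0c marks an output byte as constant zero, and the asserts guarantee the two masks never both claim the same byte, so OR-ing the live selectors is lossless. The same logic as a runnable standalone function, modeled directly on the fragment above:

#include <cassert>
#include <cstdint>

static uint32_t addPermMasks(uint32_t First, uint32_t Second) {
  uint32_t FirstCs = First & 0x0c0c0c0c;    // bytes that are constant zero
  uint32_t SecondCs = Second & 0x0c0c0c0c;
  uint32_t FirstNoCs = First & ~0x0c0c0c0c; // live byte selectors
  uint32_t SecondNoCs = Second & ~0x0c0c0c0c;
  assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
  assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
  assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
  assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
  return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
}

int main() {
  // First fills only the top output byte; Second fills only the bottom one.
  assert(addPermMasks(0x050c0c0c, 0x0c0c0c01) == 0x050c0c01);
  return 0;
}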
// placeSources(ByteProvider<SDValue> &Src0, ByteProvider<SDValue> &Src1,
//              SmallVectorImpl<DotSrc> &Src0s, SmallVectorImpl<DotSrc> &Src1s,
//              int Step):
for (int BPI = 0; BPI < 2; BPI++) {
  // ...
  BPP = {Src1, Src0};
  // ...
  unsigned ZeroMask = 0x0c0c0c0c;
  unsigned FMask = 0xFF << (8 * (3 - Step));
  unsigned FirstMask =
      (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
  unsigned SecondMask =
      (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
  // ...
  int FirstGroup = -1;
  for (int I = 0; I < 2; I++) {
    // ...
    auto MatchesFirst = [&BPP](DotSrc &IterElt) {
      return IterElt.SrcOp == *BPP.first.Src &&
             (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
    };
  // ...
  if (FirstGroup != -1) {
    // ...
    auto MatchesSecond = [&BPP](DotSrc &IterElt) {
      return IterElt.SrcOp == *BPP.second.Src &&
             (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
    };
    // ...
    Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
// placeSources (cont.) — no existing group matched, start a fresh pair:
unsigned ZeroMask = 0x0c0c0c0c;
unsigned FMask = 0xFF << (8 * (3 - Step));
// ...
    ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
// ...
    ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),

// resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl<DotSrc> &Srcs,
//                bool IsSigned, bool IsAny):
if (Srcs.size() == 1) {
  auto Elt = Srcs.begin();
  // ...
  if (Elt->PermMask == 0x3020100) // identity byte permutation
// ...
auto FirstElt = Srcs.begin();
auto SecondElt = std::next(FirstElt);
// ...
auto FirstMask = FirstElt->PermMask;
auto SecondMask = SecondElt->PermMask;
// ...
unsigned FirstCs = FirstMask & 0x0c0c0c0c;
unsigned FirstPlusFour = FirstMask | 0x04040404;
// ...
FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
// ...
FirstElt = std::next(SecondElt);
if (FirstElt == Srcs.end())
// ...
SecondElt = std::next(FirstElt);
// ...
if (SecondElt == Srcs.end()) {
  // ...
      DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
// ...
return Perms.size() == 2
// ...

// fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength):
for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
  EntryMask = EntryMask >> ((4 - ChainLength) * 8);
  auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
  EntryMask += ZeroMask;
}
// isMul(const SDValue Op):
auto Opcode = Op.getOpcode();

// checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
//                        ByteProvider<SDValue> &Src1, const SDValue &S0Op,
//                        const SDValue &S1Op, const SelectionDAG &DAG):
static std::optional<bool>
// ...
bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
// ...
bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
assert(!(S0IsUnsigned && S0IsSigned));
assert(!(S1IsUnsigned && S1IsSigned));
// Both operands agree on the sign of the MSB:
if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
// Known to disagree — not a dot4 candidate:
if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
  return std::nullopt;
// One side sign-extended, the other unknown:
if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
    ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
// Neither side has sign-bit information:
if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
// One side zero-extended, the other unknown:
if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
    ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
  return std::nullopt;
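The check above classifies each multiply operand as provably sign-extended (known leading ones), provably zero-extended (known leading zeros), or unknown, and only commits to a signed or unsigned dot4 when the evidence is consistent. A standalone sketch of that decision table; the return values of the two branches whose bodies are elided above are assumptions, marked as such:

#include <cassert>
#include <optional>

static std::optional<bool> combineSignedness(bool S0Signed, bool S0Unsigned,
                                             bool S1Signed, bool S1Unsigned) {
  assert(!(S0Signed && S0Unsigned) && !(S1Signed && S1Unsigned));
  if ((S0Unsigned && S1Unsigned) || (S0Signed && S1Signed))
    return S0Signed;     // both agree: use that signedness
  if ((S0Unsigned && S1Signed) || (S0Signed && S1Unsigned))
    return std::nullopt; // known to conflict: no match
  if ((S0Signed && !(S1Signed || S1Unsigned)) ||
      (S1Signed && !(S0Signed || S0Unsigned)))
    return true;         // assumed result (branch body elided in the listing)
  if (!(S0Signed || S0Unsigned) && !(S1Signed || S1Unsigned))
    return false;        // assumed result: no evidence, treat as unsigned
  return std::nullopt;   // one side unsigned, other unknown: bail
}

int main() {
  assert(combineSignedness(true, false, true, false) == true);
  assert(!combineSignedness(false, true, true, false).has_value());
  return 0;
}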
// performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const:
EVT VT = N->getValueType(0);
// ...
if (SDValue Folded = tryFoldToMad64_32(N, DCI))
// ...
if (SDValue V = reassociateScalarOps(N, DAG)) {
// ...
std::optional<bool> IsSigned;
// ...
int ChainLength = 0;
for (int I = 0; I < 4; I++) {
  auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
  // ...
  auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
  // ...
  auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
  // ... = checkDot4MulSignedness(
      TempNode->getOperand(MulIdx), *Src0, *Src1,
      TempNode->getOperand(MulIdx)->getOperand(0),
      TempNode->getOperand(MulIdx)->getOperand(1), DAG);
  // ...
  IsSigned = *IterIsSigned;
  if (*IterIsSigned != *IsSigned)
  // ...
  auto AddIdx = 1 - MulIdx;
  // ...
  if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
    Src2s.push_back(TempNode->getOperand(AddIdx));
    // ... = checkDot4MulSignedness(
        TempNode->getOperand(AddIdx), *Src0, *Src1,
        TempNode->getOperand(AddIdx)->getOperand(0),
        TempNode->getOperand(AddIdx)->getOperand(1), DAG);
    // ...
    if (*IterIsSigned != *IsSigned)
    // ...
    ChainLength = I + 2;
  // ...
  TempNode = TempNode->getOperand(AddIdx);
  // ...
  ChainLength = I + 1;
  if (TempNode->getNumOperands() < 2)
  // ...
  LHS = TempNode->getOperand(0);
  RHS = TempNode->getOperand(1);
}
if (ChainLength < 2)
if (ChainLength < 4) {
// ...
bool UseOriginalSrc = false;
if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
    Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
    Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
    Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
  // ...
  auto Src0Mask = Src0s.begin()->PermMask;
  SrcBytes.push_back(Src0Mask & 0xFF000000);
  bool UniqueEntries = true;
  for (auto I = 1; I < 4; I++) {
    auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
    // ...
    UniqueEntries = false;
  // ...
  if (UniqueEntries) {
    UseOriginalSrc = true;
    // ...
    auto FirstElt = Src0s.begin();
    // ...
    auto SecondElt = Src1s.begin();
    // ... SecondElt->DWordOffset);
// ...
if (!UseOriginalSrc) {
// ...
    DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
// ... *IsSigned ? Intrinsic::amdgcn_sdot4
                 : Intrinsic::amdgcn_udot4,

// performAddCombine (cont.) — fold add of a zext/sext(setcc) into a
// carry-using node:
if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
// ...
unsigned Opc = LHS.getOpcode();
// ...
Opc = RHS.getOpcode();
// ...
auto Cond = RHS.getOperand(0);
// ...
return DAG.getNode(Opc, SL, VTList, Args);
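ChainLength == 4 means four byte-wise products feed a single accumulator, which is exactly what the code then emits as amdgcn_sdot4 / amdgcn_udot4. A runnable scalar model of the unsigned form (illustrative, not the DAG code):

#include <cassert>
#include <cstdint>

// v_dot4 semantics: multiply corresponding bytes of A and B, accumulate
// the four products into C.
static uint32_t udot4(uint32_t A, uint32_t B, uint32_t C) {
  uint32_t Acc = C;
  for (int I = 0; I < 4; ++I) {
    uint32_t ABy = (A >> (8 * I)) & 0xFF;
    uint32_t BBy = (B >> (8 * I)) & 0xFF;
    Acc += ABy * BBy;
  }
  return Acc;
}

int main() {
  // A chain of four (mul i8) terms plus an accumulator is what the matcher
  // above walks before emitting the intrinsic.
  assert(udot4(0x01020304u, 0x01010101u, 10) == 10 + 1 + 2 + 3 + 4);
  return 0;
}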
// performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const:
EVT VT = N->getValueType(0);
if (VT != MVT::i32)
// ...
unsigned Opc = RHS.getOpcode();
// ...
auto Cond = RHS.getOperand(0);
// ...
return DAG.getNode(Opc, SL, VTList, Args);

SDValue SITargetLowering::performAddCarrySubCarryCombine(
    SDNode *N, DAGCombinerInfo &DCI) const {
  if (N->getValueType(0) != MVT::i32)
  // ...
  unsigned LHSOpc = LHS.getOpcode();
  unsigned Opc = N->getOpcode();
// performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const:
EVT VT = N->getValueType(0);
// ...
// (fadd (fadd a, a), b) -> (fused a, 2.0, b) when a fused opcode exists:
if (A == LHS.getOperand(1)) {
  unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
  if (FusedOp != 0) {
    // ...
    return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
// ...
if (A == RHS.getOperand(1)) {
  unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
  if (FusedOp != 0) {
    // ...
    return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
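The numeric identity behind the combine above, as a one-assert check (plain C++; exactness holds because doubling a float is exact and both sides then round once):

#include <cassert>
#include <cmath>

int main() {
  float A = 1.5f, B = -0.25f;
  // (a + a) + b and fma(a, 2.0, b) both compute round(2*a + b).
  assert(std::fma(A, 2.0f, B) == (A + A) + B);
  return 0;
}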
// performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const — the same
// doubling trick for subtraction, with a negated operand:
EVT VT = N->getValueType(0);
// ...
if (A == LHS.getOperand(1)) {
  unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
  // ...
  return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
// ...
if (A == RHS.getOperand(1)) {
  unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
  // ...
  return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);

// performFDivCombine(SDNode *N, DAGCombinerInfo &DCI) const:
EVT VT = N->getValueType(0);
// ...
bool IsNegative = false;
if (CLHS->isExactlyValue(1.0) ||
    (IsNegative = CLHS->isExactlyValue(-1.0))) {
  // 1.0 / x and -1.0 / x become (possibly negated) reciprocal forms.
// performFMACombine(SDNode *N, DAGCombinerInfo &DCI) const:
EVT VT = N->getValueType(0);
// ...
    (N->getFlags().hasAllowContract() &&
     FMA->getFlags().hasAllowContract())) {
// ...
if (Vec1 == Vec2 || Vec3 == Vec4)
// ...
if ((Vec1 == Vec3 && Vec2 == Vec4) ||
    (Vec1 == Vec4 && Vec2 == Vec3)) {
// performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const:
EVT VT = LHS.getValueType();
// ...
auto CRHS = dyn_cast<ConstantSDNode>(RHS);
// ...
CRHS = dyn_cast<ConstantSDNode>(LHS);
// ...
return LHS.getOperand(0);
// setcc of a two-constant select folds back to the select condition:
    isa<ConstantSDNode>(LHS.getOperand(1)) &&
    isa<ConstantSDNode>(LHS.getOperand(2)) &&
    LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
// ...
const APInt &CT = LHS.getConstantOperandAPInt(1);
const APInt &CF = LHS.getConstantOperandAPInt(2);
// ...
return LHS.getOperand(0);
// ...
if (VT != MVT::f32 && VT != MVT::f64 &&
// performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const:
if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
  // ...
  unsigned ShiftOffset = 8 * Offset;
  // shl moves the selected byte down; srl moves it up:
  ShiftOffset -= C->getZExtValue();
  // ...
  ShiftOffset += C->getZExtValue();
  if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
    // ... CVT_F32_UBYTE<ShiftOffset / 8> of the unshifted source:
        MVT::f32, Shifted);
// ...
DCI.AddToWorklist(N);
// ...
return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
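The combine above folds a byte-aligned shift into the cvt_f32_ubyteN byte index: converting byte N of (x >> C) is the same as converting byte N + C/8 of x. A runnable check of that identity (plain C++, srl case):

#include <cassert>
#include <cstdint>

static uint8_t ubyteN(uint32_t X, unsigned N) { return (X >> (8 * N)) & 0xFF; }

int main() {
  uint32_t X = 0xAABBCCDDu;
  unsigned Offset = 1, C = 16;           // cvt_f32_ubyte1 of (x srl 16)
  unsigned ShiftOffset = 8 * Offset + C; // the fragment's arithmetic
  assert(ShiftOffset < 32 && ShiftOffset % 8 == 0);
  assert(ubyteN(X >> C, Offset) == ubyteN(X, ShiftOffset / 8)); // both 0xAA
  return 0;
}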
// performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const:
// ...
return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
// ...
APFloat One(F.getSemantics(), "1.0");
// ...
return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  // (each return below sits under its corresponding ISD/AMDGPUISD case)
  switch (N->getOpcode()) {
    return performAddCombine(N, DCI);
    return performSubCombine(N, DCI);
    return performAddCarrySubCarryCombine(N, DCI);
    return performFAddCombine(N, DCI);
    return performFSubCombine(N, DCI);
    return performFDivCombine(N, DCI);
    return performSetCCCombine(N, DCI);
    return performMinMaxCombine(N, DCI);
    return performFMACombine(N, DCI);
    return performAndCombine(N, DCI);
    return performOrCombine(N, DCI);
    // ...
    if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
        TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
      // ... matchPERM ...
    return performXorCombine(N, DCI);
    return performZeroExtendCombine(N, DCI);
    return performSignExtendInRegCombine(N, DCI);
    return performClassCombine(N, DCI);
    return performFCanonicalizeCombine(N, DCI);
    return performRcpCombine(N, DCI);
    // ...
    return performUCharToFloatCombine(N, DCI);
    return performFCopySignCombine(N, DCI);
    // ...
    return performCvtF32UByteNCombine(N, DCI);
    return performFMed3Combine(N, DCI);
    return performCvtPkRTZCombine(N, DCI);
    return performClampCombine(N, DCI);
    // ...
    EVT VT = N->getValueType(0);
    // ...
    if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
      // ...
      EVT EltVT = Src.getValueType();
      if (EltVT != MVT::i16)
    // ...
    return performExtractVectorEltCombine(N, DCI);
    return performInsertVectorEltCombine(N, DCI);
    return performFPRoundCombine(N, DCI);
    // ...
    if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
    // ...
    if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
      return performMemSDNodeCombine(MemNode, DCI);
// SubIdx2Lane — helper for adjustWritemask:
default: return ~0u;
case AMDGPU::sub0: return 0;
case AMDGPU::sub1: return 1;
case AMDGPU::sub2: return 2;
case AMDGPU::sub3: return 3;
case AMDGPU::sub4: return 4;
// adjustWritemask — shrink an image instruction's dmask to the lanes that
// are actually extracted:
unsigned Opcode = Node->getMachineOpcode();
// ...
if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
// ...
unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
unsigned NewDmask = 0;
// ...
bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
                (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
                   /* ... */;
// ...
unsigned TFCLane = 0;
bool HasChain = Node->getNumValues() > 1;
if (OldDmask == 0) {
// ...
TFCLane = OldBitsSet;
// ...
// Walk the uses: only EXTRACT_SUBREGs of result 0 keep a lane alive.
if (I.getUse().getResNo() != 0)
// ...
if (!I->isMachineOpcode() ||
    I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
// ...
if (UsesTFC && Lane == TFCLane) {
// ...
for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
  // ...
  Dmask &= ~(1 << Comp);
}
// ...
NewDmask |= 1 << Comp;
// ...
bool NoChannels = !NewDmask;
// ...
if (OldBitsSet == 1)
// ...
if (NewDmask == OldDmask)
// ...
unsigned NewChannels = BitsSet + UsesTFC;
// ...
assert(NewOpcode != -1 &&
       NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
       "failed to find equivalent MIMG op");
// ...
MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
// ...
MVT ResultVT = NewChannels == 1 ? /* ... */
    NewChannels == 5 ? 8 : NewChannels);
// ...
if (NewChannels == 1) {
// ...
for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
  // ...
  if (i || !NoChannels)
  // ...
  if (NewUser != User) {
  // ...
  switch (Idx) {
  case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
  case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
  case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
  case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
  }
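The lane walk above drops image-sample result channels that nobody extracts: for every live lane it locates the corresponding set bit (component) in the old dmask and carries it into the new mask. A runnable model of that lane-to-component mapping (plain C++; __builtin_ctz assumed available, as on GCC/Clang):

#include <cassert>

// Find the dmask component backing result lane `Lane`, mirroring the
// "Dmask &= ~(1 << Comp)" scan in the fragment.
static unsigned laneToComponent(unsigned OldDmask, unsigned Lane) {
  unsigned Dmask = OldDmask, Comp = 0;
  for (unsigned I = 0; I <= Lane && Dmask; ++I) {
    Comp = __builtin_ctz(Dmask); // next enabled component
    Dmask &= ~(1u << Comp);
  }
  return Comp;
}

int main() {
  unsigned OldDmask = 0b1011; // components 0, 1, 3 enabled
  unsigned NewDmask = 0;
  for (unsigned Lane : {0u, 2u})       // only lanes 0 and 2 are extracted
    NewDmask |= 1u << laneToComponent(OldDmask, Lane);
  assert(NewDmask == 0b1001);          // components 0 and 3 survive
  return 0;
}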
// isFrameIndexOp(SDValue Op):
Op = Op.getOperand(0);
// ...
return isa<FrameIndexSDNode>(Op);

// legalizeTargetIndependentNode — a CopyToReg of an i1 into a physical
// register goes through a VReg_1 virtual register first:
RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
SDValue SrcVal = Node->getOperand(2);
// ...
    MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
// ...
SDNode *Glued = Node->getGluedNode();
// ...
    = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal, /* ... */
// ...
return ToResultReg.getNode();

// Replace frame-index operands in the remaining operand list:
for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
  // ...
      Node->getOperand(i).getValueType(),
      Node->getOperand(i)), 0));
// PostISelFolding(MachineSDNode *Node, SelectionDAG &DAG) const:
unsigned Opcode = Node->getMachineOpcode();
// ...
if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
    !TII->isGather4(Opcode) && /* ... */
  return adjustWritemask(Node, DAG);
// ...
if (Opcode == AMDGPU::INSERT_SUBREG ||
    Opcode == AMDGPU::REG_SEQUENCE) {
// ...
case AMDGPU::V_DIV_SCALE_F32_e64:
case AMDGPU::V_DIV_SCALE_F64_e64: {
  // ...
  SDValue Src0 = Node->getOperand(1);
  SDValue Src1 = Node->getOperand(3);
  SDValue Src2 = Node->getOperand(5);
  // ...
  if (/* ... */ (Src0 == Src1 || Src0 == Src2))
// Initialize image-instruction result registers when TFE/LWE is in use:
unsigned TFEVal = TFE ? TFE->getImm() : 0;
unsigned LWEVal = LWE ? LWE->getImm() : 0;
unsigned D16Val = D16 ? D16->getImm() : 0;
if (!TFEVal && !LWEVal)
// ...
assert(MO_Dmask && "Expected dmask operand in instruction");
// ...
unsigned dmask = MO_Dmask->getImm();
// With packed D16 two lanes share a dword; one extra slot holds the
// TFE/LWE status:
    D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
// ...
uint32_t DstSize = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
if (DstSize < InitIdx)
// ...
Register PrevDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
unsigned NewDst = 0;
// ...
for (; SizeLeft; SizeLeft--, CurrIdx++) {
  NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
  // ...
}
// ...
MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
// AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const:
if (TII->isVOP3(MI.getOpcode())) {
  // ...
  TII->legalizeOperandsVOP3(MRI, MI);
  // ...
  if (!MI.getDesc().operands().empty()) {
    unsigned Opc = MI.getOpcode();
    bool HasAGPRs = Info->mayNeedAGPRs();
    // ...
    if ((I == Src2Idx) && (HasAGPRs))
    // ...
    if (!Op.isReg() || !Op.getReg().isVirtual())
    // ...
    auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
    if (!TRI->hasAGPRs(RC))
    // ...
    auto *Src = MRI.getUniqueVRegDef(Op.getReg());
    if (!Src || !Src->isCopy() ||
        !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
    // ...
    auto *NewRC = TRI->getEquivalentVGPRClass(RC);
    // ...
    MRI.setRegClass(Op.getReg(), NewRC);
    // ...
    if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
      if (Src2->isReg() && Src2->getReg().isVirtual()) {
        auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
        if (TRI->isVectorSuperClass(RC)) {
          auto *NewRC = TRI->getEquivalentAGPRClass(RC);
          MRI.setRegClass(Src2->getReg(), NewRC);
          if (Src2->isTied())
            MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
// ...
if (TII->isImage(MI)) {
  if (!MI.mayStore())
  // ...
  TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
// wrapAddr64Rsrc / buildRSRC — assemble a 128-bit resource descriptor:
    MVT::v2i32, Ops0), 0);
// ...
    RsrcDword2And3 & UINT64_C(0xFFFFFFFF));

// getRegForInlineAsmConstraint:
std::pair<unsigned, const TargetRegisterClass *>
// ...
if (Constraint.size() == 1) {
  switch (Constraint[0]) {
  // 's' (scalar registers):
    RC = &AMDGPU::SReg_32RegClass;
    // ...
    RC = &AMDGPU::SGPR_64RegClass;
    // ...
    return std::pair(0U, nullptr);
  // 'v' (vector registers):
    RC = &AMDGPU::VGPR_32RegClass;
    // ...
    return std::pair(0U, nullptr);
  // 'a' (accumulator registers):
    RC = &AMDGPU::AGPR_32RegClass;
    // ...
    return std::pair(0U, nullptr);
  }
  // ...
  return std::pair(0U, RC);
// Parse named physical registers ("v0", "s[0:1]", "a5", ...):
if (RegName.consume_front("v")) {
  RC = &AMDGPU::VGPR_32RegClass;
} else if (RegName.consume_front("s")) {
  RC = &AMDGPU::SGPR_32RegClass;
} else if (RegName.consume_front("a")) {
  RC = &AMDGPU::AGPR_32RegClass;
}
// ...
if (RegName.consume_front("[")) {
  // ...
  RC = TRI->getVGPRClassForBitWidth(Width);
  // ...
  RC = TRI->getSGPRClassForBitWidth(Width);
  // ...
  RC = TRI->getAGPRClassForBitWidth(Width);
  // ...
  Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
  return std::pair(Reg, RC);
}
// ...
if (!Failed && Idx < RC->getNumRegs())
// ...
Ret.second = TRI->getPhysRegBaseClass(Ret.first);
// getConstraintType:
if (Constraint.size() == 1) {
  switch (Constraint[0]) {
  // ...
} else if (Constraint == "DA" ||
           Constraint == "DB") {

// ...
if (Constraint.size() == 1) {
  switch (Constraint[0]) {

// clearUnusedBits(uint64_t Val, unsigned Size):
Val = Val & maskTrailingOnes<uint64_t>(Size);

// LowerAsmOperandForConstraint(..., std::vector<SDValue> &Ops, ...):
unsigned Size = Op.getScalarValueSizeInBits();
// ...
Val = C->getSExtValue();
// ...
Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
// ...
if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
// ...
Val = C->getSExtValue();
// ...
Val = C->getValueAPF().bitcastToAPInt().getSExtValue();

// Immediate constraint checks ("I", "J", ..., "DA", "DB"):
if (Constraint.size() == 1) {
  switch (Constraint[0]) {
  // ...
    return isInt<16>(Val);
  // ...
    return isInt<32>(Val);
  // ...
} else if (Constraint.size() == 2) {
  if (Constraint == "DA") {
    int64_t HiBits = static_cast<int32_t>(Val >> 32);
    int64_t LoBits = static_cast<int32_t>(Val);
    // ...
  }
  if (Constraint == "DB") {

// checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize) const:
unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
// ...
MVT VT = Op.getSimpleValueType();
// getAlignedAGPRClassID(unsigned UnalignedClassID) — map vector/accumulator
// register classes to their even-aligned variants:
switch (UnalignedClassID) {
case AMDGPU::VReg_64RegClassID:
  return AMDGPU::VReg_64_Align2RegClassID;
case AMDGPU::VReg_96RegClassID:
  return AMDGPU::VReg_96_Align2RegClassID;
case AMDGPU::VReg_128RegClassID:
  return AMDGPU::VReg_128_Align2RegClassID;
case AMDGPU::VReg_160RegClassID:
  return AMDGPU::VReg_160_Align2RegClassID;
case AMDGPU::VReg_192RegClassID:
  return AMDGPU::VReg_192_Align2RegClassID;
case AMDGPU::VReg_224RegClassID:
  return AMDGPU::VReg_224_Align2RegClassID;
case AMDGPU::VReg_256RegClassID:
  return AMDGPU::VReg_256_Align2RegClassID;
case AMDGPU::VReg_288RegClassID:
  return AMDGPU::VReg_288_Align2RegClassID;
case AMDGPU::VReg_320RegClassID:
  return AMDGPU::VReg_320_Align2RegClassID;
case AMDGPU::VReg_352RegClassID:
  return AMDGPU::VReg_352_Align2RegClassID;
case AMDGPU::VReg_384RegClassID:
  return AMDGPU::VReg_384_Align2RegClassID;
case AMDGPU::VReg_512RegClassID:
  return AMDGPU::VReg_512_Align2RegClassID;
case AMDGPU::VReg_1024RegClassID:
  return AMDGPU::VReg_1024_Align2RegClassID;
case AMDGPU::AReg_64RegClassID:
  return AMDGPU::AReg_64_Align2RegClassID;
case AMDGPU::AReg_96RegClassID:
  return AMDGPU::AReg_96_Align2RegClassID;
case AMDGPU::AReg_128RegClassID:
  return AMDGPU::AReg_128_Align2RegClassID;
case AMDGPU::AReg_160RegClassID:
  return AMDGPU::AReg_160_Align2RegClassID;
case AMDGPU::AReg_192RegClassID:
  return AMDGPU::AReg_192_Align2RegClassID;
case AMDGPU::AReg_256RegClassID:
  return AMDGPU::AReg_256_Align2RegClassID;
case AMDGPU::AReg_512RegClassID:
  return AMDGPU::AReg_512_Align2RegClassID;
case AMDGPU::AReg_1024RegClassID:
  return AMDGPU::AReg_1024_Align2RegClassID;
// finalizeLowering(MachineFunction &MF) const:
if (Info->isEntryFunction()) {
// ...
unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
// ...
    ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
    : TRI->getAlignedHighSGPRForRC(MF, 2, &AMDGPU::SGPR_64RegClass);
Info->setSGPRForEXECCopy(SReg);
// ...
    Info->getStackPtrOffsetReg()));
if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
  MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
// ...
if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
  MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
// ...
if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
  MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
// ...
Info->limitOccupancy(MF);
// ...
if (ST.isWave32() && !MF.empty()) {
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      TII->fixImplicitOperands(MI);
    }
  }
}
// Promote virtual registers to even-aligned classes where the subtarget
// requires it:
if (ST.needsAlignedVGPRs()) {
  for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
    // ...
    if (NewClassID != -1)
      MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
  }
}
// computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known,
//     const APInt &DemandedElts, const SelectionDAG &DAG,
//     unsigned Depth) const:
unsigned Opc = Op.getOpcode();
// ...
unsigned IID = Op.getConstantOperandVal(0);
// ...
case Intrinsic::amdgcn_mbcnt_lo:
case Intrinsic::amdgcn_mbcnt_hi: {
  // mbcnt is bounded by the lane count plus the src1 addend:
  unsigned MaxActiveBits = std::max(Src1ValBits, ST.getWavefrontSizeLog2());
  // ...
  MaxActiveBits += Src1ValBits ? 1 : 0;
  unsigned Size = Op.getValueType().getSizeInBits();
  if (MaxActiveBits < Size)
  // ...
}
// ...
    Op, Known, DemandedElts, DAG, Depth);

// computeKnownBitsForTargetInstr (GlobalISel):
unsigned MaxValue =
// ...
switch (MI->getOpcode()) {
case AMDGPU::G_INTRINSIC:
case AMDGPU::G_INTRINSIC_CONVERGENT: {
  // ...
  case Intrinsic::amdgcn_workitem_id_x:
  // ...
  case Intrinsic::amdgcn_workitem_id_y:
  // ...
  case Intrinsic::amdgcn_workitem_id_z:
  // ...
  case Intrinsic::amdgcn_mbcnt_lo:
  case Intrinsic::amdgcn_mbcnt_hi: {
    // ...
    unsigned Size = MRI.getType(R).getSizeInBits();
    // ...
  }
  case Intrinsic::amdgcn_groupstaticsize: {
  // ...
}
case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
// ...
case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
// ...
case AMDGPU::G_AMDGPU_SMED3:
case AMDGPU::G_AMDGPU_UMED3: {
  auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
// computeKnownAlignForTargetInstr:
unsigned Depth) const {
// ...
if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
  // ...
  if (MaybeAlign RetAlign = Attrs.getRetAlignment())

// getPrefLoopAlignment(MachineLoop *ML) const:
if (Header->getAlignment() != PrefAlign)
  return Header->getAlignment(); // already processed
// ...
unsigned LoopSize = 0;
// ...
LoopSize += TII->getInstSizeInBytes(MI);
if (LoopSize > 192)
// ...
if (LoopSize <= 64)
// ...
if (LoopSize <= 128)
  return CacheLineAlign;
// If an exit block already starts with S_INST_PREFETCH, this loop was
// handled on an earlier pass:
auto I = Exit->getFirstNonDebugInstr();
if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
  return CacheLineAlign;
// ...
if (PreTerm == Pre->begin() ||
    std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
// ...
auto ExitHead = Exit->getFirstNonDebugInstr();
if (ExitHead == Exit->end() ||
    ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
// ...
return CacheLineAlign;
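The alignment decision above pads small loops to a cache-line boundary so instruction prefetching pays off, and leaves loops outside the 65..192-byte window alone. A standalone model of just the size thresholds (the 64/128/192 cutoffs are from the fragment; the enum and function name are illustrative):

#include <cassert>

enum class LoopAlign { Default, CacheLine /* 64-byte boundary */ };

static LoopAlign pickLoopAlignment(unsigned LoopSizeBytes) {
  if (LoopSizeBytes > 192 || LoopSizeBytes <= 64)
    return LoopAlign::Default; // too big to help, or already fits one fetch
  // 65..128 bytes: align to the cache line. 129..192 bytes additionally get
  // S_INST_PREFETCH instructions placed around the loop (elided here).
  return LoopAlign::CacheLine;
}

int main() {
  assert(pickLoopAlignment(48) == LoopAlign::Default);
  assert(pickLoopAlignment(100) == LoopAlign::CacheLine);
  assert(pickLoopAlignment(400) == LoopAlign::Default);
  return 0;
}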
// isSDNodeSourceOfDivergence(const SDNode *N, ...):
N = N->getOperand(0).getNode();
// ...
switch (N->getOpcode()) {
  // ...
  if (Reg.isPhysical() || MRI.isLiveIn(Reg))
    return !TRI->isSGPRReg(MRI, Reg);
  // ...
  return !TRI->isSGPRReg(MRI, Reg);
  // ...
  unsigned AS = L->getAddressSpace();
  // ...
  if (auto *A = dyn_cast<AtomicSDNode>(N)) {
    // Atomics that both read and write memory are divergence sources:
    return A->readMem() && A->writeMem();
  }

// isKnownNeverNaNForTargetNode — CLAMP with DX10Clamp never yields NaN:
unsigned Depth) const {
// ...
if (Info->getMode().DX10Clamp)

// unsafeFPAtomicsDisabled(Function *F):
return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
    /* ... */;

// Optimization remark emitted for unsafe hardware atomics:
<< "Hardware instruction generated for atomic "
<< " operation at memory scope " << MemScope
<< " due to an unsafe request.";
// ...
bool HasSystemScope =
// ...
if (HasSystemScope)
// ...
if (HasSystemScope)

// getRegClassFor — pick SGPR vs. VGPR classes from divergence:
if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
  // ... wave-size boolean class:
      : &AMDGPU::SReg_32RegClass;
if (!TRI->isSGPRClass(RC) && !isDivergent)
  return TRI->getEquivalentSGPRClass(RC);
else if (TRI->isSGPRClass(RC) && isDivergent)
  return TRI->getEquivalentVGPRClass(RC);
// hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
//           unsigned WaveSize):
if (!IT || IT->getBitWidth() != WaveSize)
// ...
if (!isa<Instruction>(V))
// ...
if (!Visited.insert(V).second)
// ...
bool Result = false;
for (const auto *U : V->users()) {
  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
    if (V == U->getOperand(1)) {
      switch (Intrinsic->getIntrinsicID()) {
      // ...
      case Intrinsic::amdgcn_if_break:
      case Intrinsic::amdgcn_if:
      case Intrinsic::amdgcn_else:
      // ...
      }
    }
    if (V == U->getOperand(0)) {
      switch (Intrinsic->getIntrinsicID()) {
      // ...
      case Intrinsic::amdgcn_end_cf:
      case Intrinsic::amdgcn_loop:
      // ...
      }
    }
  } else {
    Result = hasCFUser(U, Visited, WaveSize);
  }
// requiresUniformRegister(MachineFunction &MF, const Value *V) const:
if (const CallInst *CI = dyn_cast<CallInst>(V)) {
  if (CI->isInlineAsm()) {
    // ...
    for (auto &TC : TargetConstraints) {
      // ...
          SIRI, TC.ConstraintCode, TC.ConstraintVT).second;
    }
  }
}
// ...
for (; I != E; ++I) {
  if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) {
    // ...
  }
}
// ...
return MRI.hasOneNonDBGUse(N0);

// Uniform-memory-access metadata checks:
if (I.getMetadata("amdgpu.noclobber"))
// ...
if (I.getMetadata("amdgpu.last.use"))
// checkForPhysRegDependency — model SCC dependencies between nodes:
if (!Def->isMachineOpcode())
// ...
PhysReg = AMDGPU::SCC;
// ...
    TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));

// emitExpandAtomicRMW — expand a flat atomicrmw fadd in IR:
assert(/* ... */ "target should have atomic fadd instructions");
assert(/* ... */ "generic atomicrmw expansion only supports FP32 operand in flat "
       /* ... */);
assert(/* ... */ "only fadd is supported for now");
// ...
for (auto &P : MDs)
// Branch on amdgcn.is.shared / amdgcn.is.private and dispatch the RMW:
    {Addr}, nullptr, "is.shared");
Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
// ...
Value *LoadedShared = CreateNewAtomicRMW(Builder, CastToLocal, Val);
// ...
    Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
// ...
Value *LoadedPrivate =
    Builder.CreateLoad(ValTy, CastToPrivate, "loaded.private");
// ...
Value *LoadedGlobal = CreateNewAtomicRMW(Builder, CastToGlobal, Val);
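The expansion above tests a flat pointer with amdgcn.is.shared / amdgcn.is.private and routes the read-modify-write to an LDS atomic, a plain private load/add/store, or a global atomic. A plain-C++ model of just that three-way dispatch (the enum, function, and callbacks are illustrative stand-ins for the generated basic blocks, not the IRBuilder code):

#include <cassert>

enum class AddrKind { Shared, Private, Global };

template <typename SharedFn, typename PrivateFn, typename GlobalFn>
static float expandFlatAtomicFAdd(AddrKind Kind, SharedFn OnShared,
                                  PrivateFn OnPrivate, GlobalFn OnGlobal) {
  switch (Kind) {
  case AddrKind::Shared:  return OnShared();  // ds-atomic path
  case AddrKind::Private: return OnPrivate(); // load; fadd; store
  case AddrKind::Global:  return OnGlobal();  // global-atomic path
  }
  return 0.0f;
}

int main() {
  float Mem = 1.0f, Val = 2.5f;
  float Old = expandFlatAtomicFAdd(
      AddrKind::Private,
      [] { return 0.0f; },
      [&] { float Loaded = Mem; Mem += Val; return Loaded; }, // returns old value
      [] { return 0.0f; });
  assert(Old == 1.0f && Mem == 3.5f);
  return 0;
}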
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
#define LLVM_ATTRIBUTE_UNUSED
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
iv Induction Variable Users
static const unsigned MaxDepth
Contains matchers for matching SSA Machine Instructions.
unsigned const TargetRegisterInfo * TRI
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
const char LLVMTargetMachineRef TM
const SmallVectorImpl< MachineOperand > & Cond
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes)
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
bool unsafeFPAtomicsDisabled(Function *F)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getIdxEn(SDValue VIndex)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
static constexpr int Concat[]
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
bool hasMadMacF32Insts() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
This class represents an incoming formal argument to a Function.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
@ Min
*p = old <signed v ? old : v
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Value * getPointerOperand()
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
StringRef getValueAsString() const
Return the attribute's value as a string.
LLVM Basic Block Representation.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
bool isFPPredicate() const
bool isIntPredicate() const
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Diagnostic information for unsupported feature in backend.
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowerinInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
bool hasD16Images() const
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
bool hasDot7Insts() const
bool hasApertureRegs() const
bool hasFlatInstOffsets() const
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasBCNT(unsigned Size) const
bool hasMultiDwordFlatScratchAddressing() const
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
bool hasUnalignedDSAccessEnabled() const
const SIInstrInfo * getInstrInfo() const override
bool hasDot1Insts() const
bool hasAtomicFaddRtnInsts() const
Align getStackAlignment() const
bool hasScalarSubwordLoads() const
bool enableFlatScratch() const
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
bool supportsGetDoorbellID() const
bool hasFlatAtomicFaddF32Inst() const
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
const SIFrameLowering * getFrameLowering() const override
bool hasUnalignedScratchAccess() const
bool hasLDSFPAtomicAdd() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
bool hasGFX940Insts() const
bool hasFullRate64Ops() const
bool isTrapHandlerEnabled() const
bool hasFlatGlobalInsts() const
bool getScalarizeGlobalBehavior() const
bool hasScalarSMulU64() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
bool hasNSAEncoding() const
bool hasSMemRealTime() const
bool usePRTStrictNull() const
bool hasUnalignedBufferAccessEnabled() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasImageGather4D16Bug() const
bool supportsMinMaxDenormModes() const
bool hasAtomicFaddInsts() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicFaddNoRtnInsts() const
bool hasScalarDwordx3Loads() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
bool hasDot8Insts() const
bool hasDS96AndDS128() const
bool useFlatForGlobal() const
Generation getGeneration() const
bool hasScalarAddSub64() const
bool hasUnpackedD16VMem() const
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
bool hasPackedTID() const
bool hasAddNoCarry() const
bool hasGWSAutoReplay() const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
int64_t getOffset() const
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Value * CreateFAdd(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
BasicBlock::iterator GetInsertPoint() const
BasicBlock * GetInsertBlock() const
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
LLVMContext & getContext() const
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
const BasicBlock * getParent() const
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
void getAllMetadata(SmallVectorImpl< std::pair< unsigned, MDNode * > > &MDs) const
Get all metadata attached to this Instruction.
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
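Sketch uses of these LLT factories (the address space number is an assumption for illustration):
  LLT S32 = LLT::scalar(32);                              // plain 32-bit scalar
  LLT Ptr = LLT::pointer(/*AddressSpace=*/1, /*SizeInBits=*/64);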
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
void getSyncScopeNames(SmallVectorImpl< StringRef > &SSNs) const
getSyncScopeNames - Populates client supplied SmallVector with synchronization scope names registered...
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
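These two pieces combine naturally; a sketch (Ctx and LI are hypothetical, "agent" being one of the AMDGPU synchronization scope names):
  SyncScope::ID AgentSSID = Ctx.getOrInsertSyncScopeID("agent");
  LI->setAtomic(AtomicOrdering::Acquire, AgentSSID);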
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
bool isCompare() const
Return true if this instruction is a comparison.
bool hasImplicitDefOfPhysReg(unsigned Reg, const MCRegisterInfo *MRI=nullptr) const
Return true if this instruction implicitly defines the specified physical register.
Wrapper class representing physical registers. Should be passed by value.
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
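Sketch uses of the MVT factories and accessors above:
  MVT V4I32 = MVT::getVectorVT(MVT::i32, 4);   // v4i32
  MVT I64   = MVT::getIntegerVT(64);           // i64
  MVT Elt   = V4I32.getScalarType();           // i32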
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor blocks which refer to FromMBB to refer to this.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before 'Where'.
Align getAlignment() const
Return alignment of the basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
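A sketch of the common pairing with getFrameIndex (listed further below) when lowering an incoming stack argument; MFI, DAG and Offset are hypothetical:
  int FI = MFI.CreateFixedObject(/*Size=*/4, /*SPOffset=*/Offset,
                                 /*IsImmutable=*/true);
  SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);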
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
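A sketch allocating an MMO for a 4-byte stack load (MF and FI are hypothetical):
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo::getFixedStack(MF, FI),
      MachineMemOperand::MOLoad, LLT::scalar(32), Align(4));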
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual register for it.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
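The builder methods above chain; a sketch emitting a scalar move (BB, MI, DL, TII and Dest are hypothetical):
  BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), Dest)
      .addImm(0);   // Dest = s_mov_b32 0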
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
void setImplicit(bool Val=true)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
A Module instance is used to store all the information related to an LLVM module.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns whether Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool hasWorkGroupIDZ() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the given node can be combined with others to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void AddIMGInit(MachineInstr &MI) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isMemOpUniform(const SDNode *N) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can be turned into a fence followed by an atomic load.
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns whether it is reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
bool getAddrModeArguments(IntrinsicInst *, SmallVectorImpl< Value * > &, Type *&) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const Pass * getPass() const
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
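A sketch combining getSetCC with getSelect from above (DL, X, A and B are hypothetical):
  SDValue IsZero = DAG.getSetCC(DL, MVT::i1, X,
                                DAG.getConstant(0, DL, MVT::i32),
                                ISD::SETEQ);
  SDValue Sel = DAG.getSelect(DL, MVT::i32, IsZero, A, B);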
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
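A sketch of the typical pairing with getNode (listed further below); DL and X are hypothetical:
  SDValue K   = DAG.getConstant(42, DL, MVT::i32);
  SDValue Sum = DAG.getNode(ISD::ADD, DL, MVT::i32, X, K);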
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, which starts a new call frame in which InSize bytes are set up inside ...
SDValue getRegister(unsigned Reg, EVT VT)
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
const TargetMachine & getTarget() const
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
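For example, a sketch testing whether an i64 value provably fits in its low 32 bits (Val is hypothetical):
  bool FitsIn32 =
      DAG.MaskedValueIsZero(Val, APInt::getHighBitsSet(64, 32));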
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
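A sketch splitting an i64 SDValue into 32-bit halves (N and DL are hypothetical):
  auto [Lo, Hi] = DAG.SplitScalar(N, DL, MVT::i32, MVT::i32);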
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
iterator insert(iterator I, T &&Elt)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
StringRef - Represent a constant reference to a string, i.e.
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
constexpr size_t size() const
size - Get the string size.
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
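A sketch of the register-name lookup pattern this enables (RegName is hypothetical):
  Register Reg = StringSwitch<Register>(RegName)
                     .Case("exec", AMDGPU::EXEC)
                     .Case("m0", AMDGPU::M0)
                     .Default(Register());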
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
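A sketch of how a target constructor marks legalize actions (run inside a TargetLowering subclass constructor; see also setTruncStoreAction below):
  setOperationAction(ISD::CTPOP, MVT::i64, Expand);   // expand 64-bit popcount
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);    // no i64->i16 trunc-store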
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
A Use represents the edge between a Value definition and its users.
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< use_iterator > uses()
void takeName(Value *V)
Transfer the name from V to this value.
constexpr bool isZero() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ BUFFER_ATOMIC_COND_SUB_U32
@ TBUFFER_LOAD_FORMAT_D16
@ BUFFER_ATOMIC_FADD_BF16
@ TBUFFER_STORE_FORMAT_D16
@ BUFFER_STORE_FORMAT_D16
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimum or maximum on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
@ SMULO
Same as the overflow-aware [US]ADDO/[US]SUBO nodes, but for multiplication.
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
StringRef getName(ID id)
Return the LLVM name for an intrinsic, such as "llvm.ppc.altivec.lvx".
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ System
Synchronized with respect to all concurrently executing threads.
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
int popcount(T Value) noexcept
Count the number of set bits in a value.
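Worked values for these two integer helpers, as a self-contained sketch:

  #include "llvm/ADT/bit.h"
  #include "llvm/Support/MathExtras.h"
  #include <cassert>
  void checkIntHelpers() {
    assert(llvm::maxIntN(16) == 32767);  // largest 16-bit signed value
    assert(llvm::minIntN(16) == -32768); // smallest 16-bit signed value
    assert(llvm::popcount(0xF0u) == 4);  // four bits set
  }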
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
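The overload listed here builds the instruction without inserting it; lowering code more often uses the insert-point overload. A minimal hedged sketch (TII, MBB, I, DL, and DstReg assumed in scope):

  // DstReg = s_mov_b32 0, inserted before iterator I in MBB.
  llvm::BuildMI(MBB, I, DL, TII->get(llvm::AMDGPU::S_MOV_B32), DstReg)
      .addImm(0);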
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
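A quick worked value, of the kind used to round element counts up to whole registers:

  assert(llvm::divideCeil(10, 3) == 4); // ceil(10 / 3)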
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
testing::Matcher< const detail::ErrorHolder & > Failed()
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant, stopping at the first 1.
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
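Quick worked values for the bit-manipulation helpers in this group (same MathExtras.h / bit.h includes as the sketch above):

  assert(llvm::countr_zero(0x8u) == 3);   // 0b1000 has three trailing zeros
  assert(llvm::PowerOf2Ceil(5) == 8);
  assert(llvm::isShiftedMask_64(0x0FF0)); // one contiguous run of ones
  assert(llvm::Log2_32(32) == 5 && llvm::isPowerOf2_32(32));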
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
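These two are the standard way to split a 64-bit immediate when an operation is expanded into 32-bit halves, e.g.:

  uint64_t Imm = 0x123456789ABCDEF0ULL;
  uint32_t Lo = llvm::Lo_32(Imm); // 0x9ABCDEF0
  uint32_t Hi = llvm::Hi_32(Imm); // 0x12345678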
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns the smallest multiple of A needed to store Size bytes.
DWARFExpression::Operation Op
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
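A hedged sketch of the usual DAG-combine pattern built on this helper (N is an assumed SDNode whose second operand may be a scalar or splatted constant):

  if (llvm::ConstantSDNode *C = llvm::isConstOrConstSplat(N->getOperand(1)))
    if (C->isZero())
      return N->getOperand(0); // fold (op x, 0) -> x where that is sound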
constexpr unsigned BitWidth
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the largest alignment satisfied by both the alignment A and the offset (same semantics as MinAlign).
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value that is congruent to Skew modulo Align.
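Worked values for the alignment arithmetic above (llvm/Support/Alignment.h):

  assert(llvm::alignTo(10, llvm::Align(8)) == 16); // round up
  assert(llvm::alignDown(13, 8) == 8);             // round down
  assert(llvm::commonAlignment(llvm::Align(16), 4) == llvm::Align(4));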
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static const fltSemantics & IEEEsingle() LLVM_READNONE
static constexpr roundingMode rmNearestTiesToEven
static const fltSemantics & IEEEhalf() LLVM_READNONE
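A hedged sketch tying these APFloat pieces together: extending an f16 constant to f32 with the listed rounding mode.

  #include "llvm/ADT/APFloat.h"
  void extendHalf() {
    bool LosesInfo = false;
    llvm::APFloat Val(llvm::APFloat::IEEEhalf(), "1.5");
    Val.convert(llvm::APFloat::IEEEsingle(),
                llvm::APFloat::rmNearestTiesToEven, &LosesInfo);
    // f16 -> f32 is exact, so LosesInfo stays false.
  }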
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
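A hedged sketch of how these descriptors are consumed (MFI is an assumed SIMachineFunctionInfo pointer; the enumerator follows AMDGPUFunctionArgInfo::PreloadedValue):

  auto [Arg, RC, Ty] =
      MFI->getPreloadedValue(llvm::AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  if (Arg && Arg->isRegister()) {
    llvm::Register R = Arg->getRegister(); // preloaded in a register
    (void)R;
  }
  (void)RC; (void)Ty;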
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has fewer bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the vector's element count is a power of 2.
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
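A compact worked example using the EVT queries listed above (llvm/CodeGen/ValueTypes.h):

  llvm::LLVMContext Ctx;
  llvm::EVT VT = llvm::EVT::getVectorVT(Ctx, llvm::MVT::i32, 4); // v4i32
  assert(VT.isVector() && VT.getVectorNumElements() == 4);
  assert(VT.getScalarType() == llvm::MVT::i32);
  assert(VT.getSizeInBits() == 128 && VT.getStoreSize() == 16);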
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
bool isUnknown() const
Returns true if we don't know any bits.
void resetAll()
Resets the known state of all bits.
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
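A hedged sketch of how the known-bits queries drive a combine (DAG and Op assumed in scope, as inside a target lowering hook):

  llvm::KnownBits Known = DAG.computeKnownBits(Op);
  if (Known.countMaxActiveBits() <= 16) {
    // Every possible value of Op fits in 16 bits, so a 16-bit
    // operation (e.g. a 16-bit multiply) is safe here.
  }
  unsigned LZ = Known.countMinLeadingZeros(); // top bits known to be zero
  (void)LZ;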
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
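A hedged sketch of building pointer info for a fixed stack slot, in the style of the call-lowering code (DAG, DL, Chain, Ptr, MF, and FI assumed in scope):

  llvm::MachinePointerInfo PtrInfo =
      llvm::MachinePointerInfo::getFixedStack(MF, FI);
  llvm::SDValue Load =
      DAG.getLoad(llvm::MVT::i32, DL, Chain, Ptr, PtrInfo);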
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg If BaseGV is null...
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const