LLVM 22.0.0git
AMDGPUAsmPrinter.cpp
Go to the documentation of this file.
1//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10///
11/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
12/// code. When passed an MCAsmStreamer it prints assembly and when passed
13/// an MCObjectStreamer it outputs binary code.
14//
15//===----------------------------------------------------------------------===//
16//
17
18#include "AMDGPUAsmPrinter.h"
19#include "AMDGPU.h"
23#include "GCNSubtarget.h"
28#include "R600AsmPrinter.h"
40#include "llvm/MC/MCAssembler.h"
41#include "llvm/MC/MCContext.h"
43#include "llvm/MC/MCStreamer.h"
44#include "llvm/MC/MCValue.h"
51
52using namespace llvm;
53using namespace llvm::AMDGPU;
54
55// This should get the default rounding mode from the kernel. We just set the
56// default here, but this could change if the OpenCL rounding mode pragmas are
57// used.
58//
59// The denormal mode here should match what is reported by the OpenCL runtime
60// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
61// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
62//
63// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
64// precision, and leaves single precision to flush all and does not report
65// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
66// CL_FP_DENORM for both.
67//
68// FIXME: It seems some instructions do not support single precision denormals
69// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
70// and sin_f32, cos_f32 on most parts).
71
72// We want to use these instructions, and using fp32 denormals also causes
73// instructions to run at the double precision rate for the device so it's
74// probably best to just report no single precision denormals.
81
82static AsmPrinter *
84 std::unique_ptr<MCStreamer> &&Streamer) {
85 return new AMDGPUAsmPrinter(tm, std::move(Streamer));
86}
87
95
97 std::unique_ptr<MCStreamer> Streamer)
98 : AsmPrinter(TM, std::move(Streamer)) {
99 assert(OutStreamer && "AsmPrinter constructed without streamer");
100}
101
103 return "AMDGPU Assembly Printer";
104}
105
107 return TM.getMCSubtargetInfo();
108}
109
111 if (!OutStreamer)
112 return nullptr;
113 return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
114}
115
119
120void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
122
123 // TODO: Which one is called first, emitStartOfAsmFile or
124 // emitFunctionBodyStart?
125 if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
126 initializeTargetID(M);
127
130 return;
131
133
136 CodeObjectVersion);
137 HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
138 }
139
142}
143
145 // Init target streamer if it has not yet happened
147 initTargetStreamer(M);
148
149 if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
151
152 // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
153 // Emit HSA Metadata (NT_AMD_HSA_METADATA).
154 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
155 HSAMetadataStream->end();
156 bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
157 (void)Success;
158 assert(Success && "Malformed HSA Metadata");
159 }
160}
161
163 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
164 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
165 const Function &F = MF->getFunction();
166
167 // TODO: We're checking this late, would be nice to check it earlier.
168 if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
170 STM.getCPU() + " is only available on code object version 6 or better");
171 }
172
173 // TODO: Which one is called first, emitStartOfAsmFile or
174 // emitFunctionBodyStart?
175 if (!getTargetStreamer()->getTargetID())
176 initializeTargetID(*F.getParent());
177
178 const auto &FunctionTargetID = STM.getTargetID();
179 // Make sure function's xnack settings are compatible with module's
180 // xnack settings.
181 if (FunctionTargetID.isXnackSupported() &&
182 FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
183 FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) {
184 OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) +
185 "' function does not match module xnack setting");
186 return;
187 }
188 // Make sure function's sramecc settings are compatible with module's
189 // sramecc settings.
190 if (FunctionTargetID.isSramEccSupported() &&
191 FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
192 FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {
193 OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) +
194 "' function does not match module sramecc setting");
195 return;
196 }
197
198 if (!MFI.isEntryFunction())
199 return;
200
201 if (STM.isMesaKernel(F) &&
202 (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
203 F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
204 AMDGPUMCKernelCodeT KernelCode;
205 getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
206 KernelCode.validate(&STM, MF->getContext());
208 }
209
210 if (STM.isAmdHsaOS())
211 HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
212}
213
215 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
216 if (!MFI.isEntryFunction())
217 return;
218
219 if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
220 return;
221
222 auto &Streamer = getTargetStreamer()->getStreamer();
223 auto &Context = Streamer.getContext();
224 auto &ObjectFileInfo = *Context.getObjectFileInfo();
225 auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
226
227 Streamer.pushSection();
228 Streamer.switchSection(&ReadOnlySection);
229
230 // CP microcode requires the kernel descriptor to be allocated on 64 byte
231 // alignment.
232 Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
233 ReadOnlySection.ensureMinAlignment(Align(64));
234
235 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
236
237 SmallString<128> KernelName;
238 getNameWithPrefix(KernelName, &MF->getFunction());
240 STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
241 CurrentProgramInfo.NumVGPRsForWavesPerEU,
243 CurrentProgramInfo.NumSGPRsForWavesPerEU,
245 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
246 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Context),
247 Context),
248 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
249
250 Streamer.popSection();
251}
252
254 Register RegNo = MI->getOperand(0).getReg();
255
257 raw_svector_ostream OS(Str);
258 OS << "implicit-def: "
259 << printReg(RegNo, MF->getSubtarget().getRegisterInfo());
260
261 if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
262 OS << " : SGPR spill to VGPR lane";
263
264 OutStreamer->AddComment(OS.str());
265 OutStreamer->addBlankLine();
266}
267
269 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
271 return;
272 }
273
274 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
275 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
276 if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
277 SmallString<128> SymbolName;
278 getNameWithPrefix(SymbolName, &MF->getFunction()),
280 SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
281 }
282 if (DumpCodeInstEmitter) {
283 // Disassemble function name label to text.
284 DisasmLines.push_back(MF->getName().str() + ":");
285 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
286 HexLines.emplace_back("");
287 }
288
290}
291
293 if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
294 // Write a line for the basic block label if it is not only fallthrough.
295 DisasmLines.push_back(
296 (Twine("BB") + Twine(getFunctionNumber())
297 + "_" + Twine(MBB.getNumber()) + ":").str());
298 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
299 HexLines.emplace_back("");
300 }
302}
303
306 if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
307 OutContext.reportError({},
308 Twine(GV->getName()) +
309 ": unsupported initializer for address space");
310 return;
311 }
312
313 // LDS variables aren't emitted in HSA or PAL yet.
314 const Triple::OSType OS = TM.getTargetTriple().getOS();
315 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
316 return;
317
318 MCSymbol *GVSym = getSymbol(GV);
319
320 GVSym->redefineIfPossible();
321 if (GVSym->isDefined() || GVSym->isVariable())
322 report_fatal_error("symbol '" + Twine(GVSym->getName()) +
323 "' is already defined");
324
325 const DataLayout &DL = GV->getDataLayout();
326 uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
327 Align Alignment = GV->getAlign().value_or(Align(4));
328
329 emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
330 emitLinkage(GV, GVSym);
331 auto *TS = getTargetStreamer();
332 TS->emitAMDGPULDS(GVSym, Size, Alignment);
333 return;
334 }
335
337}
338
340 CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);
341
342 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
343 switch (CodeObjectVersion) {
345 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV4>();
346 break;
348 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV5>();
349 break;
351 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV6>();
352 break;
353 default:
354 reportFatalUsageError("unsupported code object version");
355 }
356 }
357
359}
360
361/// Mimics GCNSubtarget::computeOccupancy for MCExpr.
362///
363/// Remove dependency on GCNSubtarget and depend only on the necessary values
364/// for said occupancy computation. Should match computeOccupancy implementation
365/// without passing \p STM on.
366const AMDGPUMCExpr *createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs,
367 const MCExpr *NumVGPRs,
368 unsigned DynamicVGPRBlockSize,
369 const GCNSubtarget &STM, MCContext &Ctx) {
370 unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM);
371 unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM, DynamicVGPRBlockSize);
372 unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM);
373 unsigned Generation = STM.getGeneration();
374
375 auto CreateExpr = [&Ctx](unsigned Value) {
376 return MCConstantExpr::create(Value, Ctx);
377 };
378
380 {CreateExpr(MaxWaves), CreateExpr(Granule),
381 CreateExpr(TargetTotalNumVGPRs),
382 CreateExpr(Generation), CreateExpr(InitOcc),
383 NumSGPRs, NumVGPRs},
384 Ctx);
385}
386
387void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
388 if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv()))
389 return;
390
392 const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
393 MCSymbol *FnSym = TM.getSymbol(&F);
394 bool IsLocal = F.hasLocalLinkage();
395
396 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
397 int64_t Val;
398 if (Value->evaluateAsAbsolute(Val)) {
399 Res = Val;
400 return true;
401 }
402 return false;
403 };
404
405 const uint64_t MaxScratchPerWorkitem =
407 MCSymbol *ScratchSizeSymbol = RI.getSymbol(
408 FnSym->getName(), RIK::RIK_PrivateSegSize, OutContext, IsLocal);
409 uint64_t ScratchSize;
410 if (ScratchSizeSymbol->isVariable() &&
411 TryGetMCExprValue(ScratchSizeSymbol->getVariableValue(), ScratchSize) &&
412 ScratchSize > MaxScratchPerWorkitem) {
413 DiagnosticInfoStackSize DiagStackSize(F, ScratchSize, MaxScratchPerWorkitem,
414 DS_Error);
415 F.getContext().diagnose(DiagStackSize);
416 }
417
418 // Validate addressable scalar registers (i.e., prior to added implicit
419 // SGPRs).
420 MCSymbol *NumSGPRSymbol =
421 RI.getSymbol(FnSym->getName(), RIK::RIK_NumSGPR, OutContext, IsLocal);
423 !STM.hasSGPRInitBug()) {
424 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
425 uint64_t NumSgpr;
426 if (NumSGPRSymbol->isVariable() &&
427 TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
428 NumSgpr > MaxAddressableNumSGPRs) {
429 DiagnosticInfoResourceLimit Diag(F, "addressable scalar registers",
430 NumSgpr, MaxAddressableNumSGPRs,
432 F.getContext().diagnose(Diag);
433 return;
434 }
435 }
436
437 MCSymbol *VCCUsedSymbol =
438 RI.getSymbol(FnSym->getName(), RIK::RIK_UsesVCC, OutContext, IsLocal);
439 MCSymbol *FlatUsedSymbol = RI.getSymbol(
440 FnSym->getName(), RIK::RIK_UsesFlatScratch, OutContext, IsLocal);
441 uint64_t VCCUsed, FlatUsed, NumSgpr;
442
443 if (NumSGPRSymbol->isVariable() && VCCUsedSymbol->isVariable() &&
444 FlatUsedSymbol->isVariable() &&
445 TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
446 TryGetMCExprValue(VCCUsedSymbol->getVariableValue(), VCCUsed) &&
447 TryGetMCExprValue(FlatUsedSymbol->getVariableValue(), FlatUsed)) {
448
449 // Recomputes NumSgprs + implicit SGPRs but all symbols should now be
450 // resolvable.
451 NumSgpr += IsaInfo::getNumExtraSGPRs(
452 &STM, VCCUsed, FlatUsed,
453 getTargetStreamer()->getTargetID()->isXnackOnOrAny());
455 STM.hasSGPRInitBug()) {
456 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
457 if (NumSgpr > MaxAddressableNumSGPRs) {
458 DiagnosticInfoResourceLimit Diag(F, "scalar registers", NumSgpr,
459 MaxAddressableNumSGPRs, DS_Error,
461 F.getContext().diagnose(Diag);
462 return;
463 }
464 }
465
466 MCSymbol *NumVgprSymbol =
467 RI.getSymbol(FnSym->getName(), RIK::RIK_NumVGPR, OutContext, IsLocal);
468 MCSymbol *NumAgprSymbol =
469 RI.getSymbol(FnSym->getName(), RIK::RIK_NumAGPR, OutContext, IsLocal);
470 uint64_t NumVgpr, NumAgpr;
471
472 MachineModuleInfo &MMI =
474 MachineFunction *MF = MMI.getMachineFunction(F);
475 if (MF && NumVgprSymbol->isVariable() && NumAgprSymbol->isVariable() &&
476 TryGetMCExprValue(NumVgprSymbol->getVariableValue(), NumVgpr) &&
477 TryGetMCExprValue(NumAgprSymbol->getVariableValue(), NumAgpr)) {
478 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
479 unsigned MaxWaves = MFI.getMaxWavesPerEU();
480 uint64_t TotalNumVgpr =
481 getTotalNumVGPRs(STM.hasGFX90AInsts(), NumAgpr, NumVgpr);
482 uint64_t NumVGPRsForWavesPerEU =
483 std::max({TotalNumVgpr, (uint64_t)1,
484 (uint64_t)STM.getMinNumVGPRs(
485 MaxWaves, MFI.getDynamicVGPRBlockSize())});
486 uint64_t NumSGPRsForWavesPerEU = std::max(
487 {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
488 const MCExpr *OccupancyExpr = createOccupancy(
489 STM.getOccupancyWithWorkGroupSizes(*MF).second,
490 MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
491 MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext),
493 uint64_t Occupancy;
494
495 const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
496 F, "amdgpu-waves-per-eu", {0, 0}, true);
497
498 if (TryGetMCExprValue(OccupancyExpr, Occupancy) && Occupancy < MinWEU) {
499 DiagnosticInfoOptimizationFailure Diag(
500 F, F.getSubprogram(),
501 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
502 "'" +
503 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
504 ", final occupancy is " + Twine(Occupancy));
505 F.getContext().diagnose(Diag);
506 return;
507 }
508 }
509 }
510}
511
513 // Pad with s_code_end to help tools and guard against instruction prefetch
514 // causing stale data in caches. Arguably this should be done by the linker,
515 // which is why this isn't done for Mesa.
516 // Don't do it if there is no code.
517 const MCSubtargetInfo &STI = *getGlobalSTI();
518 if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
522 if (TextSect->hasInstructions()) {
523 OutStreamer->switchSection(TextSect);
525 }
526 }
527
528 // Assign expressions which can only be resolved when all other functions are
529 // known.
530 RI.finalize(OutContext);
531
532 // Switch section and emit all GPR maximums within the processed module.
533 OutStreamer->pushSection();
534 MCSectionELF *MaxGPRSection =
535 OutContext.getELFSection(".AMDGPU.gpr_maximums", ELF::SHT_PROGBITS, 0);
536 OutStreamer->switchSection(MaxGPRSection);
538 RI.getMaxAGPRSymbol(OutContext),
539 RI.getMaxSGPRSymbol(OutContext));
540 OutStreamer->popSection();
541
542 for (Function &F : M.functions())
543 validateMCResourceInfo(F);
544
545 RI.reset();
546
548}
549
550SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
552 raw_svector_ostream OSS(Str);
553 auto &Streamer = getTargetStreamer()->getStreamer();
554 auto &Context = Streamer.getContext();
555 const MCExpr *New = foldAMDGPUMCExpr(Value, Context);
556 printAMDGPUMCExpr(New, OSS, MAI);
557 return Str;
558}
559
560// Print comments that apply to both callable functions and entry points.
561void AMDGPUAsmPrinter::emitCommonFunctionComments(
562 const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,
563 const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
564 const AMDGPUMachineFunction *MFI) {
565 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
566 OutStreamer->emitRawComment(" TotalNumSgprs: " + getMCExprStr(NumSGPR),
567 false);
568 OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false);
569 if (NumAGPR && TotalNumVGPR) {
570 OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false);
571 OutStreamer->emitRawComment(" TotalNumVgprs: " + getMCExprStr(TotalNumVGPR),
572 false);
573 }
574 OutStreamer->emitRawComment(" ScratchSize: " + getMCExprStr(ScratchSize),
575 false);
576 OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
577 false);
578}
579
580const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
581 const MachineFunction &MF) const {
582 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
583 MCContext &Ctx = MF.getContext();
584 uint16_t KernelCodeProperties = 0;
585 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
586
587 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
588 KernelCodeProperties |=
589 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
590 }
591 if (UserSGPRInfo.hasDispatchPtr()) {
592 KernelCodeProperties |=
593 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
594 }
595 if (UserSGPRInfo.hasQueuePtr()) {
596 KernelCodeProperties |=
597 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
598 }
599 if (UserSGPRInfo.hasKernargSegmentPtr()) {
600 KernelCodeProperties |=
601 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
602 }
603 if (UserSGPRInfo.hasDispatchID()) {
604 KernelCodeProperties |=
605 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
606 }
607 if (UserSGPRInfo.hasFlatScratchInit()) {
608 KernelCodeProperties |=
609 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
610 }
611 if (UserSGPRInfo.hasPrivateSegmentSize()) {
612 KernelCodeProperties |=
613 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
614 }
615 if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
616 KernelCodeProperties |=
617 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
618 }
619
620 // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
621 // un-evaluatable at this point so it cannot be conditionally checked here.
622 // Instead, we'll directly shift the possibly unknown MCExpr into its place
623 // and bitwise-or it into KernelCodeProperties.
624 const MCExpr *KernelCodePropExpr =
625 MCConstantExpr::create(KernelCodeProperties, Ctx);
626 const MCExpr *OrValue = MCConstantExpr::create(
627 amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx);
628 OrValue = MCBinaryExpr::createShl(CurrentProgramInfo.DynamicCallStack,
629 OrValue, Ctx);
630 KernelCodePropExpr = MCBinaryExpr::createOr(KernelCodePropExpr, OrValue, Ctx);
631
632 return KernelCodePropExpr;
633}
634
635MCKernelDescriptor
636AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
637 const SIProgramInfo &PI) const {
638 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
639 const Function &F = MF.getFunction();
640 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
641 MCContext &Ctx = MF.getContext();
642
643 MCKernelDescriptor KernelDescriptor;
644
645 KernelDescriptor.group_segment_fixed_size =
647 KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
648
649 Align MaxKernArgAlign;
650 KernelDescriptor.kernarg_size = MCConstantExpr::create(
651 STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);
652
653 KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx);
654 KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx);
655 KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
656
657 int64_t PGRM_Rsrc3 = 1;
658 bool EvaluatableRsrc3 =
659 CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGRM_Rsrc3);
660 (void)PGRM_Rsrc3;
661 (void)EvaluatableRsrc3;
663 STM.hasGFX90AInsts() || AMDGPU::isGFX1250(STM) || !EvaluatableRsrc3 ||
664 static_cast<uint64_t>(PGRM_Rsrc3) == 0);
665 KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3;
666
667 KernelDescriptor.kernarg_preload = MCConstantExpr::create(
668 AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
669 Ctx);
670
671 return KernelDescriptor;
672}
673
675 // Init target streamer lazily on the first function so that previous passes
676 // can set metadata.
678 initTargetStreamer(*MF.getFunction().getParent());
679
680 ResourceUsage =
682 CurrentProgramInfo.reset(MF);
683
684 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
685 MCContext &Ctx = MF.getContext();
686
687 // The starting address of all shader programs must be 256 bytes aligned.
688 // Regular functions just need the basic required instruction alignment.
689 MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
690
692
693 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
695 bool IsLocal = MF.getFunction().hasLocalLinkage();
696 // FIXME: This should be an explicit check for Mesa.
697 if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
698 MCSectionELF *ConfigSection =
699 Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
700 OutStreamer->switchSection(ConfigSection);
701 }
702
703 RI.gatherResourceInfo(MF, *ResourceUsage, OutContext);
704
705 if (MFI->isModuleEntryFunction()) {
706 getSIProgramInfo(CurrentProgramInfo, MF);
707 }
708
709 if (STM.isAmdPalOS()) {
710 if (MFI->isEntryFunction())
711 EmitPALMetadata(MF, CurrentProgramInfo);
712 else if (MFI->isModuleEntryFunction())
713 emitPALFunctionMetadata(MF);
714 } else if (!STM.isAmdHsaOS()) {
715 EmitProgramInfoSI(MF, CurrentProgramInfo);
716 }
717
718 DumpCodeInstEmitter = nullptr;
719 if (STM.dumpCode()) {
720 // For -dumpcode, get the assembler out of the streamer. This only works
721 // with -filetype=obj.
722 MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
723 if (Assembler)
724 DumpCodeInstEmitter = Assembler->getEmitterPtr();
725 }
726
727 DisasmLines.clear();
728 HexLines.clear();
730
732
733 emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
734 STM.hasMAIInsts());
735
736 {
739 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext,
740 IsLocal),
741 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR, OutContext,
742 IsLocal),
743 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumSGPR, OutContext,
744 IsLocal),
745 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumNamedBarrier,
746 OutContext, IsLocal),
747 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
748 OutContext, IsLocal),
749 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesVCC, OutContext,
750 IsLocal),
751 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesFlatScratch,
752 OutContext, IsLocal),
753 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasDynSizedStack,
754 OutContext, IsLocal),
755 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasRecursion, OutContext,
756 IsLocal),
757 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasIndirectCall,
758 OutContext, IsLocal));
759 }
760
761 // Emit _dvgpr$ symbol when appropriate.
762 emitDVgprSymbol(MF);
763
764 if (isVerbose()) {
765 MCSectionELF *CommentSection =
766 Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
767 OutStreamer->switchSection(CommentSection);
768
769 if (!MFI->isEntryFunction()) {
771 OutStreamer->emitRawComment(" Function info:", false);
772
773 emitCommonFunctionComments(
774 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext,
775 IsLocal)
776 ->getVariableValue(),
777 STM.hasMAIInsts()
778 ? RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR,
779 OutContext, IsLocal)
780 ->getVariableValue()
781 : nullptr,
782 RI.createTotalNumVGPRs(MF, Ctx),
783 RI.createTotalNumSGPRs(
784 MF,
785 MF.getSubtarget<GCNSubtarget>().getTargetID().isXnackOnOrAny(),
786 Ctx),
787 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
788 OutContext, IsLocal)
789 ->getVariableValue(),
790 CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
791 return false;
792 }
793
794 OutStreamer->emitRawComment(" Kernel info:", false);
795 emitCommonFunctionComments(
796 CurrentProgramInfo.NumArchVGPR,
797 STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
798 CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
799 CurrentProgramInfo.ScratchSize,
800 CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
801
802 OutStreamer->emitRawComment(
803 " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
804 OutStreamer->emitRawComment(
805 " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
806 OutStreamer->emitRawComment(
807 " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
808 " bytes/workgroup (compile time only)", false);
809
810 OutStreamer->emitRawComment(
811 " SGPRBlocks: " + getMCExprStr(CurrentProgramInfo.SGPRBlocks), false);
812
813 OutStreamer->emitRawComment(
814 " VGPRBlocks: " + getMCExprStr(CurrentProgramInfo.VGPRBlocks), false);
815
816 OutStreamer->emitRawComment(
817 " NumSGPRsForWavesPerEU: " +
818 getMCExprStr(CurrentProgramInfo.NumSGPRsForWavesPerEU),
819 false);
820 OutStreamer->emitRawComment(
821 " NumVGPRsForWavesPerEU: " +
822 getMCExprStr(CurrentProgramInfo.NumVGPRsForWavesPerEU),
823 false);
824
825 if (STM.hasGFX90AInsts()) {
826 const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd(
827 CurrentProgramInfo.AccumOffset, MCConstantExpr::create(1, Ctx), Ctx);
828 AdjustedAccum = MCBinaryExpr::createMul(
829 AdjustedAccum, MCConstantExpr::create(4, Ctx), Ctx);
830 OutStreamer->emitRawComment(
831 " AccumOffset: " + getMCExprStr(AdjustedAccum), false);
832 }
833
834 if (AMDGPU::isGFX1250(STM))
835 OutStreamer->emitRawComment(
836 " NamedBarCnt: " + getMCExprStr(CurrentProgramInfo.NamedBarCnt),
837 false);
838
839 OutStreamer->emitRawComment(
840 " Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false);
841
842 OutStreamer->emitRawComment(
843 " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
844
845 OutStreamer->emitRawComment(
846 " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
847 getMCExprStr(CurrentProgramInfo.ScratchEnable),
848 false);
849 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
850 Twine(CurrentProgramInfo.UserSGPR),
851 false);
852 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
853 Twine(CurrentProgramInfo.TrapHandlerEnable),
854 false);
855 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
856 Twine(CurrentProgramInfo.TGIdXEnable),
857 false);
858 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
859 Twine(CurrentProgramInfo.TGIdYEnable),
860 false);
861 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
862 Twine(CurrentProgramInfo.TGIdZEnable),
863 false);
864 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
865 Twine(CurrentProgramInfo.TIdIGCompCount),
866 false);
867
868 [[maybe_unused]] int64_t PGMRSrc3;
870 STM.hasGFX90AInsts() || AMDGPU::isGFX1250(STM) ||
871 (CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGMRSrc3) &&
872 static_cast<uint64_t>(PGMRSrc3) == 0));
873 if (STM.hasGFX90AInsts()) {
874 OutStreamer->emitRawComment(
875 " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
876 getMCExprStr(MCKernelDescriptor::bits_get(
877 CurrentProgramInfo.ComputePGMRSrc3,
878 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
879 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
880 false);
881 OutStreamer->emitRawComment(
882 " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
883 getMCExprStr(MCKernelDescriptor::bits_get(
884 CurrentProgramInfo.ComputePGMRSrc3,
885 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
886 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
887 false);
888 }
889 }
890
891 if (DumpCodeInstEmitter) {
892
893 OutStreamer->switchSection(
894 Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));
895
896 for (size_t i = 0; i < DisasmLines.size(); ++i) {
897 std::string Comment = "\n";
898 if (!HexLines[i].empty()) {
899 Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
900 Comment += " ; " + HexLines[i] + "\n";
901 }
902
903 OutStreamer->emitBytes(StringRef(DisasmLines[i]));
904 OutStreamer->emitBytes(StringRef(Comment));
905 }
906 }
907
908 return false;
909}
910
911// When appropriate, add a _dvgpr$ symbol, with the value of the function
912// symbol, plus an offset encoding one less than the number of VGPR blocks used
913// by the function in bits 5..3 of the symbol value. A "VGPR block" can be
914// either 16 VGPRs (for a max of 128), or 32 VGPRs (for a max of 256). This is
915// used by a front-end to have functions that are chained rather than called,
916// and a dispatcher that dynamically resizes the VGPR count before dispatching
917// to a function.
918void AMDGPUAsmPrinter::emitDVgprSymbol(MachineFunction &MF) {
920 if (MFI.isDynamicVGPREnabled() &&
922 MCContext &Ctx = MF.getContext();
923 unsigned BlockSize = MFI.getDynamicVGPRBlockSize();
924 MCValue NumVGPRs;
925 if (!CurrentProgramInfo.NumVGPRsForWavesPerEU->evaluateAsRelocatable(
926 NumVGPRs, nullptr) ||
927 !NumVGPRs.isAbsolute()) {
928 llvm_unreachable("unable to resolve NumVGPRs for _dvgpr$ symbol");
929 }
930 // Calculate number of VGPR blocks.
931 // Treat 0 VGPRs as 1 VGPR to avoid underflowing.
932 unsigned NumBlocks =
933 divideCeil(std::max(unsigned(NumVGPRs.getConstant()), 1U), BlockSize);
934
935 if (NumBlocks > 8) {
937 "too many DVGPR blocks for _dvgpr$ symbol for '" +
938 Twine(CurrentFnSym->getName()) + "'");
939 return;
940 }
941 unsigned EncodedNumBlocks = (NumBlocks - 1) << 3;
942 // Add to function symbol to create _dvgpr$ symbol.
943 const MCExpr *DVgprFuncVal = MCBinaryExpr::createAdd(
945 MCConstantExpr::create(EncodedNumBlocks, Ctx), Ctx);
946 MCSymbol *DVgprFuncSym =
947 Ctx.getOrCreateSymbol(Twine("_dvgpr$") + CurrentFnSym->getName());
948 OutStreamer->emitAssignment(DVgprFuncSym, DVgprFuncVal);
949 emitVisibility(DVgprFuncSym, MF.getFunction().getVisibility());
950 emitLinkage(&MF.getFunction(), DVgprFuncSym);
951 }
952}
953
// Settles the target streamer's xnack / sramecc target-ID settings for
// module M.
//
// For a non-empty module, the functions are walked in order. For each
// supported feature whose streamer setting is still 'Any', the setting is
// copied from that function's subtarget. The walk stops as soon as every
// supported feature is either On or Off.
//
// NOTE(review): original source line 958 (the head of the first call, which
// receives getGlobalSTI()->getFeatureString()) is missing from this listing.
954// TODO: Fold this into emitFunctionBodyStart.
955void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
956 // In the beginning all features are either 'Any' or 'NotSupported',
957 // depending on global target features. This will cover empty modules.
959 getGlobalSTI()->getFeatureString());
960
961 // If module is empty, we are done.
962 if (M.empty())
963 return;
964
965 // If module is not empty, need to find first 'Off' or 'On' feature
966 // setting per feature from functions in module.
967 for (auto &F : M) {
968 auto &TSTargetID = getTargetStreamer()->getTargetID();
// Nothing left to resolve: each feature is either unsupported or already
// pinned to On/Off.
969 if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
970 (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
971 break;
972
973 const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
974 const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
// Only an 'Any' setting is overwritten, so the first function that pins a
// feature wins.
975 if (TSTargetID->isXnackSupported())
976 if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
977 TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
978 if (TSTargetID->isSramEccSupported())
979 if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
980 TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
981 }
982}
983
984// AccumOffset computed for the MCExpr equivalent of:
985// alignTo(std::max(1, NumVGPR), 4) / 4 - 1;
986static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) {
987 const MCExpr *ConstFour = MCConstantExpr::create(4, Ctx);
988 const MCExpr *ConstOne = MCConstantExpr::create(1, Ctx);
989
990 // Can't be lower than 1 for subsequent alignTo.
991 const MCExpr *MaximumTaken =
992 AMDGPUMCExpr::createMax({ConstOne, NumVGPR}, Ctx);
993
994 // Practically, it's computing divideCeil(MaximumTaken, 4).
995 const MCExpr *DivCeil = MCBinaryExpr::createDiv(
996 AMDGPUMCExpr::createAlignTo(MaximumTaken, ConstFour, Ctx), ConstFour,
997 Ctx);
998
999 return MCBinaryExpr::createSub(DivCeil, ConstOne, Ctx);
1000}
1001
// Fills ProgInfo with the program resource description for MF.
//
// Register counts, scratch size, VCC / flat-scratch usage and the dynamic
// call stack flag are stored as MCExprs built from per-function resource-info
// symbols (see GetSymRefExpr), so they need not be constants yet. Values read
// from the subtarget or from SIMachineFunctionInfo (float mode, LDS size,
// spill counts, work-group / work-item ID enables, ...) are stored directly.
// Resource-limit and occupancy problems are reported through the function's
// LLVMContext diagnostics.
//
// NOTE(review): this listing dropped a number of original source lines
// (1028, 1031, 1059, 1086, 1094, 1097, 1114, 1125, 1134, 1136, 1143, 1151,
// 1169, 1173, 1176, 1226, 1251-1252, 1276, 1300). Several assignees and
// condition heads are therefore not visible here; confirm against the real
// file before drawing conclusions about those statements.
1002void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
1003 const MachineFunction &MF) {
1004 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1005 bool IsLocal = MF.getFunction().hasLocalLinkage();
1006 MCContext &Ctx = MF.getContext();
1007
// Shorthand for a constant expression in Ctx.
1008 auto CreateExpr = [&Ctx](int64_t Value) {
1009 return MCConstantExpr::create(Value, Ctx);
1010 };
1011
// Returns true and writes Res only when Value already folds to a constant;
// Res is left untouched otherwise.
1012 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
1013 int64_t Val;
1014 if (Value->evaluateAsAbsolute(Val)) {
1015 Res = Val;
1016 return true;
1017 }
1018 return false;
1019 };
1020
// Reference to the current function's resource-info symbol of kind RIK.
1021 auto GetSymRefExpr =
1022 [&](MCResourceInfo::ResourceInfoKind RIK) -> const MCExpr * {
1023 MCSymbol *Sym =
1024 RI.getSymbol(CurrentFnSym->getName(), RIK, OutContext, IsLocal);
1025 return MCSymbolRefExpr::create(Sym, Ctx);
1026 };
1027
1029 ProgInfo.NumArchVGPR = GetSymRefExpr(RIK::RIK_NumVGPR);
1030 ProgInfo.NumAccVGPR = GetSymRefExpr(RIK::RIK_NumAGPR);
1032 ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
1033
1034 ProgInfo.AccumOffset = computeAccumOffset(ProgInfo.NumArchVGPR, Ctx);
1035 ProgInfo.TgSplit = STM.isTgSplitEnabled();
1036 ProgInfo.NumSGPR = GetSymRefExpr(RIK::RIK_NumSGPR);
1037 ProgInfo.ScratchSize = GetSymRefExpr(RIK::RIK_PrivateSegSize);
1038 ProgInfo.VCCUsed = GetSymRefExpr(RIK::RIK_UsesVCC);
1039 ProgInfo.FlatUsed = GetSymRefExpr(RIK::RIK_UsesFlatScratch);
// The call stack counts as dynamic if the function has a dynamically sized
// stack or recursion.
1040 ProgInfo.DynamicCallStack =
1041 MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack),
1042 GetSymRefExpr(RIK::RIK_HasRecursion), Ctx);
1043
// Named barrier count is expressed in blocks of 4: alignTo(N, 4) / 4.
1044 const MCExpr *BarBlkConst = MCConstantExpr::create(4, Ctx);
1045 const MCExpr *AlignToBlk = AMDGPUMCExpr::createAlignTo(
1046 GetSymRefExpr(RIK::RIK_NumNamedBarrier), BarBlkConst, Ctx);
1047 ProgInfo.NamedBarCnt = MCBinaryExpr::createDiv(AlignToBlk, BarBlkConst, Ctx);
1048
1049 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1050
1051 // The calculations related to SGPR/VGPR blocks are
1052 // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
1053 // unified.
1054 const MCExpr *ExtraSGPRs = AMDGPUMCExpr::createExtraSGPRs(
1055 ProgInfo.VCCUsed, ProgInfo.FlatUsed,
1056 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);
1057
1058 // Check the addressable register limit before we add ExtraSGPRs.
1060 !STM.hasSGPRInitBug()) {
1061 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
1062 uint64_t NumSgpr;
// The limit can only be checked when the SGPR count is already a constant.
1063 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
1064 NumSgpr > MaxAddressableNumSGPRs) {
1065 // This can happen due to a compiler bug or when using inline asm.
1066 LLVMContext &Ctx = MF.getFunction().getContext();
1067 DiagnosticInfoResourceLimit Diag(
1068 MF.getFunction(), "addressable scalar registers", NumSgpr,
1069 MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
1070 Ctx.diagnose(Diag);
1071 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1);
1072 }
1073 }
1074
1075 // Account for extra SGPRs and VGPRs reserved for debugger use.
1076 ProgInfo.NumSGPR = MCBinaryExpr::createAdd(ProgInfo.NumSGPR, ExtraSGPRs, Ctx);
1077
1078 const Function &F = MF.getFunction();
1079
1080 // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
1081 // dispatch registers as function args.
1082 unsigned WaveDispatchNumSGPR = MFI->getNumWaveDispatchSGPRs(),
1083 WaveDispatchNumVGPR = MFI->getNumWaveDispatchVGPRs();
1084
1085 if (WaveDispatchNumSGPR) {
1087 {ProgInfo.NumSGPR,
1088 MCBinaryExpr::createAdd(CreateExpr(WaveDispatchNumSGPR), ExtraSGPRs,
1089 Ctx)},
1090 Ctx);
1091 }
1092
1093 if (WaveDispatchNumVGPR) {
1095 {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
1096
1098 ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
1099 }
1100
1101 // Adjust number of registers used to meet default/requested minimum/maximum
1102 // number of waves per execution unit request.
1103 unsigned MaxWaves = MFI->getMaxWavesPerEU();
1104 ProgInfo.NumSGPRsForWavesPerEU =
1105 AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, CreateExpr(1ul),
1106 CreateExpr(STM.getMinNumSGPRs(MaxWaves))},
1107 Ctx);
1108 ProgInfo.NumVGPRsForWavesPerEU =
1109 AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul),
1110 CreateExpr(STM.getMinNumVGPRs(
1111 MaxWaves, MFI->getDynamicVGPRBlockSize()))},
1112 Ctx);
1113
1115 STM.hasSGPRInitBug()) {
1116 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
1117 uint64_t NumSgpr;
1118 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
1119 NumSgpr > MaxAddressableNumSGPRs) {
1120 // This can happen due to a compiler bug or when using inline asm to use
1121 // the registers which are usually reserved for vcc etc.
1122 LLVMContext &Ctx = MF.getFunction().getContext();
1123 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
1124 NumSgpr, MaxAddressableNumSGPRs,
1126 Ctx.diagnose(Diag);
1127 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs);
1128 ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs);
1129 }
1130 }
1131
1132 if (STM.hasSGPRInitBug()) {
1133 ProgInfo.NumSGPR =
1135 ProgInfo.NumSGPRsForWavesPerEU =
1137 }
1138
1139 if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
1140 LLVMContext &Ctx = MF.getFunction().getContext();
1141 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
1142 MFI->getNumUserSGPRs(),
1144 Ctx.diagnose(Diag);
1145 }
1146
1147 if (MFI->getLDSSize() > STM.getAddressableLocalMemorySize()) {
1148 LLVMContext &Ctx = MF.getFunction().getContext();
1149 DiagnosticInfoResourceLimit Diag(
1150 MF.getFunction(), "local memory", MFI->getLDSSize(),
1152 Ctx.diagnose(Diag);
1153 }
1154 // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
1155 // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
1156 auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,
1157 unsigned Granule) {
1158 const MCExpr *OneConst = CreateExpr(1ul);
1159 const MCExpr *GranuleConst = CreateExpr(Granule);
1160 const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax({NumGPR, OneConst}, Ctx);
1161 const MCExpr *AlignToGPR =
1162 AMDGPUMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
1163 const MCExpr *DivGPR =
1164 MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);
1165 const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
1166 return SubGPR;
1167 };
1168 // GFX10+ will always allocate 128 SGPRs and this field must be 0
1170 ProgInfo.SGPRBlocks = CreateExpr(0ul);
1171 } else {
1172 ProgInfo.SGPRBlocks = GetNumGPRBlocks(
1174 }
1175 ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,
1177
1178 const SIModeRegisterDefaults Mode = MFI->getMode();
1179
1180 // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
1181 // register.
1182 ProgInfo.FloatMode = getFPMode(Mode);
1183
1184 ProgInfo.IEEEMode = Mode.IEEE;
1185
1186 // Make clamp modifier on NaN input returns 0.
1187 ProgInfo.DX10Clamp = Mode.DX10Clamp;
1188
// Pick the shift used below to turn the LDS size into a block count, based
// on the subtarget's LDS dword granularity.
1189 unsigned LDSAlignShift = 8;
1190 switch (getLdsDwGranularity(STM)) {
1191 case 512:
1192 case 320:
1193 LDSAlignShift = 11;
1194 break;
1195 case 128:
1196 LDSAlignShift = 9;
1197 break;
1198 case 64:
1199 LDSAlignShift = 8;
1200 break;
1201 default:
1202 llvm_unreachable("invald LDS block size");
1203 }
1204
1205 ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
1206 ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
1207
1208 ProgInfo.LDSSize = MFI->getLDSSize();
// LDS size rounded up to whole blocks of (1 << LDSAlignShift).
1209 ProgInfo.LDSBlocks =
1210 alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
1211
1212 // The MCExpr equivalent of divideCeil.
1213 auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {
1214 const MCExpr *Ceil =
1215 AMDGPUMCExpr::createAlignTo(Numerator, Denominator, Ctx);
1216 return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx);
1217 };
1218
1219 // Scratch is allocated in 64-dword or 256-dword blocks.
1220 unsigned ScratchAlignShift =
1221 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
1222 // We need to program the hardware with the amount of scratch memory that
1223 // is used by the entire wave. ProgInfo.ScratchSize is the amount of
1224 // scratch memory used per thread.
1225 ProgInfo.ScratchBlocks = DivideCeil(
1227 CreateExpr(STM.getWavefrontSize()), Ctx),
1228 CreateExpr(1ULL << ScratchAlignShift));
1229
1230 if (STM.supportsWGP()) {
1231 ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
1232 }
1233
1234 if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
1235 ProgInfo.MemOrdered = 1;
1236 ProgInfo.FwdProgress = 1;
1237 }
1238
1239 // 0 = X, 1 = XY, 2 = XYZ
1240 unsigned TIDIGCompCnt = 0;
1241 if (MFI->hasWorkItemIDZ())
1242 TIDIGCompCnt = 2;
1243 else if (MFI->hasWorkItemIDY())
1244 TIDIGCompCnt = 1;
1245
1246 // The private segment wave byte offset is the last of the system SGPRs. We
1247 // initially assumed it was allocated, and may have used it. It shouldn't harm
1248 // anything to disable it if we know the stack isn't used here. We may still
1249 // have emitted code reading it to initialize scratch, but if that's unused
1250 // reading garbage should be OK.
1253 MCConstantExpr::create(0, Ctx), Ctx),
1254 ProgInfo.DynamicCallStack, Ctx);
1255
1256 ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
1257 // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
1258 ProgInfo.TrapHandlerEnable =
1259 STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled();
1260 ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
1261 ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
1262 ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
1263 ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
1264 ProgInfo.TIdIGCompCount = TIDIGCompCnt;
1265 ProgInfo.EXCPEnMSB = 0;
1266 // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
1267 ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
1268 ProgInfo.EXCPEnable = 0;
1269
1270 // return ((Dst & ~Mask) | (Value << Shift))
1271 auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
1272 uint32_t Shift) {
1273 const auto *Shft = MCConstantExpr::create(Shift, Ctx);
1274 const auto *Msk = MCConstantExpr::create(Mask, Ctx);
1275 Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
1277 Ctx);
1278 return Dst;
1279 };
1280
// The remaining SetBits calls pack subtarget-specific fields into
// ComputePGMRSrc3.
1281 if (STM.hasGFX90AInsts()) {
1282 ProgInfo.ComputePGMRSrc3 =
1283 SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset,
1284 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
1285 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
1286 ProgInfo.ComputePGMRSrc3 =
1287 SetBits(ProgInfo.ComputePGMRSrc3, CreateExpr(ProgInfo.TgSplit),
1288 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
1289 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
1290 }
1291
1292 if (AMDGPU::isGFX1250(STM))
1293 ProgInfo.ComputePGMRSrc3 =
1294 SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt,
1295 amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT,
1296 amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT);
1297
1298 ProgInfo.Occupancy = createOccupancy(
1299 STM.computeOccupancy(F, ProgInfo.LDSSize).second,
1301 MFI->getDynamicVGPRBlockSize(), STM, Ctx);
1302
// Diagnose when the occupancy is already a known constant and falls below
// the minimum requested through "amdgpu-waves-per-eu".
1303 const auto [MinWEU, MaxWEU] =
1304 AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
1305 uint64_t Occupancy;
1306 if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {
1307 DiagnosticInfoOptimizationFailure Diag(
1308 F, F.getSubprogram(),
1309 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
1310 "'" +
1311 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
1312 ", final occupancy is " + Twine(Occupancy));
1313 F.getContext().diagnose(Diag);
1314 }
1315
// GFX11+: derive the instruction prefetch size from a lower bound on the
// code size, counted in 128-byte units and saturated to the field width.
1316 if (isGFX11Plus(STM)) {
1317 uint32_t CodeSizeInBytes = (uint32_t)std::min(
1318 ProgInfo.getFunctionCodeSize(MF, true /* IsLowerBound */),
1319 (uint64_t)std::numeric_limits<uint32_t>::max());
1320 uint32_t CodeSizeInLines = divideCeil(CodeSizeInBytes, 128);
1321 uint32_t Field, Shift, Width;
1322 if (isGFX11(STM)) {
1323 Field = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
1324 Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
1325 Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
1326 } else {
1327 Field = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
1328 Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
1329 Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
1330 }
1331 uint64_t InstPrefSize = std::min(CodeSizeInLines, (1u << Width) - 1);
1332 ProgInfo.ComputePGMRSrc3 = SetBits(ProgInfo.ComputePGMRSrc3,
1333 CreateExpr(InstPrefSize), Field, Shift);
1334 }
1335}
1336
1349
// Writes the program info for MF to the output streamer as a sequence of
// 32-bit values.
//
// Expression-valued fields go through EmitResolvedOrExpr, which emits a plain
// integer when the expression already folds to a constant and the expression
// itself otherwise. The scratch block count is masked with a
// generation-dependent mask and shifted left by 12 in both the compute and
// non-compute paths. Pixel shaders get extra LDS / PS input values, and the
// spilled SGPR / VGPR counts are always written last.
//
// NOTE(review): original source lines 1361, 1374, 1379, 1382, 1410, 1430,
// 1435 and 1437 are missing from this listing (the head of SetBits' return
// expression and several emit calls), so the full emitted sequence cannot be
// read off from here.
1350void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
1351 const SIProgramInfo &CurrentProgramInfo) {
1352 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1353 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1354 unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
1355 MCContext &Ctx = MF.getContext();
1356
1357 // (((Value) & Mask) << Shift)
1358 auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {
1359 const MCExpr *msk = MCConstantExpr::create(Mask, Ctx);
1360 const MCExpr *shft = MCConstantExpr::create(Shift, Ctx);
1362 shft, Ctx);
1363 };
1364
// Emit Value as a Size-byte integer if it is already a constant, otherwise
// emit the unresolved expression.
1365 auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) {
1366 int64_t Val;
1367 if (Value->evaluateAsAbsolute(Val))
1368 OutStreamer->emitIntValue(static_cast<uint64_t>(Val), Size);
1369 else
1370 OutStreamer->emitValue(Value, Size);
1371 };
1372
1373 if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
1375
1376 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx),
1377 /*Size=*/4);
1378
1380 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4);
1381
1383
1384 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1385 // appropriate generation.
1386 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1387 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1388 /*Mask=*/0x3FFFF, /*Shift=*/12),
1389 /*Size=*/4);
1390 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1391 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1392 /*Mask=*/0x7FFF, /*Shift=*/12),
1393 /*Size=*/4);
1394 } else {
1395 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1396 /*Mask=*/0x1FFF, /*Shift=*/12),
1397 /*Size=*/4);
1398 }
1399
1400 // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
1401 // 0" comment but I don't see a corresponding field in the register spec.
1402 } else {
1403 OutStreamer->emitInt32(RsrcReg);
1404
// VGPR block count in the low 6 bits, SGPR block count in 4 bits starting
// at bit 6.
1405 const MCExpr *GPRBlocks = MCBinaryExpr::createOr(
1406 SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0),
1407 SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6),
1408 MF.getContext());
1409 EmitResolvedOrExpr(GPRBlocks, /*Size=*/4);
1411
1412 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1413 // appropriate generation.
1414 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1415 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1416 /*Mask=*/0x3FFFF, /*Shift=*/12),
1417 /*Size=*/4);
1418 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1419 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1420 /*Mask=*/0x7FFF, /*Shift=*/12),
1421 /*Size=*/4);
1422 } else {
1423 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1424 /*Mask=*/0x1FFF, /*Shift=*/12),
1425 /*Size=*/4);
1426 }
1427 }
1428
1429 if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
// On GFX11+ the extra LDS size is half the LDS block count, rounded up.
1431 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1432 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1433 : CurrentProgramInfo.LDSBlocks;
1434 OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1436 OutStreamer->emitInt32(MFI->getPSInputEnable());
1438 OutStreamer->emitInt32(MFI->getPSInputAddr());
1439 }
1440
1441 OutStreamer->emitInt32(R_SPILLED_SGPRS);
1442 OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
1443 OutStreamer->emitInt32(R_SPILLED_VGPRS);
1444 OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
1445}
1446
1447// Helper function to add common PAL Metadata 3.0+
1449 const SIProgramInfo &CurrentProgramInfo,
1450 CallingConv::ID CC, const GCNSubtarget &ST,
1451 unsigned DynamicVGPRBlockSize) {
1452 if (ST.hasIEEEMode())
1453 MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
1454
1455 MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
1456 MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
1457 MD->setHwStage(CC, ".forward_progress", (bool)CurrentProgramInfo.FwdProgress);
1458
1459 if (AMDGPU::isCompute(CC)) {
1460 MD->setHwStage(CC, ".trap_present",
1461 (bool)CurrentProgramInfo.TrapHandlerEnable);
1462 MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
1463
1464 if (DynamicVGPRBlockSize != 0)
1465 MD->setComputeRegisters(".dynamic_vgpr_en", true);
1466 }
1467
1469 CC, ".lds_size",
1470 (unsigned)(CurrentProgramInfo.LdsSize * getLdsDwGranularity(ST) *
1471 sizeof(uint32_t)));
1472}
1473
// NOTE(review): original source lines 1493, 1495, 1519 and 1536 are missing
// from this listing (the rest of the dynamic-VGPR condition and its value
// argument, the tail of the EmitPALMetadataCommon call, and the value passed
// to the pixel-shader setRsrc2 call).
//
// Overview of what the visible code does: sets the entry point and the used
// VGPR / SGPR counts, then either the RSRC1 / RSRC2 registers (PAL major
// version < 3) or the named hardware-stage fields (version 3+), then the
// scratch size aligned to 16, and finally the pixel-shader specific values.
1474// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
1475// is AMDPAL. It stores each compute/SPI register setting and other PAL
1476// metadata items into the PALMD::Metadata, combining with any provided by the
1477// frontend as LLVM metadata. Once all functions are written, the PAL metadata
1478// is then written as a single block in the .note section.
1479void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
1480 const SIProgramInfo &CurrentProgramInfo) {
1481 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1482 auto CC = MF.getFunction().getCallingConv();
1483 auto *MD = getTargetStreamer()->getPALMetadata();
1484 auto &Ctx = MF.getContext();
1485
1486 MD->setEntryPoint(CC, MF.getFunction().getName());
1487 MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx);
1488
1489 // For targets that support dynamic VGPRs, set the number of saved dynamic
1490 // VGPRs (if any) in the PAL metadata.
1491 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1492 if (MFI->isDynamicVGPREnabled() &&
1494 MD->setHwStage(CC, ".dynamic_vgpr_saved_count",
1496
1497 // Only set AGPRs for supported devices
1498 if (STM.hasMAIInsts()) {
1499 MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
1500 }
1501
1502 MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx);
1503 if (MD->getPALMajorVersion() < 3) {
1504 MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM, Ctx), Ctx);
1505 if (AMDGPU::isCompute(CC)) {
1506 MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
1507 } else {
// Non-compute: RSRC2 only carries the scratch-enable bit, which is set
// when the scratch block count is greater than zero.
1508 const MCExpr *HasScratchBlocks =
1509 MCBinaryExpr::createGT(CurrentProgramInfo.ScratchBlocks,
1510 MCConstantExpr::create(0, Ctx), Ctx);
1511 auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN);
1512 MD->setRsrc2(CC, maskShiftSet(HasScratchBlocks, Mask, Shift, Ctx), Ctx);
1513 }
1514 } else {
1515 MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
1516 MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,
1517 CurrentProgramInfo.ScratchEnable);
1518 EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM,
1520 }
1521
1522 // ScratchSize is in bytes, 16 aligned.
1523 MD->setScratchSize(
1524 CC,
1525 AMDGPUMCExpr::createAlignTo(CurrentProgramInfo.ScratchSize,
1526 MCConstantExpr::create(16, Ctx), Ctx),
1527 Ctx);
1528
1529 if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
// On GFX11+ the extra LDS size is half the LDS block count, rounded up.
1530 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1531 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1532 : CurrentProgramInfo.LDSBlocks;
1533 if (MD->getPALMajorVersion() < 3) {
1534 MD->setRsrc2(
1535 CC,
1537 Ctx);
1538 MD->setSpiPsInputEna(MFI->getPSInputEnable());
1539 MD->setSpiPsInputAddr(MFI->getPSInputAddr());
1540 } else {
1541 // Graphics registers
1542 const unsigned ExtraLdsDwGranularity =
1543 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
1544 MD->setGraphicsRegisters(
1545 ".ps_extra_lds_size",
1546 (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));
1547
1548 // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
// The table is indexed by bit position: entry Idx names bit Idx of both
// PSInputEna and PSInputAddr.
1549 static StringLiteral const PsInputFields[] = {
1550 ".persp_sample_ena", ".persp_center_ena",
1551 ".persp_centroid_ena", ".persp_pull_model_ena",
1552 ".linear_sample_ena", ".linear_center_ena",
1553 ".linear_centroid_ena", ".line_stipple_tex_ena",
1554 ".pos_x_float_ena", ".pos_y_float_ena",
1555 ".pos_z_float_ena", ".pos_w_float_ena",
1556 ".front_face_ena", ".ancillary_ena",
1557 ".sample_coverage_ena", ".pos_fixed_pt_ena"};
1558 unsigned PSInputEna = MFI->getPSInputEnable();
1559 unsigned PSInputAddr = MFI->getPSInputAddr();
1560 for (auto [Idx, Field] : enumerate(PsInputFields)) {
1561 MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
1562 (bool)((PSInputEna >> Idx) & 1));
1563 MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
1564 (bool)((PSInputAddr >> Idx) & 1));
1565 }
1566 }
1567 }
1568
1569 // For version 3 and above the wave front size is already set in the metadata
1570 if (MD->getPALMajorVersion() < 3 && STM.isWave32())
1571 MD->setWave32(MF.getFunction().getCallingConv());
1572}
1573
1574void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1575 auto *MD = getTargetStreamer()->getPALMetadata();
1576 const MachineFrameInfo &MFI = MF.getFrameInfo();
1577 StringRef FnName = MF.getFunction().getName();
1578 MD->setFunctionScratchSize(FnName, MFI.getStackSize());
1579 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1580 MCContext &Ctx = MF.getContext();
1581
1582 if (MD->getPALMajorVersion() < 3) {
1583 // Set compute registers
1584 MD->setRsrc1(
1586 CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST, Ctx), Ctx);
1587 MD->setRsrc2(CallingConv::AMDGPU_CS,
1588 CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
1589 } else {
1591 MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST,
1592 MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize());
1593 }
1594
1595 // Set optional info
1596 MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
1597 MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1598 MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
1599}
1600
1601// This is supposed to be log2(Size)
1603 switch (Size) {
1604 case 4:
1605 return AMD_ELEMENT_4_BYTES;
1606 case 8:
1607 return AMD_ELEMENT_8_BYTES;
1608 case 16:
1609 return AMD_ELEMENT_16_BYTES;
1610 default:
1611 llvm_unreachable("invalid private_element_size");
1612 }
1613}
1614
// Populates Out from CurrentProgramInfo, the subtarget and the function's
// SIMachineFunctionInfo. Only AMDGPU_KERNEL and SPIR_KERNEL calling
// conventions are expected (asserted).
//
// NOTE(review): this listing dropped many original source lines (1628, 1630,
// 1632, 1636, 1641, 1645, 1648, 1651, 1654, 1657, 1660, 1663). The assignees
// of the two PGM_RSRC expressions, the use of getElementByteSizeValue's
// result, and the bodies of every user-SGPR / XNACK 'if' below are not
// visible here; confirm them against the real file.
1615void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
1616 const SIProgramInfo &CurrentProgramInfo,
1617 const MachineFunction &MF) const {
1618 const Function &F = MF.getFunction();
1619 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1620 F.getCallingConv() == CallingConv::SPIR_KERNEL);
1621
1622 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1623 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1624 MCContext &Ctx = MF.getContext();
1625
1626 Out.initDefault(&STM, Ctx, /*InitMCExpr=*/false);
1627
1629 CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx);
1631 CurrentProgramInfo.getComputePGMRSrc2(Ctx);
1633
1634 Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack;
1635
1637 getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
1638
// One check per user SGPR kind reported by the function info.
1639 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
1640 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
1642 }
1643
1644 if (UserSGPRInfo.hasDispatchPtr())
1646
1647 if (UserSGPRInfo.hasQueuePtr())
1649
1650 if (UserSGPRInfo.hasKernargSegmentPtr())
1652
1653 if (UserSGPRInfo.hasDispatchID())
1655
1656 if (UserSGPRInfo.hasFlatScratchInit())
1658
1659 if (UserSGPRInfo.hasPrivateSegmentSize())
1661
1662 if (STM.isXNACKEnabled())
1664
// getKernArgSegmentSize also reports the maximum kernarg alignment through
// MaxKernArgAlign, which is used for kernarg_segment_alignment below.
1665 Align MaxKernArgAlign;
1666 Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
1667 Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1668 Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1669 Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1670 Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1671
1672 // kernarg_segment_alignment is specified as log of the alignment.
1673 // The minimum alignment is 16.
1674 // FIXME: The metadata treats the minimum as 4?
1675 Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
1676}
1677
// Prints operand OpNo of MI to O, honouring the optional modifier ExtraCode.
// Returns false when the operand was printed and true on failure.
//
// The generic AsmPrinter::PrintAsmOperand gets the first attempt. Beyond
// that, only the single-character modifier 'r' is accepted; any other
// modifier fails. Register and immediate operands are handled here, and every
// other operand kind fails.
//
// NOTE(review): the first line of the signature (original line 1678) and
// lines 1699 and 1705 are missing from this listing, so the function name is
// presumed from the base-class call, and the register-printing call and the
// condition selecting the decimal form are not visible. Confirm against the
// real file.
1679 const char *ExtraCode, raw_ostream &O) {
1680 // First try the generic code, which knows about modifiers like 'c' and 'n'.
1681 if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
1682 return false;
1683
1684 if (ExtraCode && ExtraCode[0]) {
1685 if (ExtraCode[1] != 0)
1686 return true; // Unknown modifier.
1687
1688 switch (ExtraCode[0]) {
1689 case 'r':
1690 break;
1691 default:
1692 return true;
1693 }
1694 }
1695
1696 // TODO: Should be able to support other operand types like globals.
1697 const MachineOperand &MO = MI->getOperand(OpNo);
1698 if (MO.isReg()) {
1700 *MF->getSubtarget().getRegisterInfo());
1701 return false;
1702 }
1703 if (MO.isImm()) {
1704 int64_t Val = MO.getImm();
// Otherwise print in hex using the narrowest of 16 / 32 / 64 bits that
// holds the value as unsigned.
1706 O << Val;
1707 } else if (isUInt<16>(Val)) {
1708 O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
1709 } else if (isUInt<32>(Val)) {
1710 O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
1711 } else {
1712 O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
1713 }
1714 return false;
1715 }
1716 return true;
1717}
1718
1726
// Emits the "kernel-resource-usage" analysis remarks for MF, one remark per
// resource value. Does nothing when there is no remark emitter (ORE) or when
// that remark is not enabled in the diagnostic handler.
//
// The AGPR remark is only emitted when hasMAIInsts is set, and the LDS size
// remark only when isModuleEntryFunction is set.
//
// NOTE(review): original source lines 1737, 1742 and 1756 are missing from
// this listing (the definition of Ctx, the condition guarding the second
// early return, and part of the remark constructor's argument list).
1727void AMDGPUAsmPrinter::emitResourceUsageRemarks(
1728 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
1729 bool isModuleEntryFunction, bool hasMAIInsts) {
1730 if (!ORE)
1731 return;
1732
1733 const char *Name = "kernel-resource-usage";
1734 const char *Indent = " ";
1735
1736 // If the remark is not specifically enabled, do not output to yaml
1738 if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(Name))
1739 return;
1740
1741 // Currently non-kernel functions have no resources to emit.
1743 return;
1744
// Emits a single remark of the form "<RemarkLabel>: <Argument>".
1745 auto EmitResourceUsageRemark = [&](StringRef RemarkName,
1746 StringRef RemarkLabel, auto Argument) {
1747 // Add an indent for every line besides the line with the kernel name. This
1748 // makes it easier to tell which resource usage go with which kernel since
1749 // the kernel name will always be displayed first.
1750 std::string LabelStr = RemarkLabel.str() + ": ";
1751 if (RemarkName != "FunctionName")
1752 LabelStr = Indent + LabelStr;
1753
1754 ORE->emit([&]() {
1755 return MachineOptimizationRemarkAnalysis(Name, RemarkName,
1757 &MF.front())
1758 << LabelStr << ore::NV(RemarkName, Argument);
1759 });
1760 };
1761
1762 // FIXME: Formatting here is pretty nasty because clang does not accept
1763 // newlines from diagnostics. This forces us to emit multiple diagnostic
1764 // remarks to simulate newlines. If and when clang does accept newlines, this
1765 // formatting should be aggregated into one remark with newlines to avoid
1766 // printing multiple diagnostic location and diag opts.
1767 EmitResourceUsageRemark("FunctionName", "Function Name",
1768 MF.getFunction().getName());
1769 EmitResourceUsageRemark("NumSGPR", "TotalSGPRs",
1770 getMCExprStr(CurrentProgramInfo.NumSGPR));
1771 EmitResourceUsageRemark("NumVGPR", "VGPRs",
1772 getMCExprStr(CurrentProgramInfo.NumArchVGPR));
1773 if (hasMAIInsts) {
1774 EmitResourceUsageRemark("NumAGPR", "AGPRs",
1775 getMCExprStr(CurrentProgramInfo.NumAccVGPR));
1776 }
1777 EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
1778 getMCExprStr(CurrentProgramInfo.ScratchSize));
// "True" is reported only when the expression folds to a non-zero constant;
// an expression that cannot be evaluated yet is reported as "False".
1779 int64_t DynStack;
1780 bool DynStackEvaluatable =
1781 CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(DynStack);
1782 StringRef DynamicStackStr =
1783 DynStackEvaluatable && DynStack ? "True" : "False";
1784 EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
1785 EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
1786 getMCExprStr(CurrentProgramInfo.Occupancy));
1787 EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
1788 CurrentProgramInfo.SGPRSpill);
1789 EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
1790 CurrentProgramInfo.VGPRSpill);
1791 if (isModuleEntryFunction)
1792 EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
1793 CurrentProgramInfo.LDSSize);
1794}
1795
// Definition of the static pass ID member, followed by the INITIALIZE_PASS
// registration of AMDGPUAsmPrinter with argument string "amdgpu-asm-printer",
// display name "AMDGPU Assembly Printer", and both the cfg and analysis flags
// set to false.
1796char AMDGPUAsmPrinter::ID = 0;
1797
1798INITIALIZE_PASS(AMDGPUAsmPrinter, "amdgpu-asm-printer",
1799 "AMDGPU Assembly Printer", false, false)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD, const SIProgramInfo &CurrentProgramInfo, CallingConv::ID CC, const GCNSubtarget &ST, unsigned DynamicVGPRBlockSize)
const AMDGPUMCExpr * createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs, const MCExpr *NumVGPRs, unsigned DynamicVGPRBlockSize, const GCNSubtarget &STM, MCContext &Ctx)
Mimics GCNSubtarget::computeOccupancy for MCExpr.
static unsigned getRsrcReg(CallingConv::ID CallConv)
LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter()
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size)
static uint32_t getFPMode(SIModeRegisterDefaults Mode)
static const MCExpr * computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx)
static AsmPrinter * createAMDGPUAsmPrinterPass(TargetMachine &tm, std::unique_ptr< MCStreamer > &&Streamer)
AMDGPU Assembly printer class.
AMDGPU HSA Metadata Streamer.
AMDHSA kernel descriptor MCExpr struct for use in MC layer.
MC infrastructure to propagate the function level resource usage info.
Analyzes how many registers and other resources are used by functions.
AMDHSA kernel descriptor definitions.
MC layer struct for AMDGPUMCKernelCodeT, provides MCExpr functionality where required.
amd_element_byte_size_t
The values used to define the number of bytes to use for the swizzle element size.
@ AMD_ELEMENT_8_BYTES
@ AMD_ELEMENT_16_BYTES
@ AMD_ELEMENT_4_BYTES
#define AMD_HSA_BITS_SET(dst, mask, val)
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID
@ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR
@ AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED
@ AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT
@ AMD_CODE_PROPERTY_IS_PTR64
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Analysis containing CSE Info
Definition CSEInfo.cpp:27
#define LLVM_ABI
Definition Compiler.h:213
#define LLVM_EXTERNAL_VISIBILITY
Definition Compiler.h:132
AMD GCN specific subclass of TargetSubtarget.
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition MD5.cpp:54
===- MachineOptimizationRemarkEmitter.h - Opt Diagnostics -*- C++ -*----===//
OptimizedStructLayoutField Field
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition PassSupport.h:56
R600 Assembly printer class.
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS
Definition SIDefines.h:1132
#define R_0286E8_SPI_TMPRING_SIZE
Definition SIDefines.h:1270
#define FP_ROUND_MODE_DP(x)
Definition SIDefines.h:1252
#define C_00B84C_SCRATCH_EN
Definition SIDefines.h:1168
#define FP_ROUND_ROUND_TO_NEAREST
Definition SIDefines.h:1244
#define R_0286D0_SPI_PS_INPUT_ADDR
Definition SIDefines.h:1203
#define R_00B860_COMPUTE_TMPRING_SIZE
Definition SIDefines.h:1265
#define R_00B428_SPI_SHADER_PGM_RSRC1_HS
Definition SIDefines.h:1155
#define R_00B328_SPI_SHADER_PGM_RSRC1_ES
Definition SIDefines.h:1154
#define R_00B528_SPI_SHADER_PGM_RSRC1_LS
Definition SIDefines.h:1163
#define R_0286CC_SPI_PS_INPUT_ENA
Definition SIDefines.h:1202
#define R_00B128_SPI_SHADER_PGM_RSRC1_VS
Definition SIDefines.h:1141
#define FP_DENORM_MODE_DP(x)
Definition SIDefines.h:1263
#define R_00B848_COMPUTE_PGM_RSRC1
Definition SIDefines.h:1205
#define R_SPILLED_SGPRS
Definition SIDefines.h:1284
#define FP_ROUND_MODE_SP(x)
Definition SIDefines.h:1251
#define FP_DENORM_MODE_SP(x)
Definition SIDefines.h:1262
#define R_00B228_SPI_SHADER_PGM_RSRC1_GS
Definition SIDefines.h:1146
#define R_SPILLED_VGPRS
Definition SIDefines.h:1285
#define S_00B02C_EXTRA_LDS_SIZE(x)
Definition SIDefines.h:1140
#define R_00B84C_COMPUTE_PGM_RSRC2
Definition SIDefines.h:1165
#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS
Definition SIDefines.h:1139
static const int BlockSize
Definition TarWriter.cpp:33
void emitFunctionEntryLabel() override
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
const MCSubtargetInfo * getGlobalSTI() const
void emitImplicitDef(const MachineInstr *MI) const override
Targets can override this to customize the output of IMPLICIT_DEF instructions in verbose mode.
std::vector< std::string > DisasmLines
void emitStartOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the start of their file.
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
std::vector< std::string > HexLines
void emitGlobalVariable(const GlobalVariable *GV) override
Emit the specified global variable to the .s file.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - This function should be overridden by passes that need analysis information to do t...
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) override
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
void emitFunctionBodyEnd() override
Targets can override this to emit stuff after the last basic block in the function.
bool doFinalization(Module &M) override
doFinalization - Virtual method overridden by subclasses to do any necessary clean up after all passes...
void emitEndOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the end of their file.
AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr< MCStreamer > Streamer)
bool doInitialization(Module &M) override
doInitialization - Virtual method overridden by subclasses to do any necessary initialization before ...
void emitFunctionBodyStart() override
Targets can override this to emit stuff before the first basic block in the function.
void emitBasicBlockStart(const MachineBasicBlock &MBB) override
Targets can override this to emit stuff at the start of a basic block.
AMDGPUTargetStreamer * getTargetStreamer() const
static void printRegOperand(MCRegister Reg, raw_ostream &O, const MCRegisterInfo &MRI)
AMDGPU target specific MCExpr operations.
static const AMDGPUMCExpr * createMax(ArrayRef< const MCExpr * > Args, MCContext &Ctx)
static const AMDGPUMCExpr * createTotalNumVGPR(const MCExpr *NumAGPR, const MCExpr *NumVGPR, MCContext &Ctx)
static const AMDGPUMCExpr * create(VariantKind Kind, ArrayRef< const MCExpr * > Args, MCContext &Ctx)
static const AMDGPUMCExpr * createExtraSGPRs(const MCExpr *VCCUsed, const MCExpr *FlatScrUsed, bool XNACKUsed, MCContext &Ctx)
Allow delayed MCExpr resolve of ExtraSGPRs (in case VCCUsed or FlatScrUsed are unresolvable but neede...
static const AMDGPUMCExpr * createAlignTo(const MCExpr *Value, const MCExpr *Align, MCContext &Ctx)
void setHwStage(unsigned CC, StringRef field, unsigned Val)
void updateHwStageMaximum(unsigned CC, StringRef field, unsigned Val)
void setComputeRegisters(StringRef field, unsigned Val)
std::pair< unsigned, unsigned > getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
unsigned getAddressableLocalMemorySize() const
Return the maximum number of bytes of LDS that can be allocated to a single workgroup.
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
unsigned getWavefrontSize() const
virtual void EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, const AMDGPU::MCKernelDescriptor &KernelDescriptor, const MCExpr *NextVGPR, const MCExpr *NextSGPR, const MCExpr *ReserveVCC, const MCExpr *ReserveFlatScr)
AMDGPUPALMetadata * getPALMetadata()
virtual void EmitDirectiveAMDHSACodeObjectVersion(unsigned COV)
void initializeTargetID(const MCSubtargetInfo &STI)
virtual void EmitMCResourceInfo(const MCSymbol *NumVGPR, const MCSymbol *NumAGPR, const MCSymbol *NumExplicitSGPR, const MCSymbol *NumNamedBarrier, const MCSymbol *PrivateSegmentSize, const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch, const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall)
virtual bool EmitCodeEnd(const MCSubtargetInfo &STI)
virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type)
virtual void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR, const MCSymbol *MaxSGPR)
virtual void EmitAMDKernelCodeT(AMDGPU::AMDGPUMCKernelCodeT &Header)
const std::optional< AMDGPU::IsaInfo::AMDGPUTargetID > & getTargetID() const
void setXnackSetting(TargetIDSetting NewXnackSetting)
Sets xnack setting to NewXnackSetting.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
This class is intended to be used as a driving class for all asm writers.
Definition AsmPrinter.h:96
const TargetLoweringObjectFile & getObjFileLowering() const
Return information about object file lowering.
MCSymbol * getSymbol(const GlobalValue *GV) const
virtual void emitGlobalVariable(const GlobalVariable *GV)
Emit the specified global variable to the .s file.
TargetMachine & TM
Target machine description.
Definition AsmPrinter.h:99
const MCAsmInfo * MAI
Target Asm Printer information.
Definition AsmPrinter.h:102
MachineFunction * MF
The current machine function.
Definition AsmPrinter.h:117
virtual void SetupMachineFunction(MachineFunction &MF)
This should be called when a new MachineFunction is being processed from runOnMachineFunction.
void emitFunctionBody()
This method emits the body and trailer for a function.
virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const
Return true if the basic block has exactly one predecessor and the control transfer mechanism between...
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
virtual void emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const
This emits linkage information about GVSym based on GV, if this is supported by the target.
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
unsigned getFunctionNumber() const
Return a unique ID for the current function.
MachineOptimizationRemarkEmitter * ORE
Optimization remark emitter.
Definition AsmPrinter.h:129
AsmPrinter(TargetMachine &TM, std::unique_ptr< MCStreamer > Streamer, char &ID=AsmPrinter::ID)
MCSymbol * CurrentFnSym
The symbol for the current function.
Definition AsmPrinter.h:136
MachineModuleInfo * MMI
This is a pointer to the current MachineModuleInfo.
Definition AsmPrinter.h:120
MCContext & OutContext
This is the context for the output file that we are streaming.
Definition AsmPrinter.h:106
bool doFinalization(Module &M) override
Shut down the asmprinter.
virtual void emitBasicBlockStart(const MachineBasicBlock &MBB)
Targets can override this to emit stuff at the start of a basic block.
void emitVisibility(MCSymbol *Sym, unsigned Visibility, bool IsDefinition=true) const
This emits visibility information about symbol, if this is supported by the target.
std::unique_ptr< MCStreamer > OutStreamer
This is the MCStreamer object for the file we are generating.
Definition AsmPrinter.h:111
bool isVerbose() const
Return true if assembly output should contain comments.
Definition AsmPrinter.h:303
void getNameWithPrefix(SmallVectorImpl< char > &Name, const GlobalValue *GV) const
virtual void emitFunctionEntryLabel()
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &OS)
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
A parsed version of the target data layout string and methods for querying it.
Definition DataLayout.h:63
DISubprogram * getSubprogram() const
Get the attached subprogram.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
bool hasGFX90AInsts() const
bool hasMAIInsts() const
bool hasSGPRInitBug() const
unsigned getMinNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize) const
bool isTgSplitEnabled() const
bool isCuModeEnabled() const
const AMDGPU::IsaInfo::AMDGPUTargetID & getTargetID() const
bool dumpCode() const
std::pair< unsigned, unsigned > computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
bool isTrapHandlerEnabled() const
bool isWave32() const
bool supportsWGP() const
unsigned getMaxNumUserSGPRs() const
Generation getGeneration() const
unsigned getAddressableNumSGPRs() const
unsigned getMaxWaveScratchSize() const
bool hasPrivateSegmentBuffer() const
VisibilityTypes getVisibility() const
LLVM_ABI bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition Globals.cpp:328
unsigned getAddressSpace() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:132
Type * getValueType() const
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
bool hasInitializer() const
Definitions have initializers, declarations don't.
MaybeAlign getAlign() const
Returns the alignment of the given variable.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
MCCodeEmitter * getEmitterPtr() const
static const MCBinaryExpr * createAdd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:343
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:348
static const MCBinaryExpr * createOr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:408
static const MCBinaryExpr * createLOr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:378
static const MCBinaryExpr * createMul(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:398
static const MCBinaryExpr * createGT(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:363
static const MCBinaryExpr * createDiv(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:353
static const MCBinaryExpr * createShl(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:413
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Context object for machine code objects.
Definition MCContext.h:83
const MCObjectFileInfo * getObjectFileInfo() const
Definition MCContext.h:416
LLVM_ABI void reportError(SMLoc L, const Twine &Msg)
LLVM_ABI MCSymbol * getOrCreateSymbol(const Twine &Name)
Lookup the symbol inside with the specified Name.
Base class for the full range of assembler expressions which are needed for parsing.
Definition MCExpr.h:34
LLVM_ABI bool evaluateAsRelocatable(MCValue &Res, const MCAssembler *Asm) const
Try to evaluate the expression to a relocatable value, i.e.
Definition MCExpr.cpp:450
MCSection * getReadOnlySection() const
MCSection * getTextSection() const
MCContext & getContext() const
This represents a section on linux, lots of unix variants and some bare metal systems.
Instances of this class represent a uniqued identifier for a section in the current translation unit.
Definition MCSection.h:517
void ensureMinAlignment(Align MinAlignment)
Makes sure that Alignment is at least MinAlignment.
Definition MCSection.h:604
bool hasInstructions() const
Definition MCSection.h:612
MCContext & getContext() const
Definition MCStreamer.h:314
Generic base class for all target subtargets.
const Triple & getTargetTriple() const
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
bool isDefined() const
isDefined - Check if this symbol is defined (i.e., it has an address).
Definition MCSymbol.h:233
StringRef getName() const
getName - Get the symbol name.
Definition MCSymbol.h:188
bool isVariable() const
isVariable - Check if this is a variable symbol.
Definition MCSymbol.h:267
void redefineIfPossible()
Prepare this symbol to be redefined.
Definition MCSymbol.h:212
const MCExpr * getVariableValue() const
Get the expression of the variable symbol.
Definition MCSymbol.h:270
MCStreamer & getStreamer()
Definition MCStreamer.h:101
static const MCUnaryExpr * createNot(const MCExpr *Expr, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:273
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
MCContext & getContext() const
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
Representation of each machine instruction.
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
Diagnostic information for optimization analysis remarks.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
Wrapper class representing virtual and physical registers.
Definition Register.h:20
This class keeps track of the SPI_PS_INPUT_ADDR config register, which tells the hardware which inter...
GCNUserSGPRUsageInfo & getUserSGPRInfo()
SIModeRegisterDefaults getMode() const
unsigned getScratchReservedForDynamicVGPRs() const
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:422
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM Value Representation.
Definition Value.h:75
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
A raw_ostream that writes to a SmallVector or SmallString.
StringRef str() const
Return a StringRef for the vector contents.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ LOCAL_ADDRESS
Address space for local memory.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI, std::optional< bool > EnableWavefrontSize32)
unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI)
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI)
unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed, bool FlatScrUsed, bool XNACKUsed)
unsigned getSGPREncodingGranule(const MCSubtargetInfo *STI)
unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI, unsigned DynamicVGPRBlockSize, std::optional< bool > EnableWavefrontSize32)
int32_t getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR)
void printAMDGPUMCExpr(const MCExpr *Expr, raw_ostream &OS, const MCAsmInfo *MAI)
LLVM_READNONE constexpr bool isModuleEntryFunctionCC(CallingConv::ID CC)
unsigned getLdsDwGranularity(const MCSubtargetInfo &ST)
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
const MCExpr * maskShiftSet(const MCExpr *Val, uint32_t Mask, uint32_t Shift, MCContext &Ctx)
Provided with the MCExpr * Val, uint32 Mask and Shift, will return the masked and left shifted,...
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isGFX90A(const MCSubtargetInfo &STI)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool hasMAIInsts(const MCSubtargetInfo &STI)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool isGFX11Plus(const MCSubtargetInfo &STI)
const MCExpr * foldAMDGPUMCExpr(const MCExpr *Expr, MCContext &Ctx)
bool isGFX10Plus(const MCSubtargetInfo &STI)
constexpr std::pair< unsigned, unsigned > getShiftMask(unsigned Value)
Deduce the least significant bit aligned shift and mask values for a binary Complement Value (as they...
bool isGFX1250(const MCSubtargetInfo &STI)
unsigned hasKernargPreload(const MCSubtargetInfo &STI)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ SHT_PROGBITS
Definition ELF.h:1147
@ STT_AMDGPU_HSA_KERNEL
Definition ELF.h:1430
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1655
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
Target & getTheR600Target()
The target for R600 GPUs.
@ DK_ResourceLimit
AsmPrinter * createR600AsmPrinterPass(TargetMachine &TM, std::unique_ptr< MCStreamer > &&Streamer)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
@ Success
The lock was released successfully.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
Target & getTheGCNTarget()
The target for GCN GPUs.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1867
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:180
Implement std::hash so that hash_code can be used in STL containers.
Definition BitVector.h:867
void validate(const MCSubtargetInfo *STI, MCContext &Ctx)
void initDefault(const MCSubtargetInfo *STI, MCContext &Ctx, bool InitMCExpr=true)
static const MCExpr * bits_get(const MCExpr *Src, uint32_t Shift, uint32_t Mask, MCContext &Ctx)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Track resource usage for kernels / entry functions.
const MCExpr * NumSGPR
const MCExpr * NumArchVGPR
uint64_t getFunctionCodeSize(const MachineFunction &MF, bool IsLowerBound=false)
const MCExpr * getComputePGMRSrc2(MCContext &Ctx) const
Compute the value of the ComputePGMRsrc2 register.
const MCExpr * VGPRBlocks
const MCExpr * ScratchBlocks
const MCExpr * ComputePGMRSrc3
const MCExpr * getComputePGMRSrc1(const GCNSubtarget &ST, MCContext &Ctx) const
Compute the value of the ComputePGMRsrc1 register.
const MCExpr * VCCUsed
const MCExpr * FlatUsed
const MCExpr * NamedBarCnt
const MCExpr * ScratchEnable
const MCExpr * AccumOffset
const MCExpr * NumAccVGPR
const MCExpr * DynamicCallStack
const MCExpr * SGPRBlocks
const MCExpr * NumVGPRsForWavesPerEU
const MCExpr * NumVGPR
const MCExpr * Occupancy
const MCExpr * ScratchSize
const MCExpr * NumSGPRsForWavesPerEU
static void RegisterAsmPrinter(Target &T, Target::AsmPrinterCtorTy Fn)
RegisterAsmPrinter - Register an AsmPrinter implementation for the given target.