LLVM 23.0.0git
AMDGPUAsmPrinter.cpp
Go to the documentation of this file.
1//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10///
11/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
12/// code. When passed an MCAsmStreamer it prints assembly and when passed
13/// an MCObjectStreamer it outputs binary code.
14//
15//===----------------------------------------------------------------------===//
16//
17
18#include "AMDGPUAsmPrinter.h"
19#include "AMDGPU.h"
23#include "AMDGPUTargetMachine.h"
24#include "GCNSubtarget.h"
29#include "R600AsmPrinter.h"
42#include "llvm/MC/MCAssembler.h"
43#include "llvm/MC/MCContext.h"
45#include "llvm/MC/MCStreamer.h"
46#include "llvm/MC/MCValue.h"
53
54using namespace llvm;
55using namespace llvm::AMDGPU;
56
57// This should get the default rounding mode from the kernel. We just set the
58// default here, but this could change if the OpenCL rounding mode pragmas are
59// used.
60//
61// The denormal mode here should match what is reported by the OpenCL runtime
62// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
64//
65// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
66// precision, and leaves single precision to flush all and does not report
67// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
68// CL_FP_DENORM for both.
69//
70// FIXME: It seems some instructions do not support single precision denormals
71// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
72// and sin_f32, cos_f32 on most parts).
73
74// We want to use these instructions, and using fp32 denormals also causes
75// instructions to run at the double precision rate for the device so it's
76// probably best to just report no single precision denormals.
83
// Factory registered with the target registry to construct the AMDGPU
// assembly printer. NOTE(review): the declaration line carrying the function
// name and the TargetMachine parameter 'tm' is elided in this view.
static AsmPrinter *
                           std::unique_ptr<MCStreamer> &&Streamer) {
  // Ownership of the streamer transfers to the newly created printer.
  return new AMDGPUAsmPrinter(tm, std::move(Streamer));
}
89
97
98namespace {
99class AMDGPUAsmPrinterHandler : public AsmPrinterHandler {
100protected:
101 AMDGPUAsmPrinter *Asm;
102
103public:
104 AMDGPUAsmPrinterHandler(AMDGPUAsmPrinter *A) : Asm(A) {}
105
106 void beginFunction(const MachineFunction *MF) override {}
107
108 void endFunction(const MachineFunction *MF) override { Asm->endFunction(MF); }
109
110 void endModule() override {}
111};
112} // End anonymous namespace
113
// AMDGPUAsmPrinter constructor. NOTE(review): the first signature line is
// elided in this view; the streamer is handed to the AsmPrinter base, which
// takes ownership and populates OutStreamer.
                                   std::unique_ptr<MCStreamer> Streamer)
    : AsmPrinter(TM, std::move(Streamer)) {
  assert(OutStreamer && "AsmPrinter constructed without streamer");
}
119
121 return "AMDGPU Assembly Printer";
122}
123
  // Module-level MCSubtargetInfo taken from the TargetMachine (declaration
  // line elided in this view).
  return TM.getMCSubtargetInfo();
}
127
  // Returns the AMDGPU-specific target streamer, or null when no output
  // streamer exists yet (declaration line elided in this view).
  if (!OutStreamer)
    return nullptr;
  // The target streamer attached to OutStreamer is the AMDGPU one by
  // construction, so the static_cast is safe here.
  return static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
}
133
137
// One-time, lazy initialization of the target streamer for module \p M.
// NOTE(review): several lines are elided in this view — the condition guarding
// the early return and the HSA metadata streamer setup that surrounds the
// begin() call below.
void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {

  // TODO: Which one is called first, emitStartOfAsmFile or
  // emitFunctionBodyStart?
  if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
    initializeTargetID(M);

    return;

        CodeObjectVersion);
    HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
  }

}
161
  // End-of-module emission (declaration line elided in this view): make sure
  // the target streamer was initialized, then finalize and emit the HSA
  // metadata note for AMDHSA targets.

  // Init target streamer if it has not yet happened
    initTargetStreamer(M);

  if (TM.getTargetTriple().getOS() != Triple::AMDHSA)

  // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA / NT_AMD_HSA_METADATA).
  if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    HSAMetadataStream->end();
    bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
    (void)Success;
    assert(Success && "Malformed HSA Metadata");
  }
}
179
  // Start-of-function-body hook (declaration line elided in this view):
  // validates target-ID compatibility and emits per-kernel metadata.
  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
  const Function &F = MF->getFunction();

  // TODO: We're checking this late, would be nice to check it earlier.
  // NOTE(review): the error-reporting call that this message feeds is elided
  // in this view.
  if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
        STM.getCPU() + " is only available on code object version 6 or better");
  }

  // TODO: Which one is called first, emitStartOfAsmFile or
  // emitFunctionBodyStart?
  if (!getTargetStreamer()->getTargetID())
    initializeTargetID(*F.getParent());

  const auto &FunctionTargetID = STM.getTargetID();
  // Make sure function's xnack settings are compatible with module's
  // xnack settings.
  if (FunctionTargetID.isXnackSupported() &&
      FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
      FunctionTargetID.getXnackSetting() !=
          getTargetStreamer()->getTargetID()->getXnackSetting()) {
    OutContext.reportError(
        {}, "xnack setting of '" + Twine(MF->getName()) +
                "' function does not match module xnack setting");
    return;
  }
  // Make sure function's sramecc settings are compatible with module's
  // sramecc settings.
  if (FunctionTargetID.isSramEccSupported() &&
      FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
      FunctionTargetID.getSramEccSetting() !=
          getTargetStreamer()->getTargetID()->getSramEccSetting()) {
    OutContext.reportError(
        {}, "sramecc setting of '" + Twine(MF->getName()) +
                "' function does not match module sramecc setting");
    return;
  }

  // Only kernels/entry points get the metadata emitted below.
  if (!MFI.isEntryFunction())
    return;

  // NOTE(review): the statement that emits the validated kernel code object
  // (after validate()) is elided in this view.
  if (STM.isMesaKernel(F) &&
      (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
       F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
    AMDGPUMCKernelCodeT KernelCode;
    getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
    KernelCode.validate(&STM, MF->getContext());
  }

  if (STM.isAmdHsaOS())
    HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
}
235
  // End-of-function-body hook (declaration line elided in this view): for HSA
  // entry functions, emits the kernel descriptor into the read-only section.
  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  if (!MFI.isEntryFunction())
    return;

  assert(TM.getTargetTriple().getOS() == Triple::AMDHSA);

  auto &Streamer = getTargetStreamer()->getStreamer();
  auto &Context = Streamer.getContext();
  auto &ObjectFileInfo = *Context.getObjectFileInfo();
  auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();

  // Save/restore the current section around the descriptor emission.
  Streamer.pushSection();
  Streamer.switchSection(&ReadOnlySection);

  // CP microcode requires the kernel descriptor to be allocated on 64 byte
  // alignment.
  Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
  ReadOnlySection.ensureMinAlignment(Align(64));

  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();

  // NOTE(review): the call that this argument list belongs to (presumably the
  // target streamer's kernel-descriptor emission) is elided in this view.
  SmallString<128> KernelName;
  getNameWithPrefix(KernelName, &MF->getFunction());
      STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
      CurrentProgramInfo.NumVGPRsForWavesPerEU,
      CurrentProgramInfo.NumSGPRsForWavesPerEU,
      CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
          getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Context),
          Context),
      CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);

  Streamer.popSection();
}
273
  // Prints an assembler comment describing an IMPLICIT_DEF instruction.
  // NOTE(review): the declaration line and the SmallString buffer declaration
  // for 'Str' are elided in this view.
  Register RegNo = MI->getOperand(0).getReg();

  raw_svector_ostream OS(Str);
  OS << "implicit-def: "
     << printReg(RegNo, MF->getSubtarget().getRegisterInfo());

  // SGPR spills lowered onto VGPR lanes are flagged so the comment explains
  // why the implicit def exists.
  if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
    OS << " : SGPR spill to VGPR lane";

  OutStreamer->AddComment(OS.str());
  OutStreamer->addBlankLine();
}
288
  // Function entry-label hook (declaration line elided in this view). HSA
  // targets bail out early; the elided statement inside the first branch
  // presumably performs HSA-specific label emission before returning.
  if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    return;
  }

  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
  // NOTE(review): the statements consuming SymbolName (lines elided in this
  // view) presumably emit an entry-point symbol for the kernel.
  if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
    SmallString<128> SymbolName;
    getNameWithPrefix(SymbolName, &MF->getFunction()),
  }
  if (DumpCodeInstEmitter) {
    // Disassemble function name label to text.
    DisasmLines.push_back(MF->getName().str() + ":");
    DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
    HexLines.emplace_back("");
  }

}
312
  // Basic-block-start hook (declaration line elided in this view): when
  // -dumpcode is active, record a textual label for non-fallthrough blocks.
  if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
    // Write a line for the basic block label if it is not only fallthrough.
    DisasmLines.push_back((Twine("BB") + Twine(getFunctionNumber()) + "_" +
                           Twine(MBB.getNumber()) + ":")
                              .str());
    DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
    HexLines.emplace_back("");
  }
}
324
  // Global-variable emission (declaration line and the opening of the LDS
  // address-space branch are elided in this view). LDS globals are emitted as
  // target-streamer symbols rather than ordinary data.
  if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
    OutContext.reportError({},
                           Twine(GV->getName()) +
                               ": unsupported initializer for address space");
    return;
  }

  // NOTE(review): the condition line completing this branch is elided; the
  // assert below is unreachable as printed because of the preceding return —
  // presumably the elided line makes the return conditional.
  const Triple::OSType OS = TM.getTargetTriple().getOS();
  if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
      return;
    // With object linking, LDS definitions should have been externalized
    // by earlier passes (e.g. LDS lowering, named barrier lowering).
    // Only declarations reach here, emitted as SHN_AMDGPU_LDS symbols
    // so the linker can assign their offsets.
    assert(GV->isDeclaration() &&
           "LDS definitions should have been externalized when object "
           "linking is enabled");
  }

  MCSymbol *GVSym = getSymbol(GV);

  GVSym->redefineIfPossible();
  if (GVSym->isDefined() || GVSym->isVariable())
    report_fatal_error("symbol '" + Twine(GVSym->getName()) +
                       "' is already defined");

  // NOTE(review): the line computing 'Size' from DL is elided in this view.
  const DataLayout &DL = GV->getDataLayout();
  Align Alignment = GV->getAlign().value_or(Align(4));

  emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
  emitLinkage(GV, GVSym);
  auto *TS = getTargetStreamer();
  TS->emitAMDGPULDS(GVSym, Size, Alignment);
  return;
  }

}
367
  // Module initialization (declaration line elided in this view): picks the
  // HSA metadata streamer matching the module's code object version.
  CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);

  if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    // NOTE(review): the case labels (presumably AMDHSA_COV4/COV5/COV6) are
    // elided in this view; only the handler bodies are visible.
    switch (CodeObjectVersion) {
      HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV4>();
      break;
      HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV5>();
      break;
      HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV6>();
      break;
    default:
      reportFatalUsageError("unsupported code object version");
    }

    // Route function begin/end callbacks back into this printer.
    addAsmPrinterHandler(std::make_unique<AMDGPUAsmPrinterHandler>(this));
  }

}
391
392/// Mimics GCNSubtarget::computeOccupancy for MCExpr.
393///
/// Remove dependency on GCNSubtarget and depend only on the necessary values
395/// for said occupancy computation. Should match computeOccupancy implementation
396/// without passing \p STM on.
const AMDGPUMCExpr *createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs,
                                    const MCExpr *NumVGPRs,
                                    unsigned DynamicVGPRBlockSize,
                                    const GCNSubtarget &STM, MCContext &Ctx) {
  // Snapshot the subtarget-derived constants so the resulting expression no
  // longer references STM.
  unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM);
  unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM, DynamicVGPRBlockSize);
  unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM);
  unsigned Generation = STM.getGeneration();

  // Wrap plain integers as MCConstantExprs in this context.
  auto CreateExpr = [&Ctx](unsigned Value) {
    return MCConstantExpr::create(Value, Ctx);
  };

  // NOTE(review): the call that builds and returns the occupancy expression
  // (line elided in this view) receives the operand list below.
      {CreateExpr(MaxWaves), CreateExpr(Granule),
       CreateExpr(TargetTotalNumVGPRs),
       CreateExpr(Generation), CreateExpr(InitOcc),
       NumSGPRs, NumVGPRs},
      Ctx);
}
417
// Validates resolved resource-usage symbols for \p F against subtarget
// limits, diagnosing scratch-size, SGPR, and occupancy violations.
// NOTE(review): several lines are elided in this view (the RIK alias/limit
// initializers, condition continuations, and diagnostic argument tails).
void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
  // Only defined module-entry functions carry resource symbols to check.
  if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv()))
    return;

  const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
  MCSymbol *FnSym = TM.getSymbol(&F);

  // Resolve a symbol's MCExpr to an absolute value if possible.
  auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
    int64_t Val;
    if (Value->evaluateAsAbsolute(Val)) {
      Res = Val;
      return true;
    }
    return false;
  };

  // Scratch (private segment) size per work-item must stay within the
  // target's limit; the initializer of the limit is elided in this view.
  const uint64_t MaxScratchPerWorkitem =
  MCSymbol *ScratchSizeSymbol =
      RI.getSymbol(FnSym->getName(), RIK::RIK_PrivateSegSize, OutContext);
  uint64_t ScratchSize;
  if (ScratchSizeSymbol->isVariable() &&
      TryGetMCExprValue(ScratchSizeSymbol->getVariableValue(), ScratchSize) &&
      ScratchSize > MaxScratchPerWorkitem) {
    DiagnosticInfoStackSize DiagStackSize(F, ScratchSize, MaxScratchPerWorkitem,
                                          DS_Error);
    F.getContext().diagnose(DiagStackSize);
  }

  // Validate addressable scalar registers (i.e., prior to added implicit
  // SGPRs).
  MCSymbol *NumSGPRSymbol =
      RI.getSymbol(FnSym->getName(), RIK::RIK_NumSGPR, OutContext);
      !STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    uint64_t NumSgpr;
    if (NumSGPRSymbol->isVariable() &&
        TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
        NumSgpr > MaxAddressableNumSGPRs) {
      F.getContext().diagnose(DiagnosticInfoResourceLimit(
          F, "addressable scalar registers", NumSgpr, MaxAddressableNumSGPRs,
      return;
    }
  }

  MCSymbol *VCCUsedSymbol =
      RI.getSymbol(FnSym->getName(), RIK::RIK_UsesVCC, OutContext);
  MCSymbol *FlatUsedSymbol =
      RI.getSymbol(FnSym->getName(), RIK::RIK_UsesFlatScratch, OutContext);
  uint64_t VCCUsed, FlatUsed, NumSgpr;

  if (NumSGPRSymbol->isVariable() && VCCUsedSymbol->isVariable() &&
      FlatUsedSymbol->isVariable() &&
      TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
      TryGetMCExprValue(VCCUsedSymbol->getVariableValue(), VCCUsed) &&
      TryGetMCExprValue(FlatUsedSymbol->getVariableValue(), FlatUsed)) {

    // Recomputes NumSgprs + implicit SGPRs but all symbols should now be
    // resolvable.
    NumSgpr += IsaInfo::getNumExtraSGPRs(
        &STM, VCCUsed, FlatUsed,
        getTargetStreamer()->getTargetID()->isXnackOnOrAny());
        STM.hasSGPRInitBug()) {
      unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
      if (NumSgpr > MaxAddressableNumSGPRs) {
        F.getContext().diagnose(DiagnosticInfoResourceLimit(
            F, "scalar registers", NumSgpr, MaxAddressableNumSGPRs, DS_Error,
        return;
      }
    }

    MCSymbol *NumVgprSymbol =
        RI.getSymbol(FnSym->getName(), RIK::RIK_NumVGPR, OutContext);
    MCSymbol *NumAgprSymbol =
        RI.getSymbol(FnSym->getName(), RIK::RIK_NumAGPR, OutContext);
    uint64_t NumVgpr, NumAgpr;

    MachineModuleInfo &MMI =
    MachineFunction *MF = MMI.getMachineFunction(F);
    if (MF && NumVgprSymbol->isVariable() && NumAgprSymbol->isVariable() &&
        TryGetMCExprValue(NumVgprSymbol->getVariableValue(), NumVgpr) &&
        TryGetMCExprValue(NumAgprSymbol->getVariableValue(), NumAgpr)) {
      const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
      unsigned MaxWaves = MFI.getMaxWavesPerEU();
      uint64_t TotalNumVgpr =
          getTotalNumVGPRs(STM.hasGFX90AInsts(), NumAgpr, NumVgpr);
      // Clamp register counts to the minimum implied by the wave limit.
      uint64_t NumVGPRsForWavesPerEU =
          std::max({TotalNumVgpr, (uint64_t)1,
                    (uint64_t)STM.getMinNumVGPRs(
                        MaxWaves, MFI.getDynamicVGPRBlockSize())});
      uint64_t NumSGPRsForWavesPerEU = std::max(
          {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
      const MCExpr *OccupancyExpr = createOccupancy(
          STM.getOccupancyWithWorkGroupSizes(*MF).second,
          MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
          MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext),
      uint64_t Occupancy;

      const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
          F, "amdgpu-waves-per-eu", {0, 0}, true);

      // Diagnose when the achieved occupancy falls below the user's request.
      if (TryGetMCExprValue(OccupancyExpr, Occupancy) && Occupancy < MinWEU) {
        DiagnosticInfoOptimizationFailure Diag(
            F, F.getSubprogram(),
            "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
            "'" +
                F.getName() + "': desired occupancy was " + Twine(MinWEU) +
                ", final occupancy is " + Twine(Occupancy));
        F.getContext().diagnose(Diag);
        return;
      }
    }
  }
}
539
  // Module finalization (declaration line elided in this view): pads the text
  // section, finalizes resource-info expressions, emits module-wide GPR
  // maximums, and validates per-function resource usage.

  // Pad with s_code_end to help tools and guard against instruction prefetch
  // causing stale data in caches. Arguably this should be done by the linker,
  // which is why this isn't done for Mesa.
  // Don't do it if there is no code.
  // NOTE(review): the condition continuation and the statement that performs
  // the actual padding inside the inner branch are elided in this view.
  const MCSubtargetInfo &STI = *getGlobalSTI();
  if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
    if (TextSect->hasInstructions()) {
      OutStreamer->switchSection(TextSect);
    }
  }

  // Assign expressions which can only be resolved when all other functions are
  // known.
  RI.finalize(OutContext);

  // Switch section and emit all GPR maximums within the processed module.
  // NOTE(review): the call these max-symbol arguments belong to is elided.
  OutStreamer->pushSection();
  MCSectionELF *MaxGPRSection =
      OutContext.getELFSection(".AMDGPU.gpr_maximums", ELF::SHT_PROGBITS, 0);
  OutStreamer->switchSection(MaxGPRSection);
      RI.getMaxVGPRSymbol(OutContext), RI.getMaxAGPRSymbol(OutContext),
      RI.getMaxSGPRSymbol(OutContext), RI.getMaxNamedBarrierSymbol(OutContext));
  OutStreamer->popSection();

  for (Function &F : M.functions())
    validateMCResourceInfo(F);

  // Clear per-module resource state for any subsequent module.
  RI.reset();

}
577
// Folds \p Value and pretty-prints it into a small string for use in
// assembler comments. NOTE(review): the declaration of the local buffer
// 'Str' is elided in this view.
SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
  raw_svector_ostream OSS(Str);
  auto &Streamer = getTargetStreamer()->getStreamer();
  auto &Context = Streamer.getContext();
  // Fold first so the printed expression is as simple as possible.
  const MCExpr *New = foldAMDGPUMCExpr(Value, Context);
  printAMDGPUMCExpr(New, OSS, MAI);
  return Str;
}
587
588// Print comments that apply to both callable functions and entry points.
589void AMDGPUAsmPrinter::emitCommonFunctionComments(
590 const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,
591 const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
592 const AMDGPUMachineFunctionInfo *MFI) {
593 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
594 OutStreamer->emitRawComment(" TotalNumSgprs: " + getMCExprStr(NumSGPR),
595 false);
596 OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false);
597 if (NumAGPR && TotalNumVGPR) {
598 OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false);
599 OutStreamer->emitRawComment(" TotalNumVgprs: " + getMCExprStr(TotalNumVGPR),
600 false);
601 }
602 OutStreamer->emitRawComment(" ScratchSize: " + getMCExprStr(ScratchSize),
603 false);
604 OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
605 false);
606}
607
608const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
609 const MachineFunction &MF) const {
610 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
611 MCContext &Ctx = MF.getContext();
612 uint16_t KernelCodeProperties = 0;
613 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
614
615 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
616 KernelCodeProperties |=
617 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
618 }
619 if (UserSGPRInfo.hasDispatchPtr()) {
620 KernelCodeProperties |=
621 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
622 }
623 if (UserSGPRInfo.hasQueuePtr()) {
624 KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
625 }
626 if (UserSGPRInfo.hasKernargSegmentPtr()) {
627 KernelCodeProperties |=
628 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
629 }
630 if (UserSGPRInfo.hasDispatchID()) {
631 KernelCodeProperties |=
632 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
633 }
634 if (UserSGPRInfo.hasFlatScratchInit()) {
635 KernelCodeProperties |=
636 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
637 }
638 if (UserSGPRInfo.hasPrivateSegmentSize()) {
639 KernelCodeProperties |=
640 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
641 }
642 if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
643 KernelCodeProperties |=
644 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
645 }
646
647 // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
648 // un-evaluatable at this point so it cannot be conditionally checked here.
649 // Instead, we'll directly shift the possibly unknown MCExpr into its place
650 // and bitwise-or it into KernelCodeProperties.
651 const MCExpr *KernelCodePropExpr =
652 MCConstantExpr::create(KernelCodeProperties, Ctx);
653 const MCExpr *OrValue = MCConstantExpr::create(
654 amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx);
655 OrValue = MCBinaryExpr::createShl(CurrentProgramInfo.DynamicCallStack,
656 OrValue, Ctx);
657 KernelCodePropExpr = MCBinaryExpr::createOr(KernelCodePropExpr, OrValue, Ctx);
658
659 return KernelCodePropExpr;
660}
661
// Builds the HSA kernel descriptor for \p MF from the program info \p PI.
// NOTE(review): the initializer of group_segment_fixed_size and the assert
// opening before the rsrc3 sanity condition are elided in this view.
MCKernelDescriptor
AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
                                            const SIProgramInfo &PI) const {
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  MCContext &Ctx = MF.getContext();

  MCKernelDescriptor KernelDescriptor;

  KernelDescriptor.group_segment_fixed_size =
  KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;

  Align MaxKernArgAlign;
  KernelDescriptor.kernarg_size = MCConstantExpr::create(
      STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);

  KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx);
  KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx);
  KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);

  // Sanity-check that rsrc3 is zero on targets that do not use it; the
  // values are only consumed by the elided assertion below.
  int64_t PGM_Rsrc3 = 1;
  bool EvaluatableRsrc3 =
      CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGM_Rsrc3);
  (void)PGM_Rsrc3;
  (void)EvaluatableRsrc3;
      STM.hasGFX90AInsts() || STM.hasGFX1250Insts() || !EvaluatableRsrc3 ||
      static_cast<uint64_t>(PGM_Rsrc3) == 0);
  KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3;

  // Preload count is only meaningful on targets supporting kernarg preload.
  KernelDescriptor.kernarg_preload = MCConstantExpr::create(
      AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
      Ctx);

  return KernelDescriptor;
}
700
  // Per-machine-function driver (declaration line elided in this view):
  // gathers resource info, emits OS-specific program metadata, and, when
  // verbose, prints the resource-usage comment block.

  // Init target streamer lazily on the first function so that previous passes
  // can set metadata.
    initTargetStreamer(*MF.getFunction().getParent());

  // NOTE(review): the initializer of ResourceUsage is elided in this view.
  ResourceUsage =
  CurrentProgramInfo.reset(MF);

  const AMDGPUMachineFunctionInfo *MFI =
      MF.getInfo<AMDGPUMachineFunctionInfo>();
  MCContext &Ctx = MF.getContext();

  // The starting address of all shader programs must be 256 bytes aligned.
  // Regular functions just need the basic required instruction alignment.
  MF.ensureAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));


  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  // NOTE(review): the declaration of 'Context' used below is elided here.
  // FIXME: This should be an explicit check for Mesa.
  if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
    MCSectionELF *ConfigSection =
        Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
    OutStreamer->switchSection(ConfigSection);
  }

  RI.gatherResourceInfo(MF, *ResourceUsage, OutContext);

  if (MFI->isModuleEntryFunction()) {
    getSIProgramInfo(CurrentProgramInfo, MF);
  }

  // Program-info emission is OS-specific: PAL metadata, SI config words, or
  // (for HSA) nothing here — HSA emission happens elsewhere.
  if (STM.isAmdPalOS()) {
    if (MFI->isEntryFunction())
      EmitPALMetadata(MF, CurrentProgramInfo);
    else if (MFI->isModuleEntryFunction())
      emitPALFunctionMetadata(MF);
  } else if (!STM.isAmdHsaOS()) {
    EmitProgramInfoSI(MF, CurrentProgramInfo);
  }

  DumpCodeInstEmitter = nullptr;
  if (STM.dumpCode()) {
    // For -dumpcode, get the assembler out of the streamer. This only works
    // with -filetype=obj.
    MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
    if (Assembler)
      DumpCodeInstEmitter = Assembler->getEmitterPtr();
  }

  DisasmLines.clear();
  HexLines.clear();


  emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
                           STM.hasMAIInsts());

  // NOTE(review): the call these resource-symbol arguments belong to is
  // elided in this view.
  {
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR, OutContext),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumSGPR, OutContext),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumNamedBarrier,
                     OutContext),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
                     OutContext),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesVCC, OutContext),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesFlatScratch,
                     OutContext),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasDynSizedStack,
                     OutContext),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasRecursion,
                     OutContext),
        RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasIndirectCall,
                     OutContext));
  }

  // Emit _dvgpr$ symbol when appropriate.
  emitDVgprSymbol(MF);

  if (isVerbose()) {
    MCSectionELF *CommentSection =
        Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
    OutStreamer->switchSection(CommentSection);

    if (!MFI->isEntryFunction()) {
      OutStreamer->emitRawComment(" Function info:", false);

      emitCommonFunctionComments(
          RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext)
              ->getVariableValue(),
          STM.hasMAIInsts() ? RI.getSymbol(CurrentFnSym->getName(),
                                           RIK::RIK_NumAGPR, OutContext)
                                  ->getVariableValue()
                            : nullptr,
          RI.createTotalNumVGPRs(MF, Ctx),
          RI.createTotalNumSGPRs(
              MF,
              MF.getSubtarget<GCNSubtarget>().getTargetID().isXnackOnOrAny(),
              Ctx),
          RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
              ->getVariableValue(),
          CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
      return false;
    }

    OutStreamer->emitRawComment(" Kernel info:", false);
    emitCommonFunctionComments(
        CurrentProgramInfo.NumArchVGPR,
        STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
        CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
        CurrentProgramInfo.ScratchSize,
        CurrentProgramInfo.getFunctionCodeSize(MF), MFI);

    OutStreamer->emitRawComment(
        " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
    OutStreamer->emitRawComment(
        " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
    OutStreamer->emitRawComment(
        " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
            " bytes/workgroup (compile time only)",
        false);

    OutStreamer->emitRawComment(
        " SGPRBlocks: " + getMCExprStr(CurrentProgramInfo.SGPRBlocks), false);

    OutStreamer->emitRawComment(
        " VGPRBlocks: " + getMCExprStr(CurrentProgramInfo.VGPRBlocks), false);

    OutStreamer->emitRawComment(
        " NumSGPRsForWavesPerEU: " +
            getMCExprStr(CurrentProgramInfo.NumSGPRsForWavesPerEU),
        false);
    OutStreamer->emitRawComment(
        " NumVGPRsForWavesPerEU: " +
            getMCExprStr(CurrentProgramInfo.NumVGPRsForWavesPerEU),
        false);

    if (STM.hasGFX90AInsts()) {
      // Printed AccumOffset is (encoded + 1) * 4, i.e. the register count.
      const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd(
          CurrentProgramInfo.AccumOffset, MCConstantExpr::create(1, Ctx), Ctx);
      AdjustedAccum = MCBinaryExpr::createMul(
          AdjustedAccum, MCConstantExpr::create(4, Ctx), Ctx);
      OutStreamer->emitRawComment(
          " AccumOffset: " + getMCExprStr(AdjustedAccum), false);
    }

    if (STM.hasGFX1250Insts())
      OutStreamer->emitRawComment(
          " NamedBarCnt: " + getMCExprStr(CurrentProgramInfo.NamedBarCnt),
          false);

    OutStreamer->emitRawComment(
        " Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false);

    OutStreamer->emitRawComment(
        " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);

    OutStreamer->emitRawComment(
        " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
            getMCExprStr(CurrentProgramInfo.ScratchEnable),
        false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
                                    Twine(CurrentProgramInfo.UserSGPR),
                                false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
                                    Twine(CurrentProgramInfo.TrapHandlerEnable),
                                false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
                                    Twine(CurrentProgramInfo.TGIdXEnable),
                                false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
                                    Twine(CurrentProgramInfo.TGIdYEnable),
                                false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
                                    Twine(CurrentProgramInfo.TGIdZEnable),
                                false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
                                    Twine(CurrentProgramInfo.TIdIGCompCount),
                                false);

    // NOTE(review): the opening of the assertion these conditions belong to
    // is elided in this view.
    [[maybe_unused]] int64_t PGMRSrc3;
        STM.hasGFX90AInsts() || STM.hasGFX1250Insts() ||
        (CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGMRSrc3) &&
         static_cast<uint64_t>(PGMRSrc3) == 0));
    if (STM.hasGFX90AInsts()) {
      OutStreamer->emitRawComment(
          " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
              getMCExprStr(MCKernelDescriptor::bits_get(
                  CurrentProgramInfo.ComputePGMRSrc3,
                  amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
                  amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
          false);
      OutStreamer->emitRawComment(
          " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
              getMCExprStr(MCKernelDescriptor::bits_get(
                  CurrentProgramInfo.ComputePGMRSrc3,
                  amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
                  amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
          false);
    }
  }

  if (DumpCodeInstEmitter) {

    OutStreamer->switchSection(
        Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));

    // Pad each disassembly line so the hex bytes column up.
    for (size_t i = 0; i < DisasmLines.size(); ++i) {
      std::string Comment = "\n";
      if (!HexLines[i].empty()) {
        Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
        Comment += " ; " + HexLines[i] + "\n";
      }

      OutStreamer->emitBytes(StringRef(DisasmLines[i]));
      OutStreamer->emitBytes(StringRef(Comment));
    }
  }

  return false;
}
932
// When appropriate, add a _dvgpr$ symbol, with the value of the function
// symbol, plus an offset encoding one less than the number of VGPR blocks used
// by the function in bits 5..3 of the symbol value. A "VGPR block" can be
// either 16 VGPRs (for a max of 128), or 32 VGPRs (for a max of 256). This is
// used by a front-end to have functions that are chained rather than called,
// and a dispatcher that dynamically resizes the VGPR count before dispatching
// to a function.
// NOTE(review): the declaration of 'MFI', the second half of the enabling
// condition, the error-reporting call, and the function-symbol operand of the
// add-expression are elided in this view.
void AMDGPUAsmPrinter::emitDVgprSymbol(MachineFunction &MF) {
  if (MFI.isDynamicVGPREnabled() &&
    MCContext &Ctx = MF.getContext();
    unsigned BlockSize = MFI.getDynamicVGPRBlockSize();
    MCValue NumVGPRs;
    if (!CurrentProgramInfo.NumVGPRsForWavesPerEU->evaluateAsRelocatable(
            NumVGPRs, nullptr) ||
        !NumVGPRs.isAbsolute()) {
      llvm_unreachable("unable to resolve NumVGPRs for _dvgpr$ symbol");
    }
    // Calculate number of VGPR blocks.
    // Treat 0 VGPRs as 1 VGPR to avoid underflowing.
    unsigned NumBlocks =
        divideCeil(std::max(unsigned(NumVGPRs.getConstant()), 1U), BlockSize);

    // Only 3 bits (values 1..8 blocks) are available in the encoding.
    if (NumBlocks > 8) {
          "too many DVGPR blocks for _dvgpr$ symbol for '" +
          Twine(CurrentFnSym->getName()) + "'");
      return;
    }
    // Bits 5..3 hold (blocks - 1).
    unsigned EncodedNumBlocks = (NumBlocks - 1) << 3;
    // Add to function symbol to create _dvgpr$ symbol.
    const MCExpr *DVgprFuncVal = MCBinaryExpr::createAdd(
        MCConstantExpr::create(EncodedNumBlocks, Ctx), Ctx);
    MCSymbol *DVgprFuncSym =
        Ctx.getOrCreateSymbol(Twine("_dvgpr$") + CurrentFnSym->getName());
    OutStreamer->emitAssignment(DVgprFuncSym, DVgprFuncVal);
    // Mirror the function's visibility and linkage on the alias.
    emitVisibility(DVgprFuncSym, MF.getFunction().getVisibility());
    emitLinkage(&MF.getFunction(), DVgprFuncSym);
  }
}
975
976// TODO: Fold this into emitFunctionBodyStart.
977void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
978 // In the beginning all features are either 'Any' or 'NotSupported',
979 // depending on global target features. This will cover empty modules.
981 getGlobalSTI()->getFeatureString());
982
983 // If module is empty, we are done.
984 if (M.empty())
985 return;
986
987 // If module is not empty, need to find first 'Off' or 'On' feature
988 // setting per feature from functions in module.
989 for (auto &F : M) {
990 auto &TSTargetID = getTargetStreamer()->getTargetID();
991 if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
992 (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
993 break;
994
995 const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
996 const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
997 if (TSTargetID->isXnackSupported())
998 if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
999 TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
1000 if (TSTargetID->isSramEccSupported())
1001 if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
1002 TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
1003 }
1004}
1005
1006// AccumOffset computed for the MCExpr equivalent of:
1007// alignTo(std::max(1, NumVGPR), 4) / 4 - 1;
1008static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) {
1009 const MCExpr *ConstFour = MCConstantExpr::create(4, Ctx);
1010 const MCExpr *ConstOne = MCConstantExpr::create(1, Ctx);
1011
1012 // Can't be lower than 1 for subsequent alignTo.
1013 const MCExpr *MaximumTaken =
1014 AMDGPUMCExpr::createMax({ConstOne, NumVGPR}, Ctx);
1015
1016 // Practically, it's computing divideCeil(MaximumTaken, 4).
1017 const MCExpr *DivCeil = MCBinaryExpr::createDiv(
1018 AMDGPUMCExpr::createAlignTo(MaximumTaken, ConstFour, Ctx), ConstFour,
1019 Ctx);
1020
1021 return MCBinaryExpr::createSub(DivCeil, ConstOne, Ctx);
1022}
1023
// Populates ProgInfo with the program resource information (register counts,
// scratch/LDS sizing, PGM_RSRC fields, occupancy) for MF, mostly as MCExprs
// referencing the per-function resource-usage symbols so values can be
// resolved late.
void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
                                        const MachineFunction &MF) {
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  MCContext &Ctx = MF.getContext();

  // Helper: wrap a constant into an MCExpr.
  auto CreateExpr = [&Ctx](int64_t Value) {
    return MCConstantExpr::create(Value, Ctx);
  };

  // Helper: fold an MCExpr to an absolute value if possible; returns false
  // (leaving Res untouched) when the expression is still symbolic.
  auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
    int64_t Val;
    if (Value->evaluateAsAbsolute(Val)) {
      Res = Val;
      return true;
    }
    return false;
  };

  // Helper: build a reference to the current function's resource-info symbol
  // for the given resource kind.
  auto GetSymRefExpr =
      [&](MCResourceInfo::ResourceInfoKind RIK) -> const MCExpr * {
    MCSymbol *Sym = RI.getSymbol(CurrentFnSym->getName(), RIK, OutContext);
    return MCSymbolRefExpr::create(Sym, Ctx);
  };

  ProgInfo.NumArchVGPR = GetSymRefExpr(RIK::RIK_NumVGPR);
  ProgInfo.NumAccVGPR = GetSymRefExpr(RIK::RIK_NumAGPR);
      ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);

  ProgInfo.AccumOffset = computeAccumOffset(ProgInfo.NumArchVGPR, Ctx);
  ProgInfo.TgSplit = STM.isTgSplitEnabled();
  ProgInfo.NumSGPR = GetSymRefExpr(RIK::RIK_NumSGPR);
  ProgInfo.ScratchSize = GetSymRefExpr(RIK::RIK_PrivateSegSize);
  ProgInfo.VCCUsed = GetSymRefExpr(RIK::RIK_UsesVCC);
  ProgInfo.FlatUsed = GetSymRefExpr(RIK::RIK_UsesFlatScratch);
  // A call stack is dynamic if the function has a dynamically sized stack or
  // any (possibly indirect) recursion.
  ProgInfo.DynamicCallStack =
      MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack),
                             GetSymRefExpr(RIK::RIK_HasRecursion), Ctx);

  // Named barriers are counted in groups of 4 (divideCeil by 4).
  const MCExpr *BarBlkConst = MCConstantExpr::create(4, Ctx);
  const MCExpr *AlignToBlk = AMDGPUMCExpr::createAlignTo(
      GetSymRefExpr(RIK::RIK_NumNamedBarrier), BarBlkConst, Ctx);
  ProgInfo.NamedBarCnt = MCBinaryExpr::createDiv(AlignToBlk, BarBlkConst, Ctx);

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // The calculations related to SGPR/VGPR blocks are
  // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
  // unified.
  const MCExpr *ExtraSGPRs = AMDGPUMCExpr::createExtraSGPRs(
      ProgInfo.VCCUsed, ProgInfo.FlatUsed,
      getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);

  // Check the addressable register limit before we add ExtraSGPRs.
      !STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    uint64_t NumSgpr;
    if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
        NumSgpr > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm.
      LLVMContext &Ctx = MF.getFunction().getContext();
      Ctx.diagnose(DiagnosticInfoResourceLimit(
          MF.getFunction(), "addressable scalar registers", NumSgpr,
          MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit));
      // Clamp so downstream consumers still see a representable value.
      ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1);
    }
  }

  // Account for extra SGPRs and VGPRs reserved for debugger use.
  ProgInfo.NumSGPR = MCBinaryExpr::createAdd(ProgInfo.NumSGPR, ExtraSGPRs, Ctx);

  const Function &F = MF.getFunction();

  // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
  // dispatch registers as function args.
  unsigned WaveDispatchNumSGPR = MFI->getNumWaveDispatchSGPRs(),
           WaveDispatchNumVGPR = MFI->getNumWaveDispatchVGPRs();

  if (WaveDispatchNumSGPR) {
        {ProgInfo.NumSGPR,
         MCBinaryExpr::createAdd(CreateExpr(WaveDispatchNumSGPR), ExtraSGPRs,
                                 Ctx)},
        Ctx);
  }

  if (WaveDispatchNumVGPR) {
        {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);

      ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
  }

  // Adjust number of registers used to meet default/requested minimum/maximum
  // number of waves per execution unit request.
  unsigned MaxWaves = MFI->getMaxWavesPerEU();
  ProgInfo.NumSGPRsForWavesPerEU =
      AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, CreateExpr(1ul),
                               CreateExpr(STM.getMinNumSGPRs(MaxWaves))},
                              Ctx);
  ProgInfo.NumVGPRsForWavesPerEU =
      AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul),
                               CreateExpr(STM.getMinNumVGPRs(
                                   MaxWaves, MFI->getDynamicVGPRBlockSize()))},
                              Ctx);

      STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    uint64_t NumSgpr;
    if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
        NumSgpr > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm to use
      // the registers which are usually reserved for vcc etc.
      LLVMContext &Ctx = MF.getFunction().getContext();
      Ctx.diagnose(DiagnosticInfoResourceLimit(
          MF.getFunction(), "scalar registers", NumSgpr, MaxAddressableNumSGPRs,
      ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs);
      ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs);
    }
  }

  if (STM.hasSGPRInitBug()) {
    ProgInfo.NumSGPR =
    ProgInfo.NumSGPRsForWavesPerEU =
  }

  // Diagnose over-subscription of user SGPRs.
  if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    Ctx.diagnose(DiagnosticInfoResourceLimit(
        MF.getFunction(), "user SGPRs", MFI->getNumUserSGPRs(),
  }

  // Diagnose LDS usage above the addressable local memory limit.
  if (MFI->getLDSSize() > STM.getAddressableLocalMemorySize()) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    Ctx.diagnose(DiagnosticInfoResourceLimit(
        MF.getFunction(), "local memory", MFI->getLDSSize(),
  }
  // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
  // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
  auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,
                                             unsigned Granule) {
    const MCExpr *OneConst = CreateExpr(1ul);
    const MCExpr *GranuleConst = CreateExpr(Granule);
    const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax({NumGPR, OneConst}, Ctx);
    const MCExpr *AlignToGPR =
        AMDGPUMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
    const MCExpr *DivGPR =
        MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);
    const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
    return SubGPR;
  };
  // GFX10+ will always allocate 128 SGPRs and this field must be 0
    ProgInfo.SGPRBlocks = CreateExpr(0ul);
  } else {
    ProgInfo.SGPRBlocks = GetNumGPRBlocks(
  }
  ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,

  const SIModeRegisterDefaults Mode = MFI->getMode();

  // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
  // register.
  ProgInfo.FloatMode = getFPMode(Mode);

  ProgInfo.IEEEMode = Mode.IEEE;

  // Make clamp modifier on NaN input returns 0.
  ProgInfo.DX10Clamp = Mode.DX10Clamp;

  // LDS is programmed in blocks; pick the alignment shift matching the
  // subtarget's LDS dword granularity.
  unsigned LDSAlignShift = 8;
  switch (getLdsDwGranularity(STM)) {
  case 512:
  case 320:
    LDSAlignShift = 11;
    break;
  case 128:
    LDSAlignShift = 9;
    break;
  case 64:
    LDSAlignShift = 8;
    break;
  default:
    // NOTE(review): message has a typo — "invald" should read "invalid".
    llvm_unreachable("invald LDS block size");
  }

  ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
  ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();

  ProgInfo.LDSSize = MFI->getLDSSize();
  ProgInfo.LDSBlocks =
      alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;

  // The MCExpr equivalent of divideCeil.
  auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {
    const MCExpr *Ceil =
        AMDGPUMCExpr::createAlignTo(Numerator, Denominator, Ctx);
    return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx);
  };

  // Scratch is allocated in 64-dword or 256-dword blocks.
  unsigned ScratchAlignShift =
      STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
  // We need to program the hardware with the amount of scratch memory that
  // is used by the entire wave. ProgInfo.ScratchSize is the amount of
  // scratch memory used per thread.
  ProgInfo.ScratchBlocks = DivideCeil(
                              CreateExpr(STM.getWavefrontSize()), Ctx),
      CreateExpr(1ULL << ScratchAlignShift));

  if (STM.supportsWGP()) {
    ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
  }

  if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
    ProgInfo.MemOrdered = 1;
    ProgInfo.FwdProgress = !F.hasFnAttribute("amdgpu-no-fwd-progress");
  }

  // 0 = X, 1 = XY, 2 = XYZ
  unsigned TIDIGCompCnt = 0;
  if (MFI->hasWorkItemIDZ())
    TIDIGCompCnt = 2;
  else if (MFI->hasWorkItemIDY())
    TIDIGCompCnt = 1;

  // The private segment wave byte offset is the last of the system SGPRs. We
  // initially assumed it was allocated, and may have used it. It shouldn't harm
  // anything to disable it if we know the stack isn't used here. We may still
  // have emitted code reading it to initialize scratch, but if that's unused
  // reading garbage should be OK.
          MCConstantExpr::create(0, Ctx), Ctx),
      ProgInfo.DynamicCallStack, Ctx);

  ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
  // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
  ProgInfo.TrapHandlerEnable = STM.isAmdHsaOS() ? 0 : STM.hasTrapHandler();
  ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
  ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
  ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
  ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
  ProgInfo.TIdIGCompCount = TIDIGCompCnt;
  ProgInfo.EXCPEnMSB = 0;
  // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
  ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
  ProgInfo.EXCPEnable = 0;

  // Helper: return ((Dst & ~Mask) | (Value << Shift)) — i.e. replace the
  // masked field of Dst with Value.
  auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
                        uint32_t Shift) {
    const auto *Shft = MCConstantExpr::create(Shift, Ctx);
    const auto *Msk = MCConstantExpr::create(Mask, Ctx);
    Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
                                 Ctx);
    return Dst;
  };

  // GFX90A: pack ACCUM_OFFSET and TG_SPLIT into COMPUTE_PGM_RSRC3.
  if (STM.hasGFX90AInsts()) {
    ProgInfo.ComputePGMRSrc3 =
        SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset,
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
    ProgInfo.ComputePGMRSrc3 =
        SetBits(ProgInfo.ComputePGMRSrc3, CreateExpr(ProgInfo.TgSplit),
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
  }

  // GFX1250: pack the named-barrier count into COMPUTE_PGM_RSRC3.
  if (STM.hasGFX1250Insts())
    ProgInfo.ComputePGMRSrc3 =
        SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt,
                amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT,
                amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT);

  ProgInfo.Occupancy = createOccupancy(
      STM.computeOccupancy(F, ProgInfo.LDSSize).second,
      MFI->getDynamicVGPRBlockSize(), STM, Ctx);

  // Diagnose when the achieved occupancy falls short of an explicit
  // "amdgpu-waves-per-eu" request; only possible when the occupancy
  // expression folds to a constant.
  const auto [MinWEU, MaxWEU] =
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
  uint64_t Occupancy;
  if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {
    DiagnosticInfoOptimizationFailure Diag(
        F, F.getSubprogram(),
        "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
        "'" +
            F.getName() + "': desired occupancy was " + Twine(MinWEU) +
            ", final occupancy is " + Twine(Occupancy));
    F.getContext().diagnose(Diag);
  }

  // GFX11+: store a lower-bound code-size estimate, in 128-byte units and
  // saturated to the field width, into the generation-specific INST_PREF_SIZE
  // field of COMPUTE_PGM_RSRC3.
  if (isGFX11Plus(STM)) {
    uint32_t CodeSizeInBytes = (uint32_t)std::min(
        ProgInfo.getFunctionCodeSize(MF, true /* IsLowerBound */),
        (uint64_t)std::numeric_limits<uint32_t>::max());
    uint32_t CodeSizeInLines = divideCeil(CodeSizeInBytes, 128);
    uint32_t Field, Shift, Width;
    if (isGFX11(STM)) {
      Field = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
      Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
      Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
    } else {
      Field = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
      Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
      Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
    }
    uint64_t InstPrefSize = std::min(CodeSizeInLines, (1u << Width) - 1);
    ProgInfo.ComputePGMRSrc3 = SetBits(ProgInfo.ComputePGMRSrc3,
                                       CreateExpr(InstPrefSize), Field, Shift);
  }
}
1351
1364
// Emits the SI program-info register writes for non-HSA targets: RSRC
// register contents, scratch (TMPRING) sizing, PS input masks, and spill
// counters, as register/value pairs in the output stream.
void AMDGPUAsmPrinter::EmitProgramInfoSI(
    const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
  MCContext &Ctx = MF.getContext();

  // (((Value) & Mask) << Shift)
  auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {
    const MCExpr *msk = MCConstantExpr::create(Mask, Ctx);
    const MCExpr *shft = MCConstantExpr::create(Shift, Ctx);
                                   shft, Ctx);
  };

  // Emit a folded constant when the expression is absolute, otherwise emit
  // the expression itself for late resolution.
  auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) {
    int64_t Val;
    if (Value->evaluateAsAbsolute(Val))
      OutStreamer->emitIntValue(static_cast<uint64_t>(Val), Size);
    else
      OutStreamer->emitValue(Value, Size);
  };

  if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {

    EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx),
                       /*Size=*/4);

    EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4);


    // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
    // appropriate generation.
    if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x3FFFF, /*Shift=*/12),
                         /*Size=*/4);
    } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x7FFF, /*Shift=*/12),
                         /*Size=*/4);
    } else {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x1FFF, /*Shift=*/12),
                         /*Size=*/4);
    }

    // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
    // 0" comment but I don't see a corresponding field in the register spec.
  } else {
    OutStreamer->emitInt32(RsrcReg);

    // Graphics shaders pack the VGPR (bits 5:0) and SGPR (bits 9:6) block
    // counts into a single RSRC word.
    const MCExpr *GPRBlocks = MCBinaryExpr::createOr(
        SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0),
        SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6),
        MF.getContext());
    EmitResolvedOrExpr(GPRBlocks, /*Size=*/4);

    // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
    // appropriate generation.
    if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x3FFFF, /*Shift=*/12),
                         /*Size=*/4);
    } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x7FFF, /*Shift=*/12),
                         /*Size=*/4);
    } else {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x1FFF, /*Shift=*/12),
                         /*Size=*/4);
    }
  }

  if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    // GFX11+ encodes this field at twice the block granularity, hence the
    // round-up halving.
    unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
                                ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
                                : CurrentProgramInfo.LDSBlocks;
    OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
    OutStreamer->emitInt32(MFI->getPSInputEnable());
    OutStreamer->emitInt32(MFI->getPSInputAddr());
  }

  // Spill statistics, consumed by tooling via the R_SPILLED_* pseudo regs.
  OutStreamer->emitInt32(R_SPILLED_SGPRS);
  OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
  OutStreamer->emitInt32(R_SPILLED_VGPRS);
  OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
}
1461
// Helper function to add common PAL Metadata 3.0+
// Records the hardware-stage fields shared by all shader types (ieee/wgp/
// mem-ordered/forward-progress), the compute-only trap/exception fields, and
// the LDS size in bytes.
    const SIProgramInfo &CurrentProgramInfo,
    CallingConv::ID CC, const GCNSubtarget &ST,
    unsigned DynamicVGPRBlockSize) {
  // .ieee_mode is only meaningful on subtargets with the combined
  // DX10-clamp/IEEE-mode feature.
  if (ST.hasFeature(AMDGPU::FeatureDX10ClampAndIEEEMode))
    MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);

  MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
  MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
  MD->setHwStage(CC, ".forward_progress", (bool)CurrentProgramInfo.FwdProgress);

  if (AMDGPU::isCompute(CC)) {
    MD->setHwStage(CC, ".trap_present",
                   (bool)CurrentProgramInfo.TrapHandlerEnable);
    MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);

    // A non-zero block size means dynamic VGPR allocation is in use.
    if (DynamicVGPRBlockSize != 0)
      MD->setComputeRegisters(".dynamic_vgpr_en", true);
  }

  // LdsSize is stored in LDS-granularity dwords; convert to bytes here.
      CC, ".lds_size",
      (unsigned)(CurrentProgramInfo.LdsSize * getLdsDwGranularity(ST) *
                 sizeof(uint32_t)));
}
1488
// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
// is AMDPAL. It stores each compute/SPI register setting and other PAL
// metadata items into the PALMD::Metadata, combining with any provided by the
// frontend as LLVM metadata. Once all functions are written, the PAL metadata
// is then written as a single block in the .note section.
void AMDGPUAsmPrinter::EmitPALMetadata(
    const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  auto CC = MF.getFunction().getCallingConv();
  auto *MD = getTargetStreamer()->getPALMetadata();
  auto &Ctx = MF.getContext();

  MD->setEntryPoint(CC, MF.getFunction().getName());
  MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx);

  // For targets that support dynamic VGPRs, set the number of saved dynamic
  // VGPRs (if any) in the PAL metadata.
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  if (MFI->isDynamicVGPREnabled() &&
    MD->setHwStage(CC, ".dynamic_vgpr_saved_count",

  // Only set AGPRs for supported devices
  if (STM.hasMAIInsts()) {
    MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
  }

  MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx);
  // Pre-3.0 PAL metadata records raw RSRC register words; 3.0+ uses named
  // hardware-stage fields instead.
  if (MD->getPALMajorVersion() < 3) {
    MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM, Ctx), Ctx);
    if (AMDGPU::isCompute(CC)) {
      MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
    } else {
      // For graphics, only the SCRATCH_EN bit is derived here: it is set
      // whenever any scratch blocks are in use.
      const MCExpr *HasScratchBlocks =
          MCBinaryExpr::createGT(CurrentProgramInfo.ScratchBlocks,
                                 MCConstantExpr::create(0, Ctx), Ctx);
      auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN);
      MD->setRsrc2(CC, maskShiftSet(HasScratchBlocks, Mask, Shift, Ctx), Ctx);
    }
  } else {
    MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
    MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,
                   CurrentProgramInfo.ScratchEnable);
    EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM,
  }

  // ScratchSize is in bytes, 16 aligned.
  MD->setScratchSize(
      CC,
      AMDGPUMCExpr::createAlignTo(CurrentProgramInfo.ScratchSize,
                                  MCConstantExpr::create(16, Ctx), Ctx),
      Ctx);

  if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    // GFX11+ counts extra LDS size at twice the block granularity (see the
    // 256-vs-128 dword granularity below), hence the round-up halving.
    unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
                                ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
                                : CurrentProgramInfo.LDSBlocks;
    if (MD->getPALMajorVersion() < 3) {
      MD->setRsrc2(
          CC,
          Ctx);
      MD->setSpiPsInputEna(MFI->getPSInputEnable());
      MD->setSpiPsInputAddr(MFI->getPSInputAddr());
    } else {
      // Graphics registers
      const unsigned ExtraLdsDwGranularity =
          STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
      MD->setGraphicsRegisters(
          ".ps_extra_lds_size",
          (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));

      // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
      static StringLiteral const PsInputFields[] = {
          ".persp_sample_ena", ".persp_center_ena",
          ".persp_centroid_ena", ".persp_pull_model_ena",
          ".linear_sample_ena", ".linear_center_ena",
          ".linear_centroid_ena", ".line_stipple_tex_ena",
          ".pos_x_float_ena", ".pos_y_float_ena",
          ".pos_z_float_ena", ".pos_w_float_ena",
          ".front_face_ena", ".ancillary_ena",
          ".sample_coverage_ena", ".pos_fixed_pt_ena"};
      unsigned PSInputEna = MFI->getPSInputEnable();
      unsigned PSInputAddr = MFI->getPSInputAddr();
      // Mirror each bit of the packed enable/addr masks into its named
      // boolean metadata field.
      for (auto [Idx, Field] : enumerate(PsInputFields)) {
        MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
                                 (bool)((PSInputEna >> Idx) & 1));
        MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
                                 (bool)((PSInputAddr >> Idx) & 1));
      }
    }
  }

  // For version 3 and above the wave front size is already set in the metadata
  if (MD->getPALMajorVersion() < 3 && STM.isWave32())
    MD->setWave32(MF.getFunction().getCallingConv());
}
1588
// Records per-function (non-entry-point) PAL metadata: stack/scratch size,
// LDS size, and register usage. Register settings are reported under the
// AMDGPU_CS calling convention.
void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
  auto *MD = getTargetStreamer()->getPALMetadata();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  StringRef FnName = MF.getFunction().getName();
  MD->setFunctionScratchSize(FnName, MFI.getStackSize());
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MCContext &Ctx = MF.getContext();

  // Pre-3.0 metadata stores raw RSRC words; 3.0+ uses named fields via
  // EmitPALMetadataCommon.
  if (MD->getPALMajorVersion() < 3) {
    // Set compute registers
    MD->setRsrc1(
        CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST, Ctx), Ctx);
    MD->setRsrc2(CallingConv::AMDGPU_CS,
                 CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
  } else {
        MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST,
        MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize());
  }

  // Set optional info
  MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
  MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
  MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
}
1615
// This is supposed to be log2(Size)
// Maps a private element size in bytes (4, 8 or 16) to the corresponding
// amd_element_byte_size_t enumerator; any other size is a compiler bug.
  switch (Size) {
  case 4:
    return AMD_ELEMENT_4_BYTES;
  case 8:
    return AMD_ELEMENT_8_BYTES;
  case 16:
    return AMD_ELEMENT_16_BYTES;
  default:
    llvm_unreachable("invalid private_element_size");
  }
}
1629
// Fills in an amd_kernel_code_t (MC layer form) for a kernel entry point from
// the already-computed program info and the function's user-SGPR usage.
void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
                                        const SIProgramInfo &CurrentProgramInfo,
                                        const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  // Only kernel entry points carry an amd_kernel_code_t header.
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  MCContext &Ctx = MF.getContext();

  // Start from subtarget defaults, then overwrite the fields computed below.
  Out.initDefault(&STM, Ctx, /*InitMCExpr=*/false);

      CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx);
      CurrentProgramInfo.getComputePGMRSrc2(Ctx);

  Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack;

      getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));

  // Record each user-SGPR input the kernel actually receives.
  const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
  if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
  }

  if (UserSGPRInfo.hasDispatchPtr())

  if (UserSGPRInfo.hasQueuePtr())

  if (UserSGPRInfo.hasKernargSegmentPtr())

  if (UserSGPRInfo.hasDispatchID())

  if (UserSGPRInfo.hasFlatScratchInit())

  if (UserSGPRInfo.hasPrivateSegmentSize())

  if (STM.isXNACKEnabled())

  Align MaxKernArgAlign;
  Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
  Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
  Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
  Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
  Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;

  // kernarg_segment_alignment is specified as log of the alignment.
  // The minimum alignment is 16.
  // FIXME: The metadata treats the minimum as 4?
  Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
}
1692
// Inline-asm operand printing. Supports registers and immediates (plus
// whatever the generic AsmPrinter handles); immediates print either in
// decimal or in hex sized to the value's width. Follows the AsmPrinter
// convention of returning false on success and true for unsupported
// operands/modifiers.
                                       const char *ExtraCode, raw_ostream &O) {
  // First try the generic code, which knows about modifiers like 'c' and 'n'.
  if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
    return false;

  if (ExtraCode && ExtraCode[0]) {
    // Only single-character modifiers are supported.
    if (ExtraCode[1] != 0)
      return true; // Unknown modifier.

    switch (ExtraCode[0]) {
    case 'r':
      break;
    default:
      return true;
    }
  }

  // TODO: Should be able to support other operand types like globals.
  const MachineOperand &MO = MI->getOperand(OpNo);
  if (MO.isReg()) {
                           *MF->getSubtarget().getRegisterInfo());
    return false;
  }
  if (MO.isImm()) {
    int64_t Val = MO.getImm();
      O << Val;
    } else if (isUInt<16>(Val)) {
      O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
    } else if (isUInt<32>(Val)) {
      O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
    } else {
      O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
    }
    return false;
  }
  return true;
}
1733
1741
// Emits the "kernel-resource-usage" analysis remarks (SGPR/VGPR/AGPR counts,
// scratch size, occupancy, spills, LDS) for a function, one remark per line.
void AMDGPUAsmPrinter::emitResourceUsageRemarks(
    const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
    bool isModuleEntryFunction, bool hasMAIInsts) {
  // Remarks require an optimization-remark emitter.
  if (!ORE)
    return;

  const char *Name = "kernel-resource-usage";
  const char *Indent = " ";

  // If the remark is not specifically enabled, do not output to yaml
  if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(Name))
    return;

  // Currently non-kernel functions have no resources to emit.
    return;

  auto EmitResourceUsageRemark = [&](StringRef RemarkName,
                                     StringRef RemarkLabel, auto Argument) {
    // Add an indent for every line besides the line with the kernel name. This
    // makes it easier to tell which resource usage go with which kernel since
    // the kernel name will always be displayed first.
    std::string LabelStr = RemarkLabel.str() + ": ";
    if (RemarkName != "FunctionName")
      LabelStr = Indent + LabelStr;

    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(Name, RemarkName,
                                               &MF.front())
             << LabelStr << ore::NV(RemarkName, Argument);
    });
  };

  // FIXME: Formatting here is pretty nasty because clang does not accept
  // newlines from diagnostics. This forces us to emit multiple diagnostic
  // remarks to simulate newlines. If and when clang does accept newlines, this
  // formatting should be aggregated into one remark with newlines to avoid
  // printing multiple diagnostic location and diag opts.
  EmitResourceUsageRemark("FunctionName", "Function Name",
                          MF.getFunction().getName());
  EmitResourceUsageRemark("NumSGPR", "TotalSGPRs",
                          getMCExprStr(CurrentProgramInfo.NumSGPR));
  EmitResourceUsageRemark("NumVGPR", "VGPRs",
                          getMCExprStr(CurrentProgramInfo.NumArchVGPR));
  // AGPRs only exist on subtargets with MAI instructions.
  if (hasMAIInsts) {
    EmitResourceUsageRemark("NumAGPR", "AGPRs",
                            getMCExprStr(CurrentProgramInfo.NumAccVGPR));
  }
  EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
                          getMCExprStr(CurrentProgramInfo.ScratchSize));
  // Report "True" only when the dynamic-stack expression folds to a non-zero
  // constant; symbolic values report "False".
  int64_t DynStack;
  bool DynStackEvaluatable =
      CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(DynStack);
  StringRef DynamicStackStr =
      DynStackEvaluatable && DynStack ? "True" : "False";
  EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
  EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
                          getMCExprStr(CurrentProgramInfo.Occupancy));
  EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
                          CurrentProgramInfo.SGPRSpill);
  EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
                          CurrentProgramInfo.VGPRSpill);
  if (isModuleEntryFunction)
    EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
                            CurrentProgramInfo.LDSSize);
}
1810
// Legacy pass-manager identification and registration for the asm printer.
char AMDGPUAsmPrinter::ID = 0;

INITIALIZE_PASS(AMDGPUAsmPrinter, "amdgpu-asm-printer",
                "AMDGPU Assembly Printer", false, false)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD, const SIProgramInfo &CurrentProgramInfo, CallingConv::ID CC, const GCNSubtarget &ST, unsigned DynamicVGPRBlockSize)
const AMDGPUMCExpr * createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs, const MCExpr *NumVGPRs, unsigned DynamicVGPRBlockSize, const GCNSubtarget &STM, MCContext &Ctx)
Mimics GCNSubtarget::computeOccupancy for MCExpr.
static unsigned getRsrcReg(CallingConv::ID CallConv)
LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter()
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size)
static uint32_t getFPMode(SIModeRegisterDefaults Mode)
static const MCExpr * computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx)
static AsmPrinter * createAMDGPUAsmPrinterPass(TargetMachine &tm, std::unique_ptr< MCStreamer > &&Streamer)
AMDGPU Assembly printer class.
AMDGPU HSA Metadata Streamer.
AMDHSA kernel descriptor MCExpr struct for use in MC layer.
MC infrastructure to propagate the function level resource usage info.
Analyzes how many registers and other resources are used by functions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
AMDHSA kernel descriptor definitions.
MC layer struct for AMDGPUMCKernelCodeT, provides MCExpr functionality where required.
amd_element_byte_size_t
The values used to define the number of bytes to use for the swizzle element size.
@ AMD_ELEMENT_8_BYTES
@ AMD_ELEMENT_16_BYTES
@ AMD_ELEMENT_4_BYTES
#define AMD_HSA_BITS_SET(dst, mask, val)
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID
@ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR
@ AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED
@ AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT
@ AMD_CODE_PROPERTY_IS_PTR64
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
#define LLVM_ABI
Definition Compiler.h:213
#define LLVM_EXTERNAL_VISIBILITY
Definition Compiler.h:132
AMD GCN specific subclass of TargetSubtarget.
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition MD5.cpp:54
===- MachineOptimizationRemarkEmitter.h - Opt Diagnostics -*- C++ -*-—===//
OptimizedStructLayoutField Field
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition PassSupport.h:56
R600 Assembly printer class.
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS
Definition SIDefines.h:1144
#define R_0286E8_SPI_TMPRING_SIZE
Definition SIDefines.h:1282
#define FP_ROUND_MODE_DP(x)
Definition SIDefines.h:1264
#define C_00B84C_SCRATCH_EN
Definition SIDefines.h:1180
#define FP_ROUND_ROUND_TO_NEAREST
Definition SIDefines.h:1256
#define R_0286D0_SPI_PS_INPUT_ADDR
Definition SIDefines.h:1215
#define R_00B860_COMPUTE_TMPRING_SIZE
Definition SIDefines.h:1277
#define R_00B428_SPI_SHADER_PGM_RSRC1_HS
Definition SIDefines.h:1167
#define R_00B328_SPI_SHADER_PGM_RSRC1_ES
Definition SIDefines.h:1166
#define R_00B528_SPI_SHADER_PGM_RSRC1_LS
Definition SIDefines.h:1175
#define R_0286CC_SPI_PS_INPUT_ENA
Definition SIDefines.h:1214
#define R_00B128_SPI_SHADER_PGM_RSRC1_VS
Definition SIDefines.h:1153
#define FP_DENORM_MODE_DP(x)
Definition SIDefines.h:1275
#define R_00B848_COMPUTE_PGM_RSRC1
Definition SIDefines.h:1217
#define R_SPILLED_SGPRS
Definition SIDefines.h:1296
#define FP_ROUND_MODE_SP(x)
Definition SIDefines.h:1263
#define FP_DENORM_MODE_SP(x)
Definition SIDefines.h:1274
#define R_00B228_SPI_SHADER_PGM_RSRC1_GS
Definition SIDefines.h:1158
#define R_SPILLED_VGPRS
Definition SIDefines.h:1297
#define S_00B02C_EXTRA_LDS_SIZE(x)
Definition SIDefines.h:1152
#define R_00B84C_COMPUTE_PGM_RSRC2
Definition SIDefines.h:1177
#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS
Definition SIDefines.h:1151
static const int BlockSize
Definition TarWriter.cpp:33
void emitFunctionEntryLabel() override
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
const MCSubtargetInfo * getGlobalSTI() const
void emitImplicitDef(const MachineInstr *MI) const override
Targets can override this to customize the output of IMPLICIT_DEF instructions in verbose mode.
std::vector< std::string > DisasmLines
void emitStartOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the start of their fi...
void endFunction(const MachineFunction *MF)
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
std::vector< std::string > HexLines
void emitGlobalVariable(const GlobalVariable *GV) override
Emit the specified global variable to the .s file.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) override
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
bool doFinalization(Module &M) override
doFinalization - Virtual method overriden by subclasses to do any necessary clean up after all passes...
void emitEndOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the end of their file...
AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr< MCStreamer > Streamer)
bool doInitialization(Module &M) override
doInitialization - Virtual method overridden by subclasses to do any necessary initialization before ...
void emitFunctionBodyStart() override
Targets can override this to emit stuff before the first basic block in the function.
void emitBasicBlockStart(const MachineBasicBlock &MBB) override
Targets can override this to emit stuff at the start of a basic block.
AMDGPUTargetStreamer * getTargetStreamer() const
static void printRegOperand(MCRegister Reg, raw_ostream &O, const MCRegisterInfo &MRI)
AMDGPU target specific MCExpr operations.
static const AMDGPUMCExpr * createMax(ArrayRef< const MCExpr * > Args, MCContext &Ctx)
static const AMDGPUMCExpr * createTotalNumVGPR(const MCExpr *NumAGPR, const MCExpr *NumVGPR, MCContext &Ctx)
static const AMDGPUMCExpr * create(VariantKind Kind, ArrayRef< const MCExpr * > Args, MCContext &Ctx)
static const AMDGPUMCExpr * createExtraSGPRs(const MCExpr *VCCUsed, const MCExpr *FlatScrUsed, bool XNACKUsed, MCContext &Ctx)
Allow delayed MCExpr resolve of ExtraSGPRs (in case VCCUsed or FlatScrUsed are unresolvable but neede...
static const AMDGPUMCExpr * createAlignTo(const MCExpr *Value, const MCExpr *Align, MCContext &Ctx)
void setHwStage(unsigned CC, StringRef field, unsigned Val)
void updateHwStageMaximum(unsigned CC, StringRef field, unsigned Val)
void setComputeRegisters(StringRef field, unsigned Val)
std::pair< unsigned, unsigned > getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
unsigned getAddressableLocalMemorySize() const
Return the maximum number of bytes of LDS that can be allocated to a single workgroup.
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
unsigned getWavefrontSize() const
virtual void EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, const AMDGPU::MCKernelDescriptor &KernelDescriptor, const MCExpr *NextVGPR, const MCExpr *NextSGPR, const MCExpr *ReserveVCC, const MCExpr *ReserveFlatScr)
AMDGPUPALMetadata * getPALMetadata()
virtual void EmitDirectiveAMDHSACodeObjectVersion(unsigned COV)
void initializeTargetID(const MCSubtargetInfo &STI)
virtual void EmitMCResourceInfo(const MCSymbol *NumVGPR, const MCSymbol *NumAGPR, const MCSymbol *NumExplicitSGPR, const MCSymbol *NumNamedBarrier, const MCSymbol *PrivateSegmentSize, const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch, const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall)
virtual bool EmitCodeEnd(const MCSubtargetInfo &STI)
virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type)
virtual void EmitAMDKernelCodeT(AMDGPU::AMDGPUMCKernelCodeT &Header)
const std::optional< AMDGPU::IsaInfo::AMDGPUTargetID > & getTargetID() const
virtual void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR, const MCSymbol *MaxSGPR, const MCSymbol *MaxNamedBarrier)
void setXnackSetting(TargetIDSetting NewXnackSetting)
Sets xnack setting to NewXnackSetting.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
Collects and handles AsmPrinter objects required to build debug or EH information.
This class is intended to be used as a driving class for all asm writers.
Definition AsmPrinter.h:91
const TargetLoweringObjectFile & getObjFileLowering() const
Return information about object file lowering.
MCSymbol * getSymbol(const GlobalValue *GV) const
virtual void emitGlobalVariable(const GlobalVariable *GV)
Emit the specified global variable to the .s file.
TargetMachine & TM
Target machine description.
Definition AsmPrinter.h:94
const MCAsmInfo * MAI
Target Asm Printer information.
Definition AsmPrinter.h:97
MachineFunction * MF
The current machine function.
Definition AsmPrinter.h:109
virtual void SetupMachineFunction(MachineFunction &MF)
This should be called when a new MachineFunction is being processed from runOnMachineFunction.
void emitFunctionBody()
This method emits the body and trailer for a function.
virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const
Return true if the basic block has exactly one predecessor and the control transfer mechanism between...
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
virtual void emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const
This emits linkage information about GVSym based on GV, if this is supported by the target.
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
unsigned getFunctionNumber() const
Return a unique ID for the current function.
MachineOptimizationRemarkEmitter * ORE
Optimization remark emitter.
Definition AsmPrinter.h:121
AsmPrinter(TargetMachine &TM, std::unique_ptr< MCStreamer > Streamer, char &ID=AsmPrinter::ID)
MCSymbol * CurrentFnSym
The symbol for the current function.
Definition AsmPrinter.h:128
MachineModuleInfo * MMI
This is a pointer to the current MachineModuleInfo.
Definition AsmPrinter.h:112
MCContext & OutContext
This is the context for the output file that we are streaming.
Definition AsmPrinter.h:101
bool doFinalization(Module &M) override
Shut down the asmprinter.
virtual void emitBasicBlockStart(const MachineBasicBlock &MBB)
Targets can override this to emit stuff at the start of a basic block.
void emitVisibility(MCSymbol *Sym, unsigned Visibility, bool IsDefinition=true) const
This emits visibility information about symbol, if this is supported by the target.
std::unique_ptr< MCStreamer > OutStreamer
This is the MCStreamer object for the file we are generating.
Definition AsmPrinter.h:106
bool isVerbose() const
Return true if assembly output should contain comments.
Definition AsmPrinter.h:310
void getNameWithPrefix(SmallVectorImpl< char > &Name, const GlobalValue *GV) const
virtual void emitFunctionEntryLabel()
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
void addAsmPrinterHandler(std::unique_ptr< AsmPrinterHandler > Handler)
virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &OS)
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
DISubprogram * getSubprogram() const
Get the attached subprogram.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
unsigned getMinNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize) const
bool isTgSplitEnabled() const
bool isCuModeEnabled() const
const AMDGPU::IsaInfo::AMDGPUTargetID & getTargetID() const
std::pair< unsigned, unsigned > computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
bool isWave32() const
bool supportsWGP() const
unsigned getMaxNumUserSGPRs() const
Generation getGeneration() const
unsigned getAddressableNumSGPRs() const
unsigned getMaxWaveScratchSize() const
bool hasPrivateSegmentBuffer() const
VisibilityTypes getVisibility() const
LLVM_ABI bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition Globals.cpp:337
unsigned getAddressSpace() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:141
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
bool hasInitializer() const
Definitions have initializers, declarations don't.
MaybeAlign getAlign() const
Returns the alignment of the given variable.
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
Definition Globals.cpp:569
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
MCCodeEmitter * getEmitterPtr() const
static const MCBinaryExpr * createAdd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:343
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:348
static const MCBinaryExpr * createOr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:408
static const MCBinaryExpr * createLOr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:378
static const MCBinaryExpr * createMul(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:398
static const MCBinaryExpr * createGT(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:363
static const MCBinaryExpr * createDiv(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:353
static const MCBinaryExpr * createShl(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:413
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Context object for machine code objects.
Definition MCContext.h:83
const MCObjectFileInfo * getObjectFileInfo() const
Definition MCContext.h:413
LLVM_ABI void reportError(SMLoc L, const Twine &Msg)
LLVM_ABI MCSymbol * getOrCreateSymbol(const Twine &Name)
Lookup the symbol inside with the specified Name.
Base class for the full range of assembler expressions which are needed for parsing.
Definition MCExpr.h:34
LLVM_ABI bool evaluateAsRelocatable(MCValue &Res, const MCAssembler *Asm) const
Try to evaluate the expression to a relocatable value, i.e.
Definition MCExpr.cpp:450
MCSection * getReadOnlySection() const
MCSection * getTextSection() const
MCContext & getContext() const
This represents a section on linux, lots of unix variants and some bare metal systems.
Instances of this class represent a uniqued identifier for a section in the current translation unit.
Definition MCSection.h:569
void ensureMinAlignment(Align MinAlignment)
Makes sure that Alignment is at least MinAlignment.
Definition MCSection.h:657
bool hasInstructions() const
Definition MCSection.h:665
MCContext & getContext() const
Definition MCStreamer.h:323
Generic base class for all target subtargets.
const Triple & getTargetTriple() const
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
bool isDefined() const
isDefined - Check if this symbol is defined (i.e., it has an address).
Definition MCSymbol.h:233
StringRef getName() const
getName - Get the symbol name.
Definition MCSymbol.h:188
bool isVariable() const
isVariable - Check if this is a variable symbol.
Definition MCSymbol.h:267
void redefineIfPossible()
Prepare this symbol to be redefined.
Definition MCSymbol.h:212
const MCExpr * getVariableValue() const
Get the expression of the variable symbol.
Definition MCSymbol.h:270
MCStreamer & getStreamer()
Definition MCStreamer.h:103
static const MCUnaryExpr * createNot(const MCExpr *Expr, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:273
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
MCContext & getContext() const
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
Representation of each machine instruction.
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
Diagnostic information for optimization analysis remarks.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
Wrapper class representing virtual and physical registers.
Definition Register.h:20
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
GCNUserSGPRUsageInfo & getUserSGPRInfo()
SIModeRegisterDefaults getMode() const
unsigned getScratchReservedForDynamicVGPRs() const
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:438
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM Value Representation.
Definition Value.h:75
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
A raw_ostream that writes to an SmallVector or SmallString.
StringRef str() const
Return a StringRef for the vector contents.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ LOCAL_ADDRESS
Address space for local memory.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI, std::optional< bool > EnableWavefrontSize32)
unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI)
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI)
unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed, bool FlatScrUsed, bool XNACKUsed)
unsigned getSGPREncodingGranule(const MCSubtargetInfo *STI)
unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI, unsigned DynamicVGPRBlockSize, std::optional< bool > EnableWavefrontSize32)
int32_t getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR)
void printAMDGPUMCExpr(const MCExpr *Expr, raw_ostream &OS, const MCAsmInfo *MAI)
LLVM_READNONE constexpr bool isModuleEntryFunctionCC(CallingConv::ID CC)
unsigned getLdsDwGranularity(const MCSubtargetInfo &ST)
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
const MCExpr * maskShiftSet(const MCExpr *Val, uint32_t Mask, uint32_t Shift, MCContext &Ctx)
Provided with the MCExpr * Val, uint32 Mask and Shift, will return the masked and left shifted,...
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isGFX90A(const MCSubtargetInfo &STI)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool hasMAIInsts(const MCSubtargetInfo &STI)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool isGFX11Plus(const MCSubtargetInfo &STI)
const MCExpr * foldAMDGPUMCExpr(const MCExpr *Expr, MCContext &Ctx)
bool isGFX10Plus(const MCSubtargetInfo &STI)
constexpr std::pair< unsigned, unsigned > getShiftMask(unsigned Value)
Deduce the least significant bit aligned shift and mask values for a binary Complement Value (as they...
unsigned hasKernargPreload(const MCSubtargetInfo &STI)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ SHT_PROGBITS
Definition ELF.h:1148
@ STT_AMDGPU_HSA_KERNEL
Definition ELF.h:1431
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
Target & getTheR600Target()
The target for R600 GPUs.
@ DK_ResourceLimit
AsmPrinter * createR600AsmPrinterPass(TargetMachine &TM, std::unique_ptr< MCStreamer > &&Streamer)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
@ Success
The lock was released successfully.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
Target & getTheGCNTarget()
The target for GCN GPUs.
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1917
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:177
Implement std::hash so that hash_code can be used in STL containers.
Definition BitVector.h:870
void validate(const MCSubtargetInfo *STI, MCContext &Ctx)
void initDefault(const MCSubtargetInfo *STI, MCContext &Ctx, bool InitMCExpr=true)
static const MCExpr * bits_get(const MCExpr *Src, uint32_t Shift, uint32_t Mask, MCContext &Ctx)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Track resource usage for kernels / entry functions.
const MCExpr * NumSGPR
const MCExpr * NumArchVGPR
uint64_t getFunctionCodeSize(const MachineFunction &MF, bool IsLowerBound=false)
const MCExpr * getComputePGMRSrc2(MCContext &Ctx) const
Compute the value of the ComputePGMRsrc2 register.
const MCExpr * VGPRBlocks
const MCExpr * ScratchBlocks
const MCExpr * ComputePGMRSrc3
const MCExpr * getComputePGMRSrc1(const GCNSubtarget &ST, MCContext &Ctx) const
Compute the value of the ComputePGMRsrc1 register.
const MCExpr * VCCUsed
const MCExpr * FlatUsed
const MCExpr * NamedBarCnt
const MCExpr * ScratchEnable
const MCExpr * AccumOffset
const MCExpr * NumAccVGPR
const MCExpr * DynamicCallStack
const MCExpr * SGPRBlocks
const MCExpr * NumVGPRsForWavesPerEU
const MCExpr * NumVGPR
const MCExpr * Occupancy
const MCExpr * ScratchSize
const MCExpr * NumSGPRsForWavesPerEU
static void RegisterAsmPrinter(Target &T, Target::AsmPrinterCtorTy Fn)
RegisterAsmPrinter - Register an AsmPrinter implementation for the given target.