1//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10///
11/// The AMDGPUAsmPrinter is used to print both the assembly string and the
12/// binary code. When passed an MCAsmStreamer it prints assembly and when passed
13/// an MCObjectStreamer it outputs binary code.
14//
15//===----------------------------------------------------------------------===//
16//
17
18#include "AMDGPUAsmPrinter.h"
19#include "AMDGPU.h"
23#include "GCNSubtarget.h"
28#include "R600AsmPrinter.h"
40#include "llvm/MC/MCAssembler.h"
41#include "llvm/MC/MCContext.h"
43#include "llvm/MC/MCStreamer.h"
44#include "llvm/MC/MCValue.h"
51
52using namespace llvm;
53using namespace llvm::AMDGPU;
54
55// This should get the default rounding mode from the kernel. We just set the
56// default here, but this could change if the OpenCL rounding mode pragmas are
57// used.
58//
59// The denormal mode here should match what is reported by the OpenCL runtime
60// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
61// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
62//
63// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
64// precision, and leaves single precision to flush all and does not report
65// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
66// CL_FP_DENORM for both.
67//
68// FIXME: It seems some instructions do not support single precision denormals
69// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
70// and sin_f32, cos_f32 on most parts).
71
72// We want to use these instructions, and using fp32 denormals also causes
73// instructions to run at the double precision rate for the device so it's
74// probably best to just report no single precision denormals.
81
82static AsmPrinter *
84 std::unique_ptr<MCStreamer> &&Streamer) {
85 return new AMDGPUAsmPrinter(tm, std::move(Streamer));
86}
87
95
97 std::unique_ptr<MCStreamer> Streamer)
98 : AsmPrinter(TM, std::move(Streamer)) {
99 assert(OutStreamer && "AsmPrinter constructed without streamer");
100}
101
103 return "AMDGPU Assembly Printer";
104}
105
107 return TM.getMCSubtargetInfo();
108}
109
111 if (!OutStreamer)
112 return nullptr;
113 return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
114}
115
119
120void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
122
123 // TODO: Which one is called first, emitStartOfAsmFile or
124 // emitFunctionBodyStart?
125 if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
126 initializeTargetID(M);
127
130 return;
131
133
136 CodeObjectVersion);
137 HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
138 }
139
142}
143
145 // Init target streamer if it has not yet happened
147 initTargetStreamer(M);
148
149 if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
151
152 // Emit HSA Metadata (NT_AMD_HSA_METADATA).
154 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
155 HSAMetadataStream->end();
156 bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
157 (void)Success;
158 assert(Success && "Malformed HSA Metadata");
159 }
160}
161
163 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
164 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
165 const Function &F = MF->getFunction();
166
167 // TODO: We're checking this late, would be nice to check it earlier.
168 if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
170 STM.getCPU() + " is only available on code object version 6 or better");
171 }
172
173 // TODO: Which one is called first, emitStartOfAsmFile or
174 // emitFunctionBodyStart?
175 if (!getTargetStreamer()->getTargetID())
176 initializeTargetID(*F.getParent());
177
178 const auto &FunctionTargetID = STM.getTargetID();
179 // Make sure function's xnack settings are compatible with module's
180 // xnack settings.
181 if (FunctionTargetID.isXnackSupported() &&
182 FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
183 FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) {
184 OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) +
185 "' function does not match module xnack setting");
186 return;
187 }
188 // Make sure function's sramecc settings are compatible with module's
189 // sramecc settings.
190 if (FunctionTargetID.isSramEccSupported() &&
191 FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
192 FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {
193 OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) +
194 "' function does not match module sramecc setting");
195 return;
196 }
197
198 if (!MFI.isEntryFunction())
199 return;
200
201 if (STM.isMesaKernel(F) &&
202 (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
203 F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
204 AMDGPUMCKernelCodeT KernelCode;
205 getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
206 KernelCode.validate(&STM, MF->getContext());
208 }
209
210 if (STM.isAmdHsaOS())
211 HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
212}
213
215 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
216 if (!MFI.isEntryFunction())
217 return;
218
219 if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
220 return;
221
222 auto &Streamer = getTargetStreamer()->getStreamer();
223 auto &Context = Streamer.getContext();
224 auto &ObjectFileInfo = *Context.getObjectFileInfo();
225 auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
226
227 Streamer.pushSection();
228 Streamer.switchSection(&ReadOnlySection);
229
230 // CP microcode requires the kernel descriptor to be allocated on 64 byte
231 // alignment.
232 Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
233 ReadOnlySection.ensureMinAlignment(Align(64));
234
235 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
236
237 SmallString<128> KernelName;
238 getNameWithPrefix(KernelName, &MF->getFunction());
240 STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
241 CurrentProgramInfo.NumVGPRsForWavesPerEU,
243 CurrentProgramInfo.NumSGPRsForWavesPerEU,
245 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
246 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Context),
247 Context),
248 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
249
250 Streamer.popSection();
251}
252
254 Register RegNo = MI->getOperand(0).getReg();
255
256 SmallString<128> Str;
257 raw_svector_ostream OS(Str);
258 OS << "implicit-def: "
259 << printReg(RegNo, MF->getSubtarget().getRegisterInfo());
260
261 if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
262 OS << " : SGPR spill to VGPR lane";
263
264 OutStreamer->AddComment(OS.str());
265 OutStreamer->addBlankLine();
266}
267
269 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
271 return;
272 }
273
274 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
275 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
276 if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
277 SmallString<128> SymbolName;
278 getNameWithPrefix(SymbolName, &MF->getFunction()),
280 SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
281 }
282 if (DumpCodeInstEmitter) {
283 // Disassemble function name label to text.
284 DisasmLines.push_back(MF->getName().str() + ":");
285 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
286 HexLines.emplace_back("");
287 }
288
290}
291
293 if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
294 // Write a line for the basic block label if it is not only fallthrough.
295 DisasmLines.push_back(
296 (Twine("BB") + Twine(getFunctionNumber())
297 + "_" + Twine(MBB.getNumber()) + ":").str());
298 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
299 HexLines.emplace_back("");
300 }
302}
303
306 if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
307 OutContext.reportError({},
308 Twine(GV->getName()) +
309 ": unsupported initializer for address space");
310 return;
311 }
312
313 // LDS variables aren't emitted in HSA or PAL yet.
314 const Triple::OSType OS = TM.getTargetTriple().getOS();
315 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
316 return;
317
318 MCSymbol *GVSym = getSymbol(GV);
319
320 GVSym->redefineIfPossible();
321 if (GVSym->isDefined() || GVSym->isVariable())
322 report_fatal_error("symbol '" + Twine(GVSym->getName()) +
323 "' is already defined");
324
325 const DataLayout &DL = GV->getDataLayout();
326 uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
327 Align Alignment = GV->getAlign().value_or(Align(4));
328
329 emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
330 emitLinkage(GV, GVSym);
331 auto *TS = getTargetStreamer();
332 TS->emitAMDGPULDS(GVSym, Size, Alignment);
333 return;
334 }
335
337}
338
340 CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);
341
342 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
343 switch (CodeObjectVersion) {
345 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV4>();
346 break;
348 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV5>();
349 break;
351 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV6>();
352 break;
353 default:
354 reportFatalUsageError("unsupported code object version");
355 }
356 }
357
359}
360
361/// Mimics GCNSubtarget::computeOccupancy for MCExpr.
362///
363/// Remove dependency on GCNSubtarget and depend only on the necessary values
364/// for said occupancy computation. Should match computeOccupancy implementation
365/// without passing \p STM on.
366const AMDGPUMCExpr *createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs,
367 const MCExpr *NumVGPRs,
368 unsigned DynamicVGPRBlockSize,
369 const GCNSubtarget &STM, MCContext &Ctx) {
370 unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM);
371 unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM, DynamicVGPRBlockSize);
372 unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM);
373 unsigned Generation = STM.getGeneration();
374
375 auto CreateExpr = [&Ctx](unsigned Value) {
376 return MCConstantExpr::create(Value, Ctx);
377 };
378
380 {CreateExpr(MaxWaves), CreateExpr(Granule),
381 CreateExpr(TargetTotalNumVGPRs),
382 CreateExpr(Generation), CreateExpr(InitOcc),
383 NumSGPRs, NumVGPRs},
384 Ctx);
385}
386
387void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
388 if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv()))
389 return;
390
392 const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
393 MCSymbol *FnSym = TM.getSymbol(&F);
394 bool IsLocal = F.hasLocalLinkage();
395
396 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
397 int64_t Val;
398 if (Value->evaluateAsAbsolute(Val)) {
399 Res = Val;
400 return true;
401 }
402 return false;
403 };
404
405 const uint64_t MaxScratchPerWorkitem =
407 MCSymbol *ScratchSizeSymbol = RI.getSymbol(
408 FnSym->getName(), RIK::RIK_PrivateSegSize, OutContext, IsLocal);
409 uint64_t ScratchSize;
410 if (ScratchSizeSymbol->isVariable() &&
411 TryGetMCExprValue(ScratchSizeSymbol->getVariableValue(), ScratchSize) &&
412 ScratchSize > MaxScratchPerWorkitem) {
413 DiagnosticInfoStackSize DiagStackSize(F, ScratchSize, MaxScratchPerWorkitem,
414 DS_Error);
415 F.getContext().diagnose(DiagStackSize);
416 }
417
418 // Validate addressable scalar registers (i.e., prior to added implicit
419 // SGPRs).
420 MCSymbol *NumSGPRSymbol =
421 RI.getSymbol(FnSym->getName(), RIK::RIK_NumSGPR, OutContext, IsLocal);
423 !STM.hasSGPRInitBug()) {
424 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
425 uint64_t NumSgpr;
426 if (NumSGPRSymbol->isVariable() &&
427 TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
428 NumSgpr > MaxAddressableNumSGPRs) {
429 DiagnosticInfoResourceLimit Diag(F, "addressable scalar registers",
430 NumSgpr, MaxAddressableNumSGPRs,
432 F.getContext().diagnose(Diag);
433 return;
434 }
435 }
436
437 MCSymbol *VCCUsedSymbol =
438 RI.getSymbol(FnSym->getName(), RIK::RIK_UsesVCC, OutContext, IsLocal);
439 MCSymbol *FlatUsedSymbol = RI.getSymbol(
440 FnSym->getName(), RIK::RIK_UsesFlatScratch, OutContext, IsLocal);
441 uint64_t VCCUsed, FlatUsed, NumSgpr;
442
443 if (NumSGPRSymbol->isVariable() && VCCUsedSymbol->isVariable() &&
444 FlatUsedSymbol->isVariable() &&
445 TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
446 TryGetMCExprValue(VCCUsedSymbol->getVariableValue(), VCCUsed) &&
447 TryGetMCExprValue(FlatUsedSymbol->getVariableValue(), FlatUsed)) {
448
449 // Recomputes NumSgprs + implicit SGPRs but all symbols should now be
450 // resolvable.
451 NumSgpr += IsaInfo::getNumExtraSGPRs(
452 &STM, VCCUsed, FlatUsed,
453 getTargetStreamer()->getTargetID()->isXnackOnOrAny());
455 STM.hasSGPRInitBug()) {
456 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
457 if (NumSgpr > MaxAddressableNumSGPRs) {
458 DiagnosticInfoResourceLimit Diag(F, "scalar registers", NumSgpr,
459 MaxAddressableNumSGPRs, DS_Error,
461 F.getContext().diagnose(Diag);
462 return;
463 }
464 }
465
466 MCSymbol *NumVgprSymbol =
467 RI.getSymbol(FnSym->getName(), RIK::RIK_NumVGPR, OutContext, IsLocal);
468 MCSymbol *NumAgprSymbol =
469 RI.getSymbol(FnSym->getName(), RIK::RIK_NumAGPR, OutContext, IsLocal);
470 uint64_t NumVgpr, NumAgpr;
471
472 MachineModuleInfo &MMI =
474 MachineFunction *MF = MMI.getMachineFunction(F);
475 if (MF && NumVgprSymbol->isVariable() && NumAgprSymbol->isVariable() &&
476 TryGetMCExprValue(NumVgprSymbol->getVariableValue(), NumVgpr) &&
477 TryGetMCExprValue(NumAgprSymbol->getVariableValue(), NumAgpr)) {
478 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
479 unsigned MaxWaves = MFI.getMaxWavesPerEU();
480 uint64_t TotalNumVgpr =
481 getTotalNumVGPRs(STM.hasGFX90AInsts(), NumAgpr, NumVgpr);
482 uint64_t NumVGPRsForWavesPerEU =
483 std::max({TotalNumVgpr, (uint64_t)1,
484 (uint64_t)STM.getMinNumVGPRs(
485 MaxWaves, MFI.getDynamicVGPRBlockSize())});
486 uint64_t NumSGPRsForWavesPerEU = std::max(
487 {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
488 const MCExpr *OccupancyExpr = createOccupancy(
489 STM.getOccupancyWithWorkGroupSizes(*MF).second,
490 MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
491 MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext),
493 uint64_t Occupancy;
494
495 const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
496 F, "amdgpu-waves-per-eu", {0, 0}, true);
497
498 if (TryGetMCExprValue(OccupancyExpr, Occupancy) && Occupancy < MinWEU) {
499 DiagnosticInfoOptimizationFailure Diag(
500 F, F.getSubprogram(),
501 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
502 "'" +
503 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
504 ", final occupancy is " + Twine(Occupancy));
505 F.getContext().diagnose(Diag);
506 return;
507 }
508 }
509 }
510}
511
513 // Pad with s_code_end to help tools and guard against instruction prefetch
514 // causing stale data in caches. Arguably this should be done by the linker,
515 // which is why this isn't done for Mesa.
516 // Don't do it if there is no code.
517 const MCSubtargetInfo &STI = *getGlobalSTI();
518 if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
522 if (TextSect->hasInstructions()) {
523 OutStreamer->switchSection(TextSect);
525 }
526 }
527
528 // Assign expressions which can only be resolved when all other functions are
529 // known.
530 RI.finalize(OutContext);
531
532 // Switch section and emit all GPR maximums within the processed module.
533 OutStreamer->pushSection();
534 MCSectionELF *MaxGPRSection =
535 OutContext.getELFSection(".AMDGPU.gpr_maximums", ELF::SHT_PROGBITS, 0);
536 OutStreamer->switchSection(MaxGPRSection);
538 RI.getMaxVGPRSymbol(OutContext), RI.getMaxAGPRSymbol(OutContext),
539 RI.getMaxSGPRSymbol(OutContext), RI.getMaxNamedBarrierSymbol(OutContext));
540 OutStreamer->popSection();
541
542 for (Function &F : M.functions())
543 validateMCResourceInfo(F);
544
545 RI.reset();
546
548}
549
550SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
551 SmallString<128> Str;
552 raw_svector_ostream OSS(Str);
553 auto &Streamer = getTargetStreamer()->getStreamer();
554 auto &Context = Streamer.getContext();
555 const MCExpr *New = foldAMDGPUMCExpr(Value, Context);
556 printAMDGPUMCExpr(New, OSS, MAI);
557 return Str;
558}
559
560// Print comments that apply to both callable functions and entry points.
561void AMDGPUAsmPrinter::emitCommonFunctionComments(
562 const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,
563 const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
564 const AMDGPUMachineFunction *MFI) {
565 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
566 OutStreamer->emitRawComment(" TotalNumSgprs: " + getMCExprStr(NumSGPR),
567 false);
568 OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false);
569 if (NumAGPR && TotalNumVGPR) {
570 OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false);
571 OutStreamer->emitRawComment(" TotalNumVgprs: " + getMCExprStr(TotalNumVGPR),
572 false);
573 }
574 OutStreamer->emitRawComment(" ScratchSize: " + getMCExprStr(ScratchSize),
575 false);
576 OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
577 false);
578}
579
580const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
581 const MachineFunction &MF) const {
582 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
583 MCContext &Ctx = MF.getContext();
584 uint16_t KernelCodeProperties = 0;
585 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
586
587 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
588 KernelCodeProperties |=
589 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
590 }
591 if (UserSGPRInfo.hasDispatchPtr()) {
592 KernelCodeProperties |=
593 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
594 }
595 if (UserSGPRInfo.hasQueuePtr()) {
596 KernelCodeProperties |=
597 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
598 }
599 if (UserSGPRInfo.hasKernargSegmentPtr()) {
600 KernelCodeProperties |=
601 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
602 }
603 if (UserSGPRInfo.hasDispatchID()) {
604 KernelCodeProperties |=
605 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
606 }
607 if (UserSGPRInfo.hasFlatScratchInit()) {
608 KernelCodeProperties |=
609 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
610 }
611 if (UserSGPRInfo.hasPrivateSegmentSize()) {
612 KernelCodeProperties |=
613 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
614 }
615 if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
616 KernelCodeProperties |=
617 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
618 }
619
620 // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
621 // un-evaluatable at this point so it cannot be conditionally checked here.
622 // Instead, we'll directly shift the possibly unknown MCExpr into its place
623 // and bitwise-or it into KernelCodeProperties.
624 const MCExpr *KernelCodePropExpr =
625 MCConstantExpr::create(KernelCodeProperties, Ctx);
626 const MCExpr *OrValue = MCConstantExpr::create(
627 amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx);
628 OrValue = MCBinaryExpr::createShl(CurrentProgramInfo.DynamicCallStack,
629 OrValue, Ctx);
630 KernelCodePropExpr = MCBinaryExpr::createOr(KernelCodePropExpr, OrValue, Ctx);
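// Net effect, as a sketch of the folded expression: the field becomes
// KernelCodeProperties | (DynamicCallStack << KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT),
// so the bit ends up set iff DynamicCallStack eventually evaluates to 1.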
631
632 return KernelCodePropExpr;
633}
634
635MCKernelDescriptor
636AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
637 const SIProgramInfo &PI) const {
638 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
639 const Function &F = MF.getFunction();
640 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
641 MCContext &Ctx = MF.getContext();
642
643 MCKernelDescriptor KernelDescriptor;
644
645 KernelDescriptor.group_segment_fixed_size =
647 KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
648
649 Align MaxKernArgAlign;
650 KernelDescriptor.kernarg_size = MCConstantExpr::create(
651 STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);
652
653 KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx);
654 KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx);
655 KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
656
657 int64_t PGRM_Rsrc3 = 1;
658 bool EvaluatableRsrc3 =
659 CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGRM_Rsrc3);
660 (void)PGRM_Rsrc3;
661 (void)EvaluatableRsrc3;
663 STM.hasGFX90AInsts() || STM.hasGFX1250Insts() || !EvaluatableRsrc3 ||
664 static_cast<uint64_t>(PGRM_Rsrc3) == 0);
665 KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3;
666
667 KernelDescriptor.kernarg_preload = MCConstantExpr::create(
668 AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
669 Ctx);
670
671 return KernelDescriptor;
672}
673
675 // Init target streamer lazily on the first function so that previous passes
676 // can set metadata.
678 initTargetStreamer(*MF.getFunction().getParent());
679
680 ResourceUsage =
682 CurrentProgramInfo.reset(MF);
683
684 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
685 MCContext &Ctx = MF.getContext();
686
687 // The starting address of all shader programs must be 256 bytes aligned.
688 // Regular functions just need the basic required instruction alignment.
689 MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
690
692
693 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
695 bool IsLocal = MF.getFunction().hasLocalLinkage();
696 // FIXME: This should be an explicit check for Mesa.
697 if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
698 MCSectionELF *ConfigSection =
699 Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
700 OutStreamer->switchSection(ConfigSection);
701 }
702
703 RI.gatherResourceInfo(MF, *ResourceUsage, OutContext);
704
705 if (MFI->isModuleEntryFunction()) {
706 getSIProgramInfo(CurrentProgramInfo, MF);
707 }
708
709 if (STM.isAmdPalOS()) {
710 if (MFI->isEntryFunction())
711 EmitPALMetadata(MF, CurrentProgramInfo);
712 else if (MFI->isModuleEntryFunction())
713 emitPALFunctionMetadata(MF);
714 } else if (!STM.isAmdHsaOS()) {
715 EmitProgramInfoSI(MF, CurrentProgramInfo);
716 }
717
718 DumpCodeInstEmitter = nullptr;
719 if (STM.dumpCode()) {
720 // For -dumpcode, get the assembler out of the streamer. This only works
721 // with -filetype=obj.
722 MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
723 if (Assembler)
724 DumpCodeInstEmitter = Assembler->getEmitterPtr();
725 }
726
727 DisasmLines.clear();
728 HexLines.clear();
730
732
733 emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
734 STM.hasMAIInsts());
735
736 {
739 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext,
740 IsLocal),
741 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR, OutContext,
742 IsLocal),
743 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumSGPR, OutContext,
744 IsLocal),
745 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumNamedBarrier,
746 OutContext, IsLocal),
747 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
748 OutContext, IsLocal),
749 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesVCC, OutContext,
750 IsLocal),
751 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesFlatScratch,
752 OutContext, IsLocal),
753 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasDynSizedStack,
754 OutContext, IsLocal),
755 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasRecursion, OutContext,
756 IsLocal),
757 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasIndirectCall,
758 OutContext, IsLocal));
759 }
760
761 // Emit _dvgpr$ symbol when appropriate.
762 emitDVgprSymbol(MF);
763
764 if (isVerbose()) {
765 MCSectionELF *CommentSection =
766 Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
767 OutStreamer->switchSection(CommentSection);
768
769 if (!MFI->isEntryFunction()) {
771 OutStreamer->emitRawComment(" Function info:", false);
772
773 emitCommonFunctionComments(
774 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext,
775 IsLocal)
776 ->getVariableValue(),
777 STM.hasMAIInsts()
778 ? RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR,
779 OutContext, IsLocal)
780 ->getVariableValue()
781 : nullptr,
782 RI.createTotalNumVGPRs(MF, Ctx),
783 RI.createTotalNumSGPRs(
784 MF,
785 MF.getSubtarget<GCNSubtarget>().getTargetID().isXnackOnOrAny(),
786 Ctx),
787 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
788 OutContext, IsLocal)
789 ->getVariableValue(),
790 CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
791 return false;
792 }
793
794 OutStreamer->emitRawComment(" Kernel info:", false);
795 emitCommonFunctionComments(
796 CurrentProgramInfo.NumArchVGPR,
797 STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
798 CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
799 CurrentProgramInfo.ScratchSize,
800 CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
801
802 OutStreamer->emitRawComment(
803 " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
804 OutStreamer->emitRawComment(
805 " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
806 OutStreamer->emitRawComment(
807 " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
808 " bytes/workgroup (compile time only)", false);
809
810 OutStreamer->emitRawComment(
811 " SGPRBlocks: " + getMCExprStr(CurrentProgramInfo.SGPRBlocks), false);
812
813 OutStreamer->emitRawComment(
814 " VGPRBlocks: " + getMCExprStr(CurrentProgramInfo.VGPRBlocks), false);
815
816 OutStreamer->emitRawComment(
817 " NumSGPRsForWavesPerEU: " +
818 getMCExprStr(CurrentProgramInfo.NumSGPRsForWavesPerEU),
819 false);
820 OutStreamer->emitRawComment(
821 " NumVGPRsForWavesPerEU: " +
822 getMCExprStr(CurrentProgramInfo.NumVGPRsForWavesPerEU),
823 false);
824
825 if (STM.hasGFX90AInsts()) {
826 const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd(
827 CurrentProgramInfo.AccumOffset, MCConstantExpr::create(1, Ctx), Ctx);
828 AdjustedAccum = MCBinaryExpr::createMul(
829 AdjustedAccum, MCConstantExpr::create(4, Ctx), Ctx);
830 OutStreamer->emitRawComment(
831 " AccumOffset: " + getMCExprStr(AdjustedAccum), false);
832 }
833
834 if (STM.hasGFX1250Insts())
835 OutStreamer->emitRawComment(
836 " NamedBarCnt: " + getMCExprStr(CurrentProgramInfo.NamedBarCnt),
837 false);
838
839 OutStreamer->emitRawComment(
840 " Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false);
841
842 OutStreamer->emitRawComment(
843 " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
844
845 OutStreamer->emitRawComment(
846 " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
847 getMCExprStr(CurrentProgramInfo.ScratchEnable),
848 false);
849 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
850 Twine(CurrentProgramInfo.UserSGPR),
851 false);
852 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
853 Twine(CurrentProgramInfo.TrapHandlerEnable),
854 false);
855 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
856 Twine(CurrentProgramInfo.TGIdXEnable),
857 false);
858 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
859 Twine(CurrentProgramInfo.TGIdYEnable),
860 false);
861 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
862 Twine(CurrentProgramInfo.TGIdZEnable),
863 false);
864 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
865 Twine(CurrentProgramInfo.TIdIGCompCount),
866 false);
867
868 [[maybe_unused]] int64_t PGMRSrc3;
870 STM.hasGFX90AInsts() || STM.hasGFX1250Insts() ||
871 (CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGMRSrc3) &&
872 static_cast<uint64_t>(PGMRSrc3) == 0));
873 if (STM.hasGFX90AInsts()) {
874 OutStreamer->emitRawComment(
875 " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
876 getMCExprStr(MCKernelDescriptor::bits_get(
877 CurrentProgramInfo.ComputePGMRSrc3,
878 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
879 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
880 false);
881 OutStreamer->emitRawComment(
882 " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
883 getMCExprStr(MCKernelDescriptor::bits_get(
884 CurrentProgramInfo.ComputePGMRSrc3,
885 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
886 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
887 false);
888 }
889 }
890
891 if (DumpCodeInstEmitter) {
892
893 OutStreamer->switchSection(
894 Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));
895
896 for (size_t i = 0; i < DisasmLines.size(); ++i) {
897 std::string Comment = "\n";
898 if (!HexLines[i].empty()) {
899 Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
900 Comment += " ; " + HexLines[i] + "\n";
901 }
902
903 OutStreamer->emitBytes(StringRef(DisasmLines[i]));
904 OutStreamer->emitBytes(StringRef(Comment));
905 }
906 }
907
908 return false;
909}
910
911// When appropriate, add a _dvgpr$ symbol, with the value of the function
912// symbol, plus an offset encoding one less than the number of VGPR blocks used
913// by the function in bits 5..3 of the symbol value. A "VGPR block" can be
914// either 16 VGPRs (for a max of 128), or 32 VGPRs (for a max of 256). This is
915// used by a front-end to have functions that are chained rather than called,
916// and a dispatcher that dynamically resizes the VGPR count before dispatching
917// to a function.
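// Worked example (illustrative values): with 16-VGPR blocks, a function using
// 40 VGPRs needs ceil(40 / 16) = 3 blocks, so the _dvgpr$ symbol is assigned the
// function symbol value plus (3 - 1) << 3 = 0x10.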
918void AMDGPUAsmPrinter::emitDVgprSymbol(MachineFunction &MF) {
920 if (MFI.isDynamicVGPREnabled() &&
922 MCContext &Ctx = MF.getContext();
923 unsigned BlockSize = MFI.getDynamicVGPRBlockSize();
924 MCValue NumVGPRs;
925 if (!CurrentProgramInfo.NumVGPRsForWavesPerEU->evaluateAsRelocatable(
926 NumVGPRs, nullptr) ||
927 !NumVGPRs.isAbsolute()) {
928 llvm_unreachable("unable to resolve NumVGPRs for _dvgpr$ symbol");
929 }
930 // Calculate number of VGPR blocks.
931 // Treat 0 VGPRs as 1 VGPR to avoid underflowing.
932 unsigned NumBlocks =
933 divideCeil(std::max(unsigned(NumVGPRs.getConstant()), 1U), BlockSize);
934
935 if (NumBlocks > 8) {
937 "too many DVGPR blocks for _dvgpr$ symbol for '" +
938 Twine(CurrentFnSym->getName()) + "'");
939 return;
940 }
941 unsigned EncodedNumBlocks = (NumBlocks - 1) << 3;
942 // Add to function symbol to create _dvgpr$ symbol.
943 const MCExpr *DVgprFuncVal = MCBinaryExpr::createAdd(
945 MCConstantExpr::create(EncodedNumBlocks, Ctx), Ctx);
946 MCSymbol *DVgprFuncSym =
947 Ctx.getOrCreateSymbol(Twine("_dvgpr$") + CurrentFnSym->getName());
948 OutStreamer->emitAssignment(DVgprFuncSym, DVgprFuncVal);
949 emitVisibility(DVgprFuncSym, MF.getFunction().getVisibility());
950 emitLinkage(&MF.getFunction(), DVgprFuncSym);
951 }
952}
953
954// TODO: Fold this into emitFunctionBodyStart.
955void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
956 // In the beginning all features are either 'Any' or 'NotSupported',
957 // depending on global target features. This will cover empty modules.
959 getGlobalSTI()->getFeatureString());
960
961 // If module is empty, we are done.
962 if (M.empty())
963 return;
964
965 // If module is not empty, need to find first 'Off' or 'On' feature
966 // setting per feature from functions in module.
967 for (auto &F : M) {
968 auto &TSTargetID = getTargetStreamer()->getTargetID();
969 if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
970 (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
971 break;
972
973 const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
974 const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
975 if (TSTargetID->isXnackSupported())
976 if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
977 TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
978 if (TSTargetID->isSramEccSupported())
979 if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
980 TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
981 }
982}
983
984// AccumOffset computed for the MCExpr equivalent of:
985// alignTo(std::max(1, NumVGPR), 4) / 4 - 1;
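// For example: NumVGPR = 0 -> max(1, 0) = 1 -> alignTo(1, 4) = 4 -> 4 / 4 - 1 = 0,
// and NumVGPR = 10 -> alignTo(10, 4) = 12 -> 12 / 4 - 1 = 2.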
986static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) {
987 const MCExpr *ConstFour = MCConstantExpr::create(4, Ctx);
988 const MCExpr *ConstOne = MCConstantExpr::create(1, Ctx);
989
990 // Can't be lower than 1 for subsequent alignTo.
991 const MCExpr *MaximumTaken =
992 AMDGPUMCExpr::createMax({ConstOne, NumVGPR}, Ctx);
993
994 // Practically, it's computing divideCeil(MaximumTaken, 4).
995 const MCExpr *DivCeil = MCBinaryExpr::createDiv(
996 AMDGPUMCExpr::createAlignTo(MaximumTaken, ConstFour, Ctx), ConstFour,
997 Ctx);
998
999 return MCBinaryExpr::createSub(DivCeil, ConstOne, Ctx);
1000}
1001
1002void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
1003 const MachineFunction &MF) {
1004 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1005 bool IsLocal = MF.getFunction().hasLocalLinkage();
1006 MCContext &Ctx = MF.getContext();
1007
1008 auto CreateExpr = [&Ctx](int64_t Value) {
1009 return MCConstantExpr::create(Value, Ctx);
1010 };
1011
1012 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
1013 int64_t Val;
1014 if (Value->evaluateAsAbsolute(Val)) {
1015 Res = Val;
1016 return true;
1017 }
1018 return false;
1019 };
1020
1021 auto GetSymRefExpr =
1022 [&](MCResourceInfo::ResourceInfoKind RIK) -> const MCExpr * {
1023 MCSymbol *Sym =
1024 RI.getSymbol(CurrentFnSym->getName(), RIK, OutContext, IsLocal);
1025 return MCSymbolRefExpr::create(Sym, Ctx);
1026 };
1027
1029 ProgInfo.NumArchVGPR = GetSymRefExpr(RIK::RIK_NumVGPR);
1030 ProgInfo.NumAccVGPR = GetSymRefExpr(RIK::RIK_NumAGPR);
1032 ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
1033
1034 ProgInfo.AccumOffset = computeAccumOffset(ProgInfo.NumArchVGPR, Ctx);
1035 ProgInfo.TgSplit = STM.isTgSplitEnabled();
1036 ProgInfo.NumSGPR = GetSymRefExpr(RIK::RIK_NumSGPR);
1037 ProgInfo.ScratchSize = GetSymRefExpr(RIK::RIK_PrivateSegSize);
1038 ProgInfo.VCCUsed = GetSymRefExpr(RIK::RIK_UsesVCC);
1039 ProgInfo.FlatUsed = GetSymRefExpr(RIK::RIK_UsesFlatScratch);
1040 ProgInfo.DynamicCallStack =
1041 MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack),
1042 GetSymRefExpr(RIK::RIK_HasRecursion), Ctx);
1043
1044 const MCExpr *BarBlkConst = MCConstantExpr::create(4, Ctx);
1045 const MCExpr *AlignToBlk = AMDGPUMCExpr::createAlignTo(
1046 GetSymRefExpr(RIK::RIK_NumNamedBarrier), BarBlkConst, Ctx);
1047 ProgInfo.NamedBarCnt = MCBinaryExpr::createDiv(AlignToBlk, BarBlkConst, Ctx);
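// i.e. NamedBarCnt = alignTo(NumNamedBarrier, 4) / 4; for example, 5 named
// barriers round up to 8 and yield a count of 2.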
1048
1049 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1050
1051 // The calculations related to SGPR/VGPR blocks are
1052 // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
1053 // unified.
1054 const MCExpr *ExtraSGPRs = AMDGPUMCExpr::createExtraSGPRs(
1055 ProgInfo.VCCUsed, ProgInfo.FlatUsed,
1056 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);
1057
1058 // Check the addressable register limit before we add ExtraSGPRs.
1060 !STM.hasSGPRInitBug()) {
1061 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
1062 uint64_t NumSgpr;
1063 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
1064 NumSgpr > MaxAddressableNumSGPRs) {
1065 // This can happen due to a compiler bug or when using inline asm.
1066 LLVMContext &Ctx = MF.getFunction().getContext();
1067 DiagnosticInfoResourceLimit Diag(
1068 MF.getFunction(), "addressable scalar registers", NumSgpr,
1069 MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
1070 Ctx.diagnose(Diag);
1071 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1);
1072 }
1073 }
1074
1075 // Account for extra SGPRs and VGPRs reserved for debugger use.
1076 ProgInfo.NumSGPR = MCBinaryExpr::createAdd(ProgInfo.NumSGPR, ExtraSGPRs, Ctx);
1077
1078 const Function &F = MF.getFunction();
1079
1080 // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
1081 // dispatch registers are function args.
1082 unsigned WaveDispatchNumSGPR = MFI->getNumWaveDispatchSGPRs(),
1083 WaveDispatchNumVGPR = MFI->getNumWaveDispatchVGPRs();
1084
1085 if (WaveDispatchNumSGPR) {
1087 {ProgInfo.NumSGPR,
1088 MCBinaryExpr::createAdd(CreateExpr(WaveDispatchNumSGPR), ExtraSGPRs,
1089 Ctx)},
1090 Ctx);
1091 }
1092
1093 if (WaveDispatchNumVGPR) {
1095 {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
1096
1098 ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
1099 }
1100
1101 // Adjust number of registers used to meet default/requested minimum/maximum
1102 // number of waves per execution unit request.
1103 unsigned MaxWaves = MFI->getMaxWavesPerEU();
1104 ProgInfo.NumSGPRsForWavesPerEU =
1105 AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, CreateExpr(1ul),
1106 CreateExpr(STM.getMinNumSGPRs(MaxWaves))},
1107 Ctx);
1108 ProgInfo.NumVGPRsForWavesPerEU =
1109 AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul),
1110 CreateExpr(STM.getMinNumVGPRs(
1111 MaxWaves, MFI->getDynamicVGPRBlockSize()))},
1112 Ctx);
1113
1115 STM.hasSGPRInitBug()) {
1116 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
1117 uint64_t NumSgpr;
1118 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
1119 NumSgpr > MaxAddressableNumSGPRs) {
1120 // This can happen due to a compiler bug or when using inline asm to use
1121 // the registers which are usually reserved for vcc etc.
1122 LLVMContext &Ctx = MF.getFunction().getContext();
1123 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
1124 NumSgpr, MaxAddressableNumSGPRs,
1126 Ctx.diagnose(Diag);
1127 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs);
1128 ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs);
1129 }
1130 }
1131
1132 if (STM.hasSGPRInitBug()) {
1133 ProgInfo.NumSGPR =
1135 ProgInfo.NumSGPRsForWavesPerEU =
1137 }
1138
1139 if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
1140 LLVMContext &Ctx = MF.getFunction().getContext();
1141 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
1142 MFI->getNumUserSGPRs(),
1144 Ctx.diagnose(Diag);
1145 }
1146
1147 if (MFI->getLDSSize() > STM.getAddressableLocalMemorySize()) {
1148 LLVMContext &Ctx = MF.getFunction().getContext();
1149 DiagnosticInfoResourceLimit Diag(
1150 MF.getFunction(), "local memory", MFI->getLDSSize(),
1152 Ctx.diagnose(Diag);
1153 }
1154 // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
1155 // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
1156 auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,
1157 unsigned Granule) {
1158 const MCExpr *OneConst = CreateExpr(1ul);
1159 const MCExpr *GranuleConst = CreateExpr(Granule);
1160 const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax({NumGPR, OneConst}, Ctx);
1161 const MCExpr *AlignToGPR =
1162 AMDGPUMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
1163 const MCExpr *DivGPR =
1164 MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);
1165 const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
1166 return SubGPR;
1167 };
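// For example (granule value illustrative): NumGPR = 37 with an encoding
// granule of 8 gives alignTo(max(37, 1), 8) = 40, and 40 / 8 - 1 = 4 blocks.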
1168 // GFX10+ will always allocate 128 SGPRs and this field must be 0
1170 ProgInfo.SGPRBlocks = CreateExpr(0ul);
1171 } else {
1172 ProgInfo.SGPRBlocks = GetNumGPRBlocks(
1174 }
1175 ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,
1177
1178 const SIModeRegisterDefaults Mode = MFI->getMode();
1179
1180 // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
1181 // register.
1182 ProgInfo.FloatMode = getFPMode(Mode);
1183
1184 ProgInfo.IEEEMode = Mode.IEEE;
1185
1186 // Make the clamp modifier return 0 on NaN input.
1187 ProgInfo.DX10Clamp = Mode.DX10Clamp;
1188
1189 unsigned LDSAlignShift = 8;
1190 switch (getLdsDwGranularity(STM)) {
1191 case 512:
1192 case 320:
1193 LDSAlignShift = 11;
1194 break;
1195 case 128:
1196 LDSAlignShift = 9;
1197 break;
1198 case 64:
1199 LDSAlignShift = 8;
1200 break;
1201 default:
1202 llvm_unreachable("invalid LDS block size");
1203 }
1204
1205 ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
1206 ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
1207
1208 ProgInfo.LDSSize = MFI->getLDSSize();
1209 ProgInfo.LDSBlocks =
1210 alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
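// For example, with LDSAlignShift = 8 (256-byte granularity), LDSSize = 1000
// rounds up to 1024 and gives LDSBlocks = 1024 >> 8 = 4.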
1211
1212 // The MCExpr equivalent of divideCeil.
1213 auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {
1214 const MCExpr *Ceil =
1215 AMDGPUMCExpr::createAlignTo(Numerator, Denominator, Ctx);
1216 return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx);
1217 };
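// i.e. DivideCeil(N, D) builds alignTo(N, D) / D; for example N = 1000, D = 256
// gives 1024 / 256 = 4.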
1218
1219 // Scratch is allocated in 64-dword or 256-dword blocks.
1220 unsigned ScratchAlignShift =
1221 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
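// A shift of 8 corresponds to 256-byte (64-dword) blocks on GFX11+, a shift of
// 10 to 1024-byte (256-dword) blocks on earlier generations.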
1222 // We need to program the hardware with the amount of scratch memory that
1223 // is used by the entire wave. ProgInfo.ScratchSize is the amount of
1224 // scratch memory used per thread.
1225 ProgInfo.ScratchBlocks = DivideCeil(
1227 CreateExpr(STM.getWavefrontSize()), Ctx),
1228 CreateExpr(1ULL << ScratchAlignShift));
1229
1230 if (STM.supportsWGP()) {
1231 ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
1232 }
1233
1234 if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
1235 ProgInfo.MemOrdered = 1;
1236 ProgInfo.FwdProgress = 1;
1237 }
1238
1239 // 0 = X, 1 = XY, 2 = XYZ
1240 unsigned TIDIGCompCnt = 0;
1241 if (MFI->hasWorkItemIDZ())
1242 TIDIGCompCnt = 2;
1243 else if (MFI->hasWorkItemIDY())
1244 TIDIGCompCnt = 1;
1245
1246 // The private segment wave byte offset is the last of the system SGPRs. We
1247 // initially assumed it was allocated, and may have used it. It shouldn't harm
1248 // anything to disable it if we know the stack isn't used here. We may still
1249 // have emitted code reading it to initialize scratch, but if that's unused
1250 // reading garbage should be OK.
1253 MCConstantExpr::create(0, Ctx), Ctx),
1254 ProgInfo.DynamicCallStack, Ctx);
1255
1256 ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
1257 // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
1258 ProgInfo.TrapHandlerEnable = STM.isAmdHsaOS() ? 0 : STM.hasTrapHandler();
1259 ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
1260 ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
1261 ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
1262 ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
1263 ProgInfo.TIdIGCompCount = TIDIGCompCnt;
1264 ProgInfo.EXCPEnMSB = 0;
1265 // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
1266 ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
1267 ProgInfo.EXCPEnable = 0;
1268
1269 // return ((Dst & ~Mask) | (Value << Shift))
1270 auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
1271 uint32_t Shift) {
1272 const auto *Shft = MCConstantExpr::create(Shift, Ctx);
1273 const auto *Msk = MCConstantExpr::create(Mask, Ctx);
1274 Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
1276 Ctx);
1277 return Dst;
1278 };
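// For example (illustrative values): Dst = 0xFF, Value = 2, Mask = 0x30,
// Shift = 4 gives (0xFF & ~0x30) | (2 << 4) = 0xCF | 0x20 = 0xEF.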
1279
1280 if (STM.hasGFX90AInsts()) {
1281 ProgInfo.ComputePGMRSrc3 =
1282 SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset,
1283 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
1284 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
1285 ProgInfo.ComputePGMRSrc3 =
1286 SetBits(ProgInfo.ComputePGMRSrc3, CreateExpr(ProgInfo.TgSplit),
1287 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
1288 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
1289 }
1290
1291 if (STM.hasGFX1250Insts())
1292 ProgInfo.ComputePGMRSrc3 =
1293 SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt,
1294 amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT,
1295 amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT);
1296
1297 ProgInfo.Occupancy = createOccupancy(
1298 STM.computeOccupancy(F, ProgInfo.LDSSize).second,
1300 MFI->getDynamicVGPRBlockSize(), STM, Ctx);
1301
1302 const auto [MinWEU, MaxWEU] =
1303 AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
1304 uint64_t Occupancy;
1305 if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {
1306 DiagnosticInfoOptimizationFailure Diag(
1307 F, F.getSubprogram(),
1308 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
1309 "'" +
1310 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
1311 ", final occupancy is " + Twine(Occupancy));
1312 F.getContext().diagnose(Diag);
1313 }
1314
1315 if (isGFX11Plus(STM)) {
1316 uint32_t CodeSizeInBytes = (uint32_t)std::min(
1317 ProgInfo.getFunctionCodeSize(MF, true /* IsLowerBound */),
1318 (uint64_t)std::numeric_limits<uint32_t>::max());
1319 uint32_t CodeSizeInLines = divideCeil(CodeSizeInBytes, 128);
1320 uint32_t Field, Shift, Width;
1321 if (isGFX11(STM)) {
1322 Field = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
1323 Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
1324 Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
1325 } else {
1326 Field = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
1327 Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
1328 Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
1329 }
1330 uint64_t InstPrefSize = std::min(CodeSizeInLines, (1u << Width) - 1);
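// For example, 3000 bytes of code gives divideCeil(3000, 128) = 24 cache lines;
// with a hypothetical 6-bit field the clamp would be min(24, 63) = 24.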
1331 ProgInfo.ComputePGMRSrc3 = SetBits(ProgInfo.ComputePGMRSrc3,
1332 CreateExpr(InstPrefSize), Field, Shift);
1333 }
1334}
1335
1348
1349void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
1350 const SIProgramInfo &CurrentProgramInfo) {
1351 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1352 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1353 unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
1354 MCContext &Ctx = MF.getContext();
1355
1356 // (((Value) & Mask) << Shift)
1357 auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {
1358 const MCExpr *msk = MCConstantExpr::create(Mask, Ctx);
1359 const MCExpr *shft = MCConstantExpr::create(Shift, Ctx);
1361 shft, Ctx);
1362 };
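// For example (illustrative values): Value = 0x1234, Mask = 0x1FFF, Shift = 12
// gives (0x1234 & 0x1FFF) << 12 = 0x1234000.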
1363
1364 auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) {
1365 int64_t Val;
1366 if (Value->evaluateAsAbsolute(Val))
1367 OutStreamer->emitIntValue(static_cast<uint64_t>(Val), Size);
1368 else
1369 OutStreamer->emitValue(Value, Size);
1370 };
1371
1372 if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
1374
1375 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx),
1376 /*Size=*/4);
1377
1379 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4);
1380
1382
1383 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1384 // appropriate generation.
1385 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1386 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1387 /*Mask=*/0x3FFFF, /*Shift=*/12),
1388 /*Size=*/4);
1389 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1390 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1391 /*Mask=*/0x7FFF, /*Shift=*/12),
1392 /*Size=*/4);
1393 } else {
1394 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1395 /*Mask=*/0x1FFF, /*Shift=*/12),
1396 /*Size=*/4);
1397 }
1398
1399 // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
1400 // 0" comment but I don't see a corresponding field in the register spec.
1401 } else {
1402 OutStreamer->emitInt32(RsrcReg);
1403
1404 const MCExpr *GPRBlocks = MCBinaryExpr::createOr(
1405 SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0),
1406 SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6),
1407 MF.getContext());
1408 EmitResolvedOrExpr(GPRBlocks, /*Size=*/4);
1410
1411 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1412 // appropriate generation.
1413 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1414 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1415 /*Mask=*/0x3FFFF, /*Shift=*/12),
1416 /*Size=*/4);
1417 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1418 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1419 /*Mask=*/0x7FFF, /*Shift=*/12),
1420 /*Size=*/4);
1421 } else {
1422 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1423 /*Mask=*/0x1FFF, /*Shift=*/12),
1424 /*Size=*/4);
1425 }
1426 }
1427
1428 if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
1430 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1431 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1432 : CurrentProgramInfo.LDSBlocks;
1433 OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1435 OutStreamer->emitInt32(MFI->getPSInputEnable());
1437 OutStreamer->emitInt32(MFI->getPSInputAddr());
1438 }
1439
1440 OutStreamer->emitInt32(R_SPILLED_SGPRS);
1441 OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
1442 OutStreamer->emitInt32(R_SPILLED_VGPRS);
1443 OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
1444}
1445
1446// Helper function to add common PAL Metadata 3.0+
1448 const SIProgramInfo &CurrentProgramInfo,
1449 CallingConv::ID CC, const GCNSubtarget &ST,
1450 unsigned DynamicVGPRBlockSize) {
1451 if (ST.hasIEEEMode())
1452 MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
1453
1454 MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
1455 MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
1456 MD->setHwStage(CC, ".forward_progress", (bool)CurrentProgramInfo.FwdProgress);
1457
1458 if (AMDGPU::isCompute(CC)) {
1459 MD->setHwStage(CC, ".trap_present",
1460 (bool)CurrentProgramInfo.TrapHandlerEnable);
1461 MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
1462
1463 if (DynamicVGPRBlockSize != 0)
1464 MD->setComputeRegisters(".dynamic_vgpr_en", true);
1465 }
1466
1468 CC, ".lds_size",
1469 (unsigned)(CurrentProgramInfo.LdsSize * getLdsDwGranularity(ST) *
1470 sizeof(uint32_t)));
1471}
1472
1473// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
1474// is AMDPAL. It stores each compute/SPI register setting and other PAL
1475// metadata items into the PALMD::Metadata, combining with any provided by the
1476// frontend as LLVM metadata. Once all functions are written, the PAL metadata
1477// is then written as a single block in the .note section.
1478void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
1479 const SIProgramInfo &CurrentProgramInfo) {
1480 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1481 auto CC = MF.getFunction().getCallingConv();
1482 auto *MD = getTargetStreamer()->getPALMetadata();
1483 auto &Ctx = MF.getContext();
1484
1485 MD->setEntryPoint(CC, MF.getFunction().getName());
1486 MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx);
1487
1488 // For targets that support dynamic VGPRs, set the number of saved dynamic
1489 // VGPRs (if any) in the PAL metadata.
1490 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1491 if (MFI->isDynamicVGPREnabled() &&
1493 MD->setHwStage(CC, ".dynamic_vgpr_saved_count",
1495
1496 // Only set AGPRs for supported devices
1497 if (STM.hasMAIInsts()) {
1498 MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
1499 }
1500
1501 MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx);
1502 if (MD->getPALMajorVersion() < 3) {
1503 MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM, Ctx), Ctx);
1504 if (AMDGPU::isCompute(CC)) {
1505 MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
1506 } else {
1507 const MCExpr *HasScratchBlocks =
1508 MCBinaryExpr::createGT(CurrentProgramInfo.ScratchBlocks,
1509 MCConstantExpr::create(0, Ctx), Ctx);
1510 auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN);
1511 MD->setRsrc2(CC, maskShiftSet(HasScratchBlocks, Mask, Shift, Ctx), Ctx);
1512 }
1513 } else {
1514 MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
1515 MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,
1516 CurrentProgramInfo.ScratchEnable);
1517 EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM,
1519 }
1520
1521 // ScratchSize is in bytes, 16 aligned.
1522 MD->setScratchSize(
1523 CC,
1524 AMDGPUMCExpr::createAlignTo(CurrentProgramInfo.ScratchSize,
1525 MCConstantExpr::create(16, Ctx), Ctx),
1526 Ctx);
1527
1528 if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
1529 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1530 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1531 : CurrentProgramInfo.LDSBlocks;
1532 if (MD->getPALMajorVersion() < 3) {
1533 MD->setRsrc2(
1534 CC,
1536 Ctx);
1537 MD->setSpiPsInputEna(MFI->getPSInputEnable());
1538 MD->setSpiPsInputAddr(MFI->getPSInputAddr());
1539 } else {
1540 // Graphics registers
1541 const unsigned ExtraLdsDwGranularity =
1542 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
1543 MD->setGraphicsRegisters(
1544 ".ps_extra_lds_size",
1545 (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));
1546
1547 // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
1548 static StringLiteral const PsInputFields[] = {
1549 ".persp_sample_ena", ".persp_center_ena",
1550 ".persp_centroid_ena", ".persp_pull_model_ena",
1551 ".linear_sample_ena", ".linear_center_ena",
1552 ".linear_centroid_ena", ".line_stipple_tex_ena",
1553 ".pos_x_float_ena", ".pos_y_float_ena",
1554 ".pos_z_float_ena", ".pos_w_float_ena",
1555 ".front_face_ena", ".ancillary_ena",
1556 ".sample_coverage_ena", ".pos_fixed_pt_ena"};
1557 unsigned PSInputEna = MFI->getPSInputEnable();
1558 unsigned PSInputAddr = MFI->getPSInputAddr();
1559 for (auto [Idx, Field] : enumerate(PsInputFields)) {
1560 MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
1561 (bool)((PSInputEna >> Idx) & 1));
1562 MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
1563 (bool)((PSInputAddr >> Idx) & 1));
1564 }
1565 }
1566 }
1567
1568 // For version 3 and above the wave front size is already set in the metadata
1569 if (MD->getPALMajorVersion() < 3 && STM.isWave32())
1570 MD->setWave32(MF.getFunction().getCallingConv());
1571}
1572
1573void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1574 auto *MD = getTargetStreamer()->getPALMetadata();
1575 const MachineFrameInfo &MFI = MF.getFrameInfo();
1576 StringRef FnName = MF.getFunction().getName();
1577 MD->setFunctionScratchSize(FnName, MFI.getStackSize());
1578 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1579 MCContext &Ctx = MF.getContext();
1580
1581 if (MD->getPALMajorVersion() < 3) {
1582 // Set compute registers
1583 MD->setRsrc1(
1585 CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST, Ctx), Ctx);
1586 MD->setRsrc2(CallingConv::AMDGPU_CS,
1587 CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
1588 } else {
1590 MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST,
1591 MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize());
1592 }
1593
1594 // Set optional info
1595 MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
1596 MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1597 MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
1598}
1599
1600// This is supposed to be log2(Size)
1602 switch (Size) {
1603 case 4:
1604 return AMD_ELEMENT_4_BYTES;
1605 case 8:
1606 return AMD_ELEMENT_8_BYTES;
1607 case 16:
1608 return AMD_ELEMENT_16_BYTES;
1609 default:
1610 llvm_unreachable("invalid private_element_size");
1611 }
1612}
1613
1614void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
1615 const SIProgramInfo &CurrentProgramInfo,
1616 const MachineFunction &MF) const {
1617 const Function &F = MF.getFunction();
1618 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1619 F.getCallingConv() == CallingConv::SPIR_KERNEL);
1620
1621 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1622 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1623 MCContext &Ctx = MF.getContext();
1624
1625 Out.initDefault(&STM, Ctx, /*InitMCExpr=*/false);
1626
1628 CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx);
1630 CurrentProgramInfo.getComputePGMRSrc2(Ctx);
1632
1633 Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack;
1634
1636 getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
1637
1638 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
1639 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
1641 }
1642
1643 if (UserSGPRInfo.hasDispatchPtr())
1645
1646 if (UserSGPRInfo.hasQueuePtr())
1648
1649 if (UserSGPRInfo.hasKernargSegmentPtr())
1650 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
1651
1652 if (UserSGPRInfo.hasDispatchID())
1653 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
1654
1655 if (UserSGPRInfo.hasFlatScratchInit())
1656 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
1657
1658 if (UserSGPRInfo.hasPrivateSegmentSize())
1659 Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
1660
1661 if (STM.isXNACKEnabled())
1662 Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
1663
1664 Align MaxKernArgAlign;
1665 Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
1666 Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1667 Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1668 Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1669 Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1670
1671 // kernarg_segment_alignment is specified as the log2 of the alignment.
1672 // The minimum alignment is 16.
1673 // FIXME: The metadata treats the minimum as 4?
1674 Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
1675}
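// A minimal, standalone sketch (illustrative only; encodeKernargAlignment is
// not an LLVM helper) of the kernarg_segment_alignment encoding noted above:
// the field stores log2 of the segment alignment, with the alignment clamped
// to a minimum of 16 bytes.
#include <algorithm>
#include <cstdint>

static unsigned encodeKernargAlignment(uint64_t MaxKernArgAlign) {
  uint64_t A = std::max<uint64_t>(16, MaxKernArgAlign); // minimum alignment is 16
  unsigned L = 0;
  while ((uint64_t(1) << (L + 1)) <= A) // integer log2 (A is a power of two)
    ++L;
  return L; // e.g. a max kernarg alignment of 8 encodes as 4, of 64 as 6
}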
1676
1677 bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
1678 const char *ExtraCode, raw_ostream &O) {
1679 // First try the generic code, which knows about modifiers like 'c' and 'n'.
1680 if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
1681 return false;
1682
1683 if (ExtraCode && ExtraCode[0]) {
1684 if (ExtraCode[1] != 0)
1685 return true; // Unknown modifier.
1686
1687 switch (ExtraCode[0]) {
1688 case 'r':
1689 break;
1690 default:
1691 return true;
1692 }
1693 }
1694
1695 // TODO: Should be able to support other operand types like globals.
1696 const MachineOperand &MO = MI->getOperand(OpNo);
1697 if (MO.isReg()) {
1698 AMDGPUInstPrinter::printRegOperand(MO.getReg(), O,
1699 *MF->getSubtarget().getRegisterInfo());
1700 return false;
1701 }
1702 if (MO.isImm()) {
1703 int64_t Val = MO.getImm();
1704 if (AMDGPU::isInlinableIntLiteral(Val)) {
1705 O << Val;
1706 } else if (isUInt<16>(Val)) {
1707 O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
1708 } else if (isUInt<32>(Val)) {
1709 O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
1710 } else {
1711 O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
1712 }
1713 return false;
1714 }
1715 return true;
1716}
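// A minimal, standalone sketch (illustrative only; printImmLikeAbove is not an
// LLVM helper) of the immediate-printing branches above: inlinable integer
// literals are printed in decimal, other values in hex at the narrowest of
// 16/32/64 bits.
#include <cinttypes>
#include <cstdint>
#include <cstdio>

static void printImmLikeAbove(int64_t Val, bool IsInlinableLiteral) {
  if (IsInlinableLiteral)
    std::printf("%" PRId64, Val);             // e.g. 42 -> "42"
  else if (Val >= 0 && Val <= UINT16_MAX)
    std::printf("0x%" PRIx16, (uint16_t)Val); // e.g. 0xbeef -> "0xbeef"
  else if (Val >= 0 && Val <= UINT32_MAX)
    std::printf("0x%" PRIx32, (uint32_t)Val); // e.g. 70000 -> "0x11170"
  else
    std::printf("0x%" PRIx64, (uint64_t)Val); // everything else as 64-bit hex
}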
1717
1725
1726void AMDGPUAsmPrinter::emitResourceUsageRemarks(
1727 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
1728 bool isModuleEntryFunction, bool hasMAIInsts) {
1729 if (!ORE)
1730 return;
1731
1732 const char *Name = "kernel-resource-usage";
1733 const char *Indent = " ";
1734
1735 // If the remark is not specifically enabled, do not output to YAML.
1736 LLVMContext &Ctx = MF.getFunction().getContext();
1737 if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(Name))
1738 return;
1739
1740 // Currently non-kernel functions have no resources to emit.
1741 if (!isEntryFunctionCC(MF.getFunction().getCallingConv()))
1742 return;
1743
1744 auto EmitResourceUsageRemark = [&](StringRef RemarkName,
1745 StringRef RemarkLabel, auto Argument) {
1746 // Add an indent for every line besides the line with the kernel name. This
1747 // makes it easier to tell which resource usage goes with which kernel since
1748 // the kernel name will always be displayed first.
1749 std::string LabelStr = RemarkLabel.str() + ": ";
1750 if (RemarkName != "FunctionName")
1751 LabelStr = Indent + LabelStr;
1752
1753 ORE->emit([&]() {
1754 return MachineOptimizationRemarkAnalysis(Name, RemarkName,
1755 MF.getFunction().getSubprogram(),
1756 &MF.front())
1757 << LabelStr << ore::NV(RemarkName, Argument);
1758 });
1759 };
1760
1761 // FIXME: Formatting here is pretty nasty because clang does not accept
1762 // newlines from diagnostics. This forces us to emit multiple diagnostic
1763 // remarks to simulate newlines. If and when clang does accept newlines, this
1764 // formatting should be aggregated into one remark with newlines to avoid
1765 // printing the diagnostic location and diag opts multiple times.
1766 EmitResourceUsageRemark("FunctionName", "Function Name",
1767 MF.getFunction().getName());
1768 EmitResourceUsageRemark("NumSGPR", "TotalSGPRs",
1769 getMCExprStr(CurrentProgramInfo.NumSGPR));
1770 EmitResourceUsageRemark("NumVGPR", "VGPRs",
1771 getMCExprStr(CurrentProgramInfo.NumArchVGPR));
1772 if (hasMAIInsts) {
1773 EmitResourceUsageRemark("NumAGPR", "AGPRs",
1774 getMCExprStr(CurrentProgramInfo.NumAccVGPR));
1775 }
1776 EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
1777 getMCExprStr(CurrentProgramInfo.ScratchSize));
1778 int64_t DynStack;
1779 bool DynStackEvaluatable =
1780 CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(DynStack);
1781 StringRef DynamicStackStr =
1782 DynStackEvaluatable && DynStack ? "True" : "False";
1783 EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
1784 EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
1785 getMCExprStr(CurrentProgramInfo.Occupancy));
1786 EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
1787 CurrentProgramInfo.SGPRSpill);
1788 EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
1789 CurrentProgramInfo.VGPRSpill);
1790 if (isModuleEntryFunction)
1791 EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
1792 CurrentProgramInfo.LDSSize);
1793}
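// These analysis remarks are keyed on the "kernel-resource-usage" name checked
// via isAnalysisRemarkEnabled() above; with clang they are typically requested
// with -Rpass-analysis=kernel-resource-usage. Because of the newline limitation
// described in the FIXME, each field is emitted as its own one-line remark.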
1794
1795char AMDGPUAsmPrinter::ID = 0;
1796
1797INITIALIZE_PASS(AMDGPUAsmPrinter, "amdgpu-asm-printer",
1798 "AMDGPU Assembly Printer", false, false)