LLVM 19.0.0git
AMDGPUAsmPrinter.cpp
Go to the documentation of this file.
1//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10///
11/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
12/// code. When passed an MCAsmStreamer it prints assembly and when passed
13/// an MCObjectStreamer it outputs binary code.
14//
15//===----------------------------------------------------------------------===//
16//
17
18#include "AMDGPUAsmPrinter.h"
19#include "AMDGPU.h"
22#include "GCNSubtarget.h"
27#include "R600AsmPrinter.h"
38#include "llvm/MC/MCAssembler.h"
39#include "llvm/MC/MCContext.h"
41#include "llvm/MC/MCStreamer.h"
47
48using namespace llvm;
49using namespace llvm::AMDGPU;
50
51// This should get the default rounding mode from the kernel. We just set the
52// default here, but this could change if the OpenCL rounding mode pragmas are
53// used.
54//
55// The denormal mode here should match what is reported by the OpenCL runtime
56// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
57// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
58//
59// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
60// precision, and leaves single precision to flush all and does not report
61// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
62// CL_FP_DENORM for both.
63//
64// FIXME: It seems some instructions do not support single precision denormals
65// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
66// and sin_f32, cos_f32 on most parts).
67
68// We want to use these instructions, and using fp32 denormals also causes
69// instructions to run at the double precision rate for the device so it's
70// probably best to just report no single precision denormals.
74 FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |
75 FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
76}
77
78static AsmPrinter *
80 std::unique_ptr<MCStreamer> &&Streamer) {
81 return new AMDGPUAsmPrinter(tm, std::move(Streamer));
82}
83
89}
90
92 std::unique_ptr<MCStreamer> Streamer)
93 : AsmPrinter(TM, std::move(Streamer)) {
94 assert(OutStreamer && "AsmPrinter constructed without streamer");
95}
96
98 return "AMDGPU Assembly Printer";
99}
100
102 return TM.getMCSubtargetInfo();
103}
104
106 if (!OutStreamer)
107 return nullptr;
108 return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
109}
110
113}
114
// Prepare the target streamer for module M.
//
// From the statements visible here: when a target streamer exists and has no
// target ID yet, initializeTargetID(M) is called; further down the HSA
// metadata stream is started with the streamer's target ID.
//
// NOTE(review): several source lines are absent from this listing, so the
// conditions guarding the early return and the metadata block, as well as the
// statements at the start and end of the body, are not visible -- confirm
// against the full file.
115void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
117
118 // TODO: Which one is called first, emitStartOfAsmFile or
119 // emitFunctionBodyStart?
120 if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
121 initializeTargetID(M);
122
125 return;
126
128
131 CodeObjectVersion);
// Begin metadata collection for this module using the streamer's target ID.
132 HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
133 }
134
137}
138
139uint64_t AMDGPUAsmPrinter::getMCExprValue(const MCExpr *Value, MCContext &Ctx) {
140 int64_t Val;
141 if (!Value->evaluateAsAbsolute(Val)) {
142 Ctx.reportError(SMLoc(), "could not resolve expression when required.");
143 return 0;
144 }
145 return static_cast<uint64_t>(Val);
146}
147
149 // Init target streamer if it has not yet happened
151 initTargetStreamer(M);
152
155
156 // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
157 // Emit HSA Metadata (NT_AMD_HSA_METADATA).
159 HSAMetadataStream->end();
160 bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
161 (void)Success;
162 assert(Success && "Malformed HSA Metadata");
163 }
164}
165
168 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
169 const Function &F = MF->getFunction();
170
171 // TODO: We're checking this late, would be nice to check it earlier.
172 if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
174 STM.getCPU() + " is only available on code object version 6 or better",
175 /*gen_crash_diag*/ false);
176 }
177
178 // TODO: Which one is called first, emitStartOfAsmFile or
179 // emitFunctionBodyStart?
180 if (!getTargetStreamer()->getTargetID())
181 initializeTargetID(*F.getParent());
182
183 const auto &FunctionTargetID = STM.getTargetID();
184 // Make sure function's xnack settings are compatible with module's
185 // xnack settings.
186 if (FunctionTargetID.isXnackSupported() &&
187 FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
188 FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) {
189 OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) +
190 "' function does not match module xnack setting");
191 return;
192 }
193 // Make sure function's sramecc settings are compatible with module's
194 // sramecc settings.
195 if (FunctionTargetID.isSramEccSupported() &&
196 FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
197 FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {
198 OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) +
199 "' function does not match module sramecc setting");
200 return;
201 }
202
203 if (!MFI.isEntryFunction())
204 return;
205
206 if (STM.isMesaKernel(F) &&
207 (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
208 F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
209 AMDGPUMCKernelCodeT KernelCode;
210 getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
211 KernelCode.validate(&STM, MF->getContext());
213 }
214
215 if (STM.isAmdHsaOS())
216 HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
217
218 if (MFI.getNumKernargPreloadedSGPRs() > 0) {
221 STM.isAmdHsaOS());
222 }
223}
224
227 if (!MFI.isEntryFunction())
228 return;
229
231 return;
232
233 auto &Streamer = getTargetStreamer()->getStreamer();
234 auto &Context = Streamer.getContext();
235 auto &ObjectFileInfo = *Context.getObjectFileInfo();
236 auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
237
238 Streamer.pushSection();
239 Streamer.switchSection(&ReadOnlySection);
240
241 // CP microcode requires the kernel descriptor to be allocated on 64 byte
242 // alignment.
243 Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
244 ReadOnlySection.ensureMinAlignment(Align(64));
245
246 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
247
248 SmallString<128> KernelName;
249 getNameWithPrefix(KernelName, &MF->getFunction());
251 STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
252 getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Context),
253 getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Context) -
255 &STM, getMCExprValue(CurrentProgramInfo.VCCUsed, Context),
256 getMCExprValue(CurrentProgramInfo.FlatUsed, Context),
257 getTargetStreamer()->getTargetID()->isXnackOnOrAny()),
258 getMCExprValue(CurrentProgramInfo.VCCUsed, Context),
259 getMCExprValue(CurrentProgramInfo.FlatUsed, Context));
260
261 Streamer.popSection();
262}
263
265 Register RegNo = MI->getOperand(0).getReg();
266
269 OS << "implicit-def: "
270 << printReg(RegNo, MF->getSubtarget().getRegisterInfo());
271
272 if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
273 OS << " : SGPR spill to VGPR lane";
274
275 OutStreamer->AddComment(OS.str());
276 OutStreamer->addBlankLine();
277}
278
282 return;
283 }
284
286 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
287 if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
288 SmallString<128> SymbolName;
289 getNameWithPrefix(SymbolName, &MF->getFunction()),
291 SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
292 }
293 if (DumpCodeInstEmitter) {
294 // Disassemble function name label to text.
295 DisasmLines.push_back(MF->getName().str() + ":");
296 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
297 HexLines.push_back("");
298 }
299
301}
302
304 if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
305 // Write a line for the basic block label if it is not only fallthrough.
306 DisasmLines.push_back(
307 (Twine("BB") + Twine(getFunctionNumber())
308 + "_" + Twine(MBB.getNumber()) + ":").str());
309 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
310 HexLines.push_back("");
311 }
313}
314
317 if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
319 Twine(GV->getName()) +
320 ": unsupported initializer for address space");
321 return;
322 }
323
324 // LDS variables aren't emitted in HSA or PAL yet.
326 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
327 return;
328
329 MCSymbol *GVSym = getSymbol(GV);
330
331 GVSym->redefineIfPossible();
332 if (GVSym->isDefined() || GVSym->isVariable())
333 report_fatal_error("symbol '" + Twine(GVSym->getName()) +
334 "' is already defined");
335
336 const DataLayout &DL = GV->getParent()->getDataLayout();
337 uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
338 Align Alignment = GV->getAlign().value_or(Align(4));
339
340 emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
341 emitLinkage(GV, GVSym);
342 auto TS = getTargetStreamer();
343 TS->emitAMDGPULDS(GVSym, Size, Alignment);
344 return;
345 }
346
348}
349
351 CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);
352
354 switch (CodeObjectVersion) {
356 HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV4());
357 break;
359 HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV5());
360 break;
362 HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV6());
363 break;
364 default:
365 report_fatal_error("Unexpected code object version");
366 }
367 }
369}
370
372 // Pad with s_code_end to help tools and guard against instruction prefetch
373 // causing stale data in caches. Arguably this should be done by the linker,
374 // which is why this isn't done for Mesa.
375 const MCSubtargetInfo &STI = *getGlobalSTI();
376 if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
379 OutStreamer->switchSection(getObjFileLowering().getTextSection());
381 }
382
384}
385
386// Print comments that apply to both callable functions and entry points.
387void AMDGPUAsmPrinter::emitCommonFunctionComments(
388 uint32_t NumVGPR, std::optional<uint32_t> NumAGPR, uint32_t TotalNumVGPR,
389 uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize,
390 const AMDGPUMachineFunction *MFI) {
391 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
392 OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
393 OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
394 if (NumAGPR) {
395 OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false);
396 OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR),
397 false);
398 }
399 OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
400 OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
401 false);
402}
403
// Render Value as text.
//
// If the expression evaluates to an absolute value, that value is written as
// an unsigned 64-bit integer; otherwise the expression itself is printed via
// MCExpr::print using MAI.
//
// NOTE(review): the declaration of Str is absent from this listing; presumably
// a SmallString<128> matching the return type -- confirm against the full file.
404SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
406 raw_svector_ostream OSS(Str);
407 int64_t IVal;
408 if (Value->evaluateAsAbsolute(IVal)) {
409 OSS << static_cast<uint64_t>(IVal);
410 } else {
// Not resolvable to a constant: print the expression itself instead.
411 Value->print(OSS, MAI);
412 }
413 return Str;
414}
415
416void AMDGPUAsmPrinter::emitCommonFunctionComments(
417 const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,
418 const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
419 const AMDGPUMachineFunction *MFI) {
420 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
421 OutStreamer->emitRawComment(" NumSgprs: " + getMCExprStr(NumSGPR), false);
422 OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false);
423 if (NumAGPR && TotalNumVGPR) {
424 OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false);
425 OutStreamer->emitRawComment(" TotalNumVgprs: " + getMCExprStr(TotalNumVGPR),
426 false);
427 }
428 OutStreamer->emitRawComment(" ScratchSize: " + getMCExprStr(ScratchSize),
429 false);
430 OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
431 false);
432}
433
// Build the kernel code properties bitmask for MF.
//
// Starts from 0 and ORs in one amdhsa::KERNEL_CODE_PROPERTY_* flag for each
// user SGPR feature reported by the function's GCNUserSGPRUsageInfo, then adds
// the dynamic-stack flag when applicable. Returns the combined mask.
//
// NOTE(review): the declaration of MFI and the condition guarding the
// WAVEFRONT_SIZE32 flag are absent from this listing -- confirm against the
// full file.
434uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
435 const MachineFunction &MF) const {
437 uint16_t KernelCodeProperties = 0;
438 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
439
440 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
441 KernelCodeProperties |=
442 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
443 }
444 if (UserSGPRInfo.hasDispatchPtr()) {
445 KernelCodeProperties |=
446 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
447 }
// The queue pointer flag is only set for code object versions below 5.
448 if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) {
449 KernelCodeProperties |=
450 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
451 }
452 if (UserSGPRInfo.hasKernargSegmentPtr()) {
453 KernelCodeProperties |=
454 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
455 }
456 if (UserSGPRInfo.hasDispatchID()) {
457 KernelCodeProperties |=
458 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
459 }
460 if (UserSGPRInfo.hasFlatScratchInit()) {
461 KernelCodeProperties |=
462 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
463 }
465 KernelCodeProperties |=
466 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
467 }
468
// USES_DYNAMIC_STACK requires both a non-zero DynamicCallStack value and code
// object version 5 or newer.
469 if (getMCExprValue(CurrentProgramInfo.DynamicCallStack, MF.getContext()) &&
470 CodeObjectVersion >= AMDGPU::AMDHSA_COV5)
471 KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK;
472
473 return KernelCodeProperties;
474}
475
// Fill in and return an MCKernelDescriptor for MF.
//
// Fields are populated from PI (scratch size, compute_pgm_rsrc1/2), from the
// subtarget (kernarg segment size), from getAmdhsaKernelCodeProperties(MF) and
// from CurrentProgramInfo (compute_pgm_rsrc3).
//
// NOTE(review): the return-type line, the declarations of STM and Info, and
// the right-hand side of group_segment_fixed_size are absent from this
// listing -- confirm against the full file.
477AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
478 const SIProgramInfo &PI) const {
480 const Function &F = MF.getFunction();
482 MCContext &Ctx = MF.getContext();
483
484 MCKernelDescriptor KernelDescriptor;
485
486 KernelDescriptor.group_segment_fixed_size =
488 KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
489
// MaxKernArgAlign is only passed to getKernArgSegmentSize; it is not read
// afterwards in this function.
490 Align MaxKernArgAlign;
491 KernelDescriptor.kernarg_size = MCConstantExpr::create(
492 STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);
493
494 KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx);
495 KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx);
496 KernelDescriptor.kernel_code_properties =
497 MCConstantExpr::create(getAmdhsaKernelCodeProperties(MF), Ctx);
498
// Without GFX90A instructions the rsrc3 expression must evaluate to zero.
// NOTE(review): this reads CurrentProgramInfo rather than the PI parameter
// used above -- confirm that is intended.
499 assert(STM.hasGFX90AInsts() ||
500 getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx) == 0);
501 KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A;
502
// The preloaded-SGPR count is recorded only when the subtarget has kernarg
// preload; otherwise 0 is stored.
503 KernelDescriptor.kernarg_preload = MCConstantExpr::create(
504 AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
505 Ctx);
506
507 return KernelDescriptor;
508}
509
511 // Init target streamer lazily on the first function so that previous passes
512 // can set metadata.
514 initTargetStreamer(*MF.getFunction().getParent());
515
516 ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
517 CurrentProgramInfo.reset(MF);
518
520 MCContext &Ctx = MF.getContext();
521
522 // The starting address of all shader programs must be 256 bytes aligned.
523 // Regular functions just need the basic required instruction alignment.
524 MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
525
527
530 // FIXME: This should be an explicit check for Mesa.
531 if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
532 MCSectionELF *ConfigSection =
533 Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
534 OutStreamer->switchSection(ConfigSection);
535 }
536
537 if (MFI->isModuleEntryFunction()) {
538 getSIProgramInfo(CurrentProgramInfo, MF);
539 }
540
541 if (STM.isAmdPalOS()) {
542 if (MFI->isEntryFunction())
543 EmitPALMetadata(MF, CurrentProgramInfo);
544 else if (MFI->isModuleEntryFunction())
545 emitPALFunctionMetadata(MF);
546 } else if (!STM.isAmdHsaOS()) {
547 EmitProgramInfoSI(MF, CurrentProgramInfo);
548 }
549
550 DumpCodeInstEmitter = nullptr;
551 if (STM.dumpCode()) {
552 // For -dumpcode, get the assembler out of the streamer. This only works
553 // with -filetype=obj.
554 MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
555 if (Assembler)
556 DumpCodeInstEmitter = Assembler->getEmitterPtr();
557 }
558
559 DisasmLines.clear();
560 HexLines.clear();
562
564
565 emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
566 STM.hasMAIInsts());
567
568 if (isVerbose()) {
569 MCSectionELF *CommentSection =
570 Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
571 OutStreamer->switchSection(CommentSection);
572
573 if (!MFI->isEntryFunction()) {
574 OutStreamer->emitRawComment(" Function info:", false);
576 ResourceUsage->getResourceInfo(&MF.getFunction());
577 emitCommonFunctionComments(
578 Info.NumVGPR,
579 STM.hasMAIInsts() ? Info.NumAGPR : std::optional<uint32_t>(),
580 Info.getTotalNumVGPRs(STM),
581 Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
582 Info.PrivateSegmentSize, getFunctionCodeSize(MF), MFI);
583 return false;
584 }
585
586 OutStreamer->emitRawComment(" Kernel info:", false);
587 emitCommonFunctionComments(
588 CurrentProgramInfo.NumArchVGPR,
589 STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
590 CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
591 CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI);
592
593 OutStreamer->emitRawComment(
594 " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
595 OutStreamer->emitRawComment(
596 " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
597 OutStreamer->emitRawComment(
598 " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
599 " bytes/workgroup (compile time only)", false);
600
601 OutStreamer->emitRawComment(
602 " SGPRBlocks: " + getMCExprStr(CurrentProgramInfo.SGPRBlocks), false);
603
604 OutStreamer->emitRawComment(
605 " VGPRBlocks: " + getMCExprStr(CurrentProgramInfo.VGPRBlocks), false);
606
607 OutStreamer->emitRawComment(
608 " NumSGPRsForWavesPerEU: " +
609 getMCExprStr(CurrentProgramInfo.NumSGPRsForWavesPerEU),
610 false);
611 OutStreamer->emitRawComment(
612 " NumVGPRsForWavesPerEU: " +
613 getMCExprStr(CurrentProgramInfo.NumVGPRsForWavesPerEU),
614 false);
615
616 if (STM.hasGFX90AInsts()) {
617 const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd(
618 CurrentProgramInfo.AccumOffset, MCConstantExpr::create(1, Ctx), Ctx);
619 AdjustedAccum = MCBinaryExpr::createMul(
620 AdjustedAccum, MCConstantExpr::create(4, Ctx), Ctx);
621 OutStreamer->emitRawComment(
622 " AccumOffset: " + getMCExprStr(AdjustedAccum), false);
623 }
624
625 OutStreamer->emitRawComment(
626 " Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false);
627
628 OutStreamer->emitRawComment(
629 " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
630
631 OutStreamer->emitRawComment(
632 " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
633 getMCExprStr(CurrentProgramInfo.ScratchEnable),
634 false);
635 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
636 Twine(CurrentProgramInfo.UserSGPR),
637 false);
638 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
639 Twine(CurrentProgramInfo.TrapHandlerEnable),
640 false);
641 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
642 Twine(CurrentProgramInfo.TGIdXEnable),
643 false);
644 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
645 Twine(CurrentProgramInfo.TGIdYEnable),
646 false);
647 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
648 Twine(CurrentProgramInfo.TGIdZEnable),
649 false);
650 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
651 Twine(CurrentProgramInfo.TIdIGCompCount),
652 false);
653
654 [[maybe_unused]] int64_t PGMRSrc3;
655 assert(STM.hasGFX90AInsts() ||
656 (CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(
657 PGMRSrc3) &&
658 static_cast<uint64_t>(PGMRSrc3) == 0));
659 if (STM.hasGFX90AInsts()) {
660 OutStreamer->emitRawComment(
661 " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
662 getMCExprStr(MCKernelDescriptor::bits_get(
663 CurrentProgramInfo.ComputePGMRSrc3GFX90A,
664 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
665 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
666 false);
667 OutStreamer->emitRawComment(
668 " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
669 getMCExprStr(MCKernelDescriptor::bits_get(
670 CurrentProgramInfo.ComputePGMRSrc3GFX90A,
671 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
672 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
673 false);
674 }
675 }
676
677 if (DumpCodeInstEmitter) {
678
679 OutStreamer->switchSection(
680 Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));
681
682 for (size_t i = 0; i < DisasmLines.size(); ++i) {
683 std::string Comment = "\n";
684 if (!HexLines[i].empty()) {
685 Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
686 Comment += " ; " + HexLines[i] + "\n";
687 }
688
689 OutStreamer->emitBytes(StringRef(DisasmLines[i]));
690 OutStreamer->emitBytes(StringRef(Comment));
691 }
692 }
693
694 return false;
695}
696
697// TODO: Fold this into emitFunctionBodyStart.
// Establish the target streamer's target ID for module M.
//
// After an initial set-up from the global subtarget's feature string, the
// functions of M are scanned: for each of xnack and sramecc that is supported
// and still 'Any', the setting is taken from the function's subtarget target
// ID. The scan stops once every supported feature is On or Off.
//
// NOTE(review): the start of the initial set-up call and the declaration of
// STM inside the loop are absent from this listing -- confirm against the
// full file.
698void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
699 // In the beginning all features are either 'Any' or 'NotSupported',
700 // depending on global target features. This will cover empty modules.
702 getGlobalSTI()->getFeatureString());
703
704 // If module is empty, we are done.
705 if (M.empty())
706 return;
707
708 // If module is not empty, need to find first 'Off' or 'On' feature
709 // setting per feature from functions in module.
710 for (auto &F : M) {
711 auto &TSTargetID = getTargetStreamer()->getTargetID();
// Stop as soon as no supported feature is left undecided.
712 if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
713 (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
714 break;
715
717 const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
// Only a setting that is still 'Any' is overwritten.
718 if (TSTargetID->isXnackSupported())
719 if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
720 TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
721 if (TSTargetID->isSramEccSupported())
722 if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
723 TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
724 }
725}
726
// Sum the sizes, in bytes, of all instructions in MF.
//
// Walks every basic block and instruction, skipping debug instructions, and
// accumulates SIInstrInfo::getInstSizeInBytes for the rest. Returns the total.
//
// NOTE(review): the declaration of STM is absent from this listing -- confirm
// against the full file.
727uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
729 const SIInstrInfo *TII = STM.getInstrInfo();
730
731 uint64_t CodeSize = 0;
732
733 for (const MachineBasicBlock &MBB : MF) {
734 for (const MachineInstr &MI : MBB) {
735 // TODO: CodeSize should account for multiple functions.
736
737 // TODO: Should we count size of debug info?
738 if (MI.isDebugInstr())
739 continue;
740
741 CodeSize += TII->getInstSizeInBytes(MI);
742 }
743 }
744
745 return CodeSize;
746}
747
748void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
749 const MachineFunction &MF) {
751 ResourceUsage->getResourceInfo(&MF.getFunction());
753 MCContext &Ctx = MF.getContext();
754
755 auto CreateExpr = [&Ctx](int64_t Value) {
756 return MCConstantExpr::create(Value, Ctx);
757 };
758
759 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
760 int64_t Val;
761 if (Value->evaluateAsAbsolute(Val)) {
762 Res = Val;
763 return true;
764 }
765 return false;
766 };
767
768 ProgInfo.NumArchVGPR = CreateExpr(Info.NumVGPR);
769 ProgInfo.NumAccVGPR = CreateExpr(Info.NumAGPR);
770 ProgInfo.NumVGPR = CreateExpr(Info.getTotalNumVGPRs(STM));
771 ProgInfo.AccumOffset =
772 CreateExpr(alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1);
773 ProgInfo.TgSplit = STM.isTgSplitEnabled();
774 ProgInfo.NumSGPR = CreateExpr(Info.NumExplicitSGPR);
775 ProgInfo.ScratchSize = CreateExpr(Info.PrivateSegmentSize);
776 ProgInfo.VCCUsed = CreateExpr(Info.UsesVCC);
777 ProgInfo.FlatUsed = CreateExpr(Info.UsesFlatScratch);
778 ProgInfo.DynamicCallStack =
779 CreateExpr(Info.HasDynamicallySizedStack || Info.HasRecursion);
780
781 const uint64_t MaxScratchPerWorkitem =
783 uint64_t ScratchSize;
784 if (TryGetMCExprValue(ProgInfo.ScratchSize, ScratchSize) &&
785 ScratchSize > MaxScratchPerWorkitem) {
786 DiagnosticInfoStackSize DiagStackSize(MF.getFunction(), ScratchSize,
787 MaxScratchPerWorkitem, DS_Error);
788 MF.getFunction().getContext().diagnose(DiagStackSize);
789 }
790
792
793 // The calculations related to SGPR/VGPR blocks are
794 // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
795 // unified.
797 ProgInfo.VCCUsed, ProgInfo.FlatUsed,
798 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);
799
800 // Check the addressable register limit before we add ExtraSGPRs.
802 !STM.hasSGPRInitBug()) {
803 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
804 uint64_t NumSgpr;
805 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
806 NumSgpr > MaxAddressableNumSGPRs) {
807 // This can happen due to a compiler bug or when using inline asm.
810 MF.getFunction(), "addressable scalar registers", NumSgpr,
811 MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
812 Ctx.diagnose(Diag);
813 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1);
814 }
815 }
816
817 // Account for extra SGPRs and VGPRs reserved for debugger use.
818 ProgInfo.NumSGPR = MCBinaryExpr::createAdd(ProgInfo.NumSGPR, ExtraSGPRs, Ctx);
819
820 const Function &F = MF.getFunction();
821
822 // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
823 // dispatch registers are function args.
824 unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
825
826 if (isShader(F.getCallingConv())) {
827 bool IsPixelShader =
828 F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
829
830 // Calculate the number of VGPR registers based on the SPI input registers
831 uint32_t InputEna = 0;
832 uint32_t InputAddr = 0;
833 unsigned LastEna = 0;
834
835 if (IsPixelShader) {
836 // Note for IsPixelShader:
837 // By this stage, all enabled inputs are tagged in InputAddr as well.
838 // We will use InputAddr to determine whether the input counts against the
839 // vgpr total and only use the InputEnable to determine the last input
840 // that is relevant - if extra arguments are used, then we have to honour
841 // the InputAddr for any intermediate non-enabled inputs.
842 InputEna = MFI->getPSInputEnable();
843 InputAddr = MFI->getPSInputAddr();
844
845 // We only need to consider input args up to the last used arg.
846 assert((InputEna || InputAddr) &&
847 "PSInputAddr and PSInputEnable should "
848 "never both be 0 for AMDGPU_PS shaders");
849 // There are some rare circumstances where InputAddr is non-zero and
850 // InputEna can be set to 0. In this case we default to setting LastEna
851 // to 1.
852 LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1;
853 }
854
855 // FIXME: We should be using the number of registers determined during
856 // calling convention lowering to legalize the types.
857 const DataLayout &DL = F.getParent()->getDataLayout();
858 unsigned PSArgCount = 0;
859 unsigned IntermediateVGPR = 0;
860 for (auto &Arg : F.args()) {
861 unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
862 if (Arg.hasAttribute(Attribute::InReg)) {
863 WaveDispatchNumSGPR += NumRegs;
864 } else {
865 // If this is a PS shader and we're processing the PS Input args (first
866 // 16 VGPR), use the InputEna and InputAddr bits to define how many
867 // VGPRs are actually used.
868 // Any extra VGPR arguments are handled as normal arguments (and
869 // contribute to the VGPR count whether they're used or not).
870 if (IsPixelShader && PSArgCount < 16) {
871 if ((1 << PSArgCount) & InputAddr) {
872 if (PSArgCount < LastEna)
873 WaveDispatchNumVGPR += NumRegs;
874 else
875 IntermediateVGPR += NumRegs;
876 }
877 PSArgCount++;
878 } else {
879 // If there are extra arguments we have to include the allocation for
880 // the non-used (but enabled with InputAddr) input arguments
881 if (IntermediateVGPR) {
882 WaveDispatchNumVGPR += IntermediateVGPR;
883 IntermediateVGPR = 0;
884 }
885 WaveDispatchNumVGPR += NumRegs;
886 }
887 }
888 }
890 {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx);
891
893 {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
894
896 ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
897 }
898
899 // Adjust number of registers used to meet default/requested minimum/maximum
900 // number of waves per execution unit request.
901 unsigned MaxWaves = MFI->getMaxWavesPerEU();
903 {ProgInfo.NumSGPR, CreateExpr(1ul),
904 CreateExpr(STM.getMinNumSGPRs(MaxWaves))},
905 Ctx);
907 {ProgInfo.NumVGPR, CreateExpr(1ul),
908 CreateExpr(STM.getMinNumVGPRs(MaxWaves))},
909 Ctx);
910
912 STM.hasSGPRInitBug()) {
913 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
914 uint64_t NumSgpr;
915 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
916 NumSgpr > MaxAddressableNumSGPRs) {
917 // This can happen due to a compiler bug or when using inline asm to use
918 // the registers which are usually reserved for vcc etc.
920 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
921 NumSgpr, MaxAddressableNumSGPRs,
923 Ctx.diagnose(Diag);
924 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs);
925 ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs);
926 }
927 }
928
929 if (STM.hasSGPRInitBug()) {
930 ProgInfo.NumSGPR =
932 ProgInfo.NumSGPRsForWavesPerEU =
934 }
935
936 if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
938 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
939 MFI->getNumUserSGPRs(),
941 Ctx.diagnose(Diag);
942 }
943
944 if (MFI->getLDSSize() >
945 static_cast<unsigned>(STM.getAddressableLocalMemorySize())) {
948 MF.getFunction(), "local memory", MFI->getLDSSize(),
950 Ctx.diagnose(Diag);
951 }
952 // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
953 // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
954 auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,
955 unsigned Granule) {
956 const MCExpr *OneConst = CreateExpr(1ul);
957 const MCExpr *GranuleConst = CreateExpr(Granule);
958 const MCExpr *MaxNumGPR =
959 AMDGPUVariadicMCExpr::createMax({NumGPR, OneConst}, Ctx);
960 const MCExpr *AlignToGPR =
961 AMDGPUVariadicMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
962 const MCExpr *DivGPR =
963 MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);
964 const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
965 return SubGPR;
966 };
967
968 ProgInfo.SGPRBlocks = GetNumGPRBlocks(ProgInfo.NumSGPRsForWavesPerEU,
970 ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,
972
973 const SIModeRegisterDefaults Mode = MFI->getMode();
974
975 // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
976 // register.
977 ProgInfo.FloatMode = getFPMode(Mode);
978
979 ProgInfo.IEEEMode = Mode.IEEE;
980
981 // Make the clamp modifier return 0 on NaN input.
982 ProgInfo.DX10Clamp = Mode.DX10Clamp;
983
984 unsigned LDSAlignShift;
986 // LDS is allocated in 64 dword blocks.
987 LDSAlignShift = 8;
988 } else {
989 // LDS is allocated in 128 dword blocks.
990 LDSAlignShift = 9;
991 }
992
993 ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
994 ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
995
996 ProgInfo.LDSSize = MFI->getLDSSize();
997 ProgInfo.LDSBlocks =
998 alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
999
1000 // The MCExpr equivalent of divideCeil.
1001 auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {
1002 const MCExpr *Ceil =
1003 AMDGPUVariadicMCExpr::createAlignTo(Numerator, Denominator, Ctx);
1004 return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx);
1005 };
1006
1007 // Scratch is allocated in 64-dword or 256-dword blocks.
1008 unsigned ScratchAlignShift =
1009 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
1010 // We need to program the hardware with the amount of scratch memory that
1011 // is used by the entire wave. ProgInfo.ScratchSize is the amount of
1012 // scratch memory used per thread.
1013 ProgInfo.ScratchBlocks = DivideCeil(
1015 CreateExpr(STM.getWavefrontSize()), Ctx),
1016 CreateExpr(1ULL << ScratchAlignShift));
1017
1018 if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
1019 ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
1020 ProgInfo.MemOrdered = 1;
1021 }
1022
1023 // 0 = X, 1 = XY, 2 = XYZ
1024 unsigned TIDIGCompCnt = 0;
1025 if (MFI->hasWorkItemIDZ())
1026 TIDIGCompCnt = 2;
1027 else if (MFI->hasWorkItemIDY())
1028 TIDIGCompCnt = 1;
1029
1030 // The private segment wave byte offset is the last of the system SGPRs. We
1031 // initially assumed it was allocated, and may have used it. It shouldn't harm
1032 // anything to disable it if we know the stack isn't used here. We may still
1033 // have emitted code reading it to initialize scratch, but if that's unused
1034 // reading garbage should be OK.
1037 MCConstantExpr::create(0, Ctx), Ctx),
1038 ProgInfo.DynamicCallStack, Ctx);
1039
1040 ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
1041 // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
1042 ProgInfo.TrapHandlerEnable =
1043 STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled();
1044 ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
1045 ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
1046 ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
1047 ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
1048 ProgInfo.TIdIGCompCount = TIDIGCompCnt;
1049 ProgInfo.EXCPEnMSB = 0;
1050 // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
1051 ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
1052 ProgInfo.EXCPEnable = 0;
1053
1054 if (STM.hasGFX90AInsts()) {
1055 // return ((Dst & ~Mask) | (Value << Shift))
1056 auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
1057 uint32_t Shift) {
1058 auto Shft = MCConstantExpr::create(Shift, Ctx);
1059 auto Msk = MCConstantExpr::create(Mask, Ctx);
1060 Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
1062 Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), Ctx);
1063 return Dst;
1064 };
1065
1066 ProgInfo.ComputePGMRSrc3GFX90A =
1067 SetBits(ProgInfo.ComputePGMRSrc3GFX90A, ProgInfo.AccumOffset,
1068 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
1069 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
1070 ProgInfo.ComputePGMRSrc3GFX90A =
1071 SetBits(ProgInfo.ComputePGMRSrc3GFX90A, CreateExpr(ProgInfo.TgSplit),
1072 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
1073 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
1074 }
1075
1077 STM.computeOccupancy(F, ProgInfo.LDSSize), ProgInfo.NumSGPRsForWavesPerEU,
1078 ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);
1079
1080 const auto [MinWEU, MaxWEU] =
1081 AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
1082 uint64_t Occupancy;
1083 if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {
1085 F, F.getSubprogram(),
1086 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
1087 "'" +
1088 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
1089 ", final occupancy is " + Twine(Occupancy));
1090 F.getContext().diagnose(Diag);
1091 }
1092}
1093
// Picks the register identifier that EmitProgramInfoSI stores in RsrcReg,
// selected by switching on the function's calling convention.
// NOTE(review): presumably each case returns one of the R_*_PGM_RSRC1 values
// from SIDefines.h -- confirm against the case labels.
1094static unsigned getRsrcReg(CallingConv::ID CallConv) {
1095 switch (CallConv) {
// Calling conventions without a dedicated case fall through to the label
// that follows instead of being rejected.
1096 default: [[fallthrough]];
1104 }
1105}
1106
// Writes the program-info values for MF to OutStreamer as 4-byte entries:
// resource words built from CurrentProgramInfo, followed by the spilled
// SGPR and VGPR counts.
1107void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
1108 const SIProgramInfo &CurrentProgramInfo) {
1110 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1111 unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
1112 MCContext &Ctx = MF.getContext();
1113
1114 // (((Value) & Mask) << Shift)
1115 auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {
1116 const MCExpr *msk = MCConstantExpr::create(Mask, Ctx);
1117 const MCExpr *shft = MCConstantExpr::create(Shift, Ctx);
1119 shft, Ctx);
1120 };
1121
// Emits Value as a plain integer of Size bytes when it already evaluates to
// an absolute constant; otherwise the unresolved expression itself is handed
// to the streamer.
1122 auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) {
1123 int64_t Val;
1124 if (Value->evaluateAsAbsolute(Val))
1125 OutStreamer->emitIntValue(static_cast<uint64_t>(Val), Size);
1126 else
1127 OutStreamer->emitValue(Value, Size);
1128 };
1129
1132
1133 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx),
1134 /*Size=*/4);
1135
1137 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4);
1138
1140
1141 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1142 // appropriate generation.
// The ScratchBlocks field always starts at bit 12; only its width differs:
// 18 bits (0x3FFFF) for GFX12 and newer, 15 bits (0x7FFF) for GFX11 and
// 13 bits (0x1FFF) otherwise.
1143 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1144 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1145 /*Mask=*/0x3FFFF, /*Shift=*/12),
1146 /*Size=*/4);
1147 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1148 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1149 /*Mask=*/0x7FFF, /*Shift=*/12),
1150 /*Size=*/4);
1151 } else {
1152 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1153 /*Mask=*/0x1FFF, /*Shift=*/12),
1154 /*Size=*/4);
1155 }
1156
1157 // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
1158 // 0" comment but I don't see a corresponding field in the register spec.
1159 } else {
1160 OutStreamer->emitInt32(RsrcReg);
1161
// VGPRBlocks occupies the low 6 bits of the word and SGPRBlocks the 4 bits
// starting at bit 6.
1162 const MCExpr *GPRBlocks = MCBinaryExpr::createOr(
1163 SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0),
1164 SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6),
1165 MF.getContext());
1166 EmitResolvedOrExpr(GPRBlocks, /*Size=*/4);
1168
1169 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1170 // appropriate generation.
1171 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1172 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1173 /*Mask=*/0x3FFFF, /*Shift=*/12),
1174 /*Size=*/4);
1175 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1176 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1177 /*Mask=*/0x7FFF, /*Shift=*/12),
1178 /*Size=*/4);
1179 } else {
1180 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1181 /*Mask=*/0x1FFF, /*Shift=*/12),
1182 /*Size=*/4);
1183 }
1184 }
1185
// From GFX11 on, the extra LDS size is half of LDSBlocks, rounded up;
// older generations use LDSBlocks unchanged.
1188 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1189 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1190 : CurrentProgramInfo.LDSBlocks;
1191 OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1193 OutStreamer->emitInt32(MFI->getPSInputEnable());
1195 OutStreamer->emitInt32(MFI->getPSInputAddr());
1196 }
1197
// Each spill count is preceded by its R_SPILLED_* marker value.
1198 OutStreamer->emitInt32(R_SPILLED_SGPRS);
1199 OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
1200 OutStreamer->emitInt32(R_SPILLED_VGPRS);
1201 OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
1202}
1203
1204// Helper function to add common PAL Metadata 3.0+
// Records the hardware-stage entries for calling convention CC in MD:
// .ieee_mode (only when the subtarget reports hasIEEEMode()), .wgp_mode,
// .mem_ordered, .trap_present and .excp_en (compute calling conventions
// only), and .lds_size.
1206 const SIProgramInfo &CurrentProgramInfo,
1207 CallingConv::ID CC, const GCNSubtarget &ST) {
1208 if (ST.hasIEEEMode())
1209 MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
1210
1211 MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
1212 MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
1213
1214 if (AMDGPU::isCompute(CC)) {
1215 MD->setHwStage(CC, ".trap_present",
1216 (bool)CurrentProgramInfo.TrapHandlerEnable);
1217 MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
1218 }
1219
// LdsSize is multiplied by the value of getLdsDwGranularity(ST) and by
// sizeof(uint32_t) before it is stored.
// NOTE(review): presumably this converts allocation blocks to bytes -- confirm.
1220 MD->setHwStage(CC, ".lds_size",
1221 (unsigned)(CurrentProgramInfo.LdsSize *
1222 getLdsDwGranularity(ST) * sizeof(uint32_t)));
1223}
1224
1225// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
1226// is AMDPAL. It stores each compute/SPI register setting and other PAL
1227// metadata items into the PALMD::Metadata, combining with any provided by the
1228// frontend as LLVM metadata. Once all functions are written, the PAL metadata
1229// is then written as a single block in the .note section.
1230void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
1231 const SIProgramInfo &CurrentProgramInfo) {
1233 auto CC = MF.getFunction().getCallingConv();
1234 auto MD = getTargetStreamer()->getPALMetadata();
1235 auto &Ctx = MF.getContext();
1236
1237 MD->setEntryPoint(CC, MF.getFunction().getName());
1238 MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx);
1239
1240 // Only set AGPRs for supported devices
1241 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1242 if (STM.hasMAIInsts()) {
1243 MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
1244 }
1245
1246 MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx);
// PAL metadata older than major version 3 is filled through the Rsrc1/Rsrc2
// setters; version 3 and newer uses named hardware-stage entries instead.
1247 if (MD->getPALMajorVersion() < 3) {
1248 MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM, Ctx), Ctx);
1249 if (AMDGPU::isCompute(CC)) {
1250 MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
1251 } else {
// For non-compute calling conventions the Rsrc2 contribution is a single
// flag: whether ScratchBlocks is greater than zero, placed using the shift
// and mask derived from C_00B84C_SCRATCH_EN.
1252 const MCExpr *HasScratchBlocks =
1253 MCBinaryExpr::createGT(CurrentProgramInfo.ScratchBlocks,
1254 MCConstantExpr::create(0, Ctx), Ctx);
1255 auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN);
1256 MD->setRsrc2(CC, maskShiftSet(HasScratchBlocks, Mask, Shift, Ctx), Ctx);
1257 }
1258 } else {
1259 MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
1260 MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,
1261 CurrentProgramInfo.ScratchEnable);
1262 EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM);
1263 }
1264
1265 // ScratchSize is in bytes, 16 aligned.
1266 MD->setScratchSize(
1267 CC,
1269 MCConstantExpr::create(16, Ctx), Ctx),
1270 Ctx);
1271
// From GFX11 on, the extra LDS size is half of LDSBlocks, rounded up;
// older generations use LDSBlocks unchanged.
1273 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1274 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1275 : CurrentProgramInfo.LDSBlocks;
1276 if (MD->getPALMajorVersion() < 3) {
1277 MD->setRsrc2(
1278 CC,
1280 Ctx);
1281 MD->setSpiPsInputEna(MFI->getPSInputEnable());
1282 MD->setSpiPsInputAddr(MFI->getPSInputAddr());
1283 } else {
1284 // Graphics registers
// The stored value is ExtraLDSSize scaled by 256 on GFX11 and newer (128
// otherwise) and by sizeof(uint32_t).
// NOTE(review): going by the variable name the 256/128 factor is a
// granularity in dwords -- confirm.
1285 const unsigned ExtraLdsDwGranularity =
1286 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
1287 MD->setGraphicsRegisters(
1288 ".ps_extra_lds_size",
1289 (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));
1290
1291 // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
1292 static StringLiteral const PsInputFields[] = {
1293 ".persp_sample_ena", ".persp_center_ena",
1294 ".persp_centroid_ena", ".persp_pull_model_ena",
1295 ".linear_sample_ena", ".linear_center_ena",
1296 ".linear_centroid_ena", ".line_stipple_tex_ena",
1297 ".pos_x_float_ena", ".pos_y_float_ena",
1298 ".pos_z_float_ena", ".pos_w_float_ena",
1299 ".front_face_ena", ".ancillary_ena",
1300 ".sample_coverage_ena", ".pos_fixed_pt_ena"};
1301 unsigned PSInputEna = MFI->getPSInputEnable();
1302 unsigned PSInputAddr = MFI->getPSInputAddr();
// Bit Idx of each input mask becomes the boolean entry named
// PsInputFields[Idx] under the corresponding register key.
1303 for (auto [Idx, Field] : enumerate(PsInputFields)) {
1304 MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
1305 (bool)((PSInputEna >> Idx) & 1));
1306 MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
1307 (bool)((PSInputAddr >> Idx) & 1));
1308 }
1309 }
1310 }
1311
1312 // For version 3 and above the wave front size is already set in the metadata
1313 if (MD->getPALMajorVersion() < 3 && STM.isWave32())
1314 MD->setWave32(MF.getFunction().getCallingConv());
1315}
1316
// Records per-function PAL metadata for MF, keyed by the function's name:
// its scratch size (taken from the frame's stack size), the compute register
// or hardware-stage settings, and its LDS size and VGPR/SGPR counts.
1317void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1318 auto *MD = getTargetStreamer()->getPALMetadata();
1319 const MachineFrameInfo &MFI = MF.getFrameInfo();
1320 StringRef FnName = MF.getFunction().getName();
1321 MD->setFunctionScratchSize(FnName, MFI.getStackSize());
1323 MCContext &Ctx = MF.getContext();
1324
// Both paths describe the function with the AMDGPU_CS calling convention
// rather than querying MF's own calling convention.
1325 if (MD->getPALMajorVersion() < 3) {
1326 // Set compute registers
1327 MD->setRsrc1(
1329 CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST, Ctx), Ctx);
1330 MD->setRsrc2(CallingConv::AMDGPU_CS,
1331 CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
1332 } else {
1333 EmitPALMetadataCommon(MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST);
1334 }
1335
1336 // Set optional info
1337 MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
1338 MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1339 MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
1340}
1341
1342// This is supposed to be log2(Size)
// Translates a private element size of 4, 8 or 16 into the matching
// AMD_ELEMENT_*_BYTES enumerator; any other size is treated as unreachable.
1344 switch (Size) {
1345 case 4:
1346 return AMD_ELEMENT_4_BYTES;
1347 case 8:
1348 return AMD_ELEMENT_8_BYTES;
1349 case 16:
1350 return AMD_ELEMENT_16_BYTES;
1351 default:
1352 llvm_unreachable("invalid private_element_size");
1353 }
1354}
1355
// Fills Out for the kernel MF: resets it with initDefault(), then populates
// it from CurrentProgramInfo, the subtarget, the user-SGPR usage info and the
// function's kernel-argument layout. The function must use the AMDGPU_KERNEL
// or SPIR_KERNEL calling convention (asserted).
1356void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
1357 const SIProgramInfo &CurrentProgramInfo,
1358 const MachineFunction &MF) const {
1359 const Function &F = MF.getFunction();
1360 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1361 F.getCallingConv() == CallingConv::SPIR_KERNEL);
1362
1364 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1365 MCContext &Ctx = MF.getContext();
1366
1367 Out.initDefault(&STM, Ctx, /*InitMCExpr=*/false);
1368
1370 CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx);
1372 CurrentProgramInfo.getComputePGMRSrc2(Ctx);
1374
1375 Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack;
1376
1378 getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
1379
1380 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
1381 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
1383 }
1384
1385 if (UserSGPRInfo.hasDispatchPtr())
1387
// The queue pointer is only considered for code object versions older than
// AMDHSA_COV5.
1388 if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5)
1390
1391 if (UserSGPRInfo.hasKernargSegmentPtr())
1393
1394 if (UserSGPRInfo.hasDispatchID())
1396
1397 if (UserSGPRInfo.hasFlatScratchInit())
1399
// NOTE(review): hasDispatchPtr() was already tested above; confirm whether
// repeating the check here is intentional or a different query was meant.
1400 if (UserSGPRInfo.hasDispatchPtr())
1402
1403 if (STM.isXNACKEnabled())
1405
// getKernArgSegmentSize() returns the segment size and also reports the
// maximum argument alignment through MaxKernArgAlign, which is used below.
1406 Align MaxKernArgAlign;
1407 Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
1408 Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1409 Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1410 Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1411 Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1412
1413 // kernarg_segment_alignment is specified as log of the alignment.
1414 // The minimum alignment is 16.
1415 // FIXME: The metadata treats the minimum as 4?
1416 Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
1417}
1418
1420 const char *ExtraCode, raw_ostream &O) {
// Returns false once the operand has been handled and true when the modifier
// or the operand kind is not supported here.
1421 // First try the generic code, which knows about modifiers like 'c' and 'n'.
1422 if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
1423 return false;
1424
// Beyond what the generic printer accepts, only the single-character
// modifier 'r' is recognized; longer or different modifiers are rejected.
1425 if (ExtraCode && ExtraCode[0]) {
1426 if (ExtraCode[1] != 0)
1427 return true; // Unknown modifier.
1428
1429 switch (ExtraCode[0]) {
1430 case 'r':
1431 break;
1432 default:
1433 return true;
1434 }
1435 }
1436
1437 // TODO: Should be able to support other operand types like globals.
1438 const MachineOperand &MO = MI->getOperand(OpNo);
1439 if (MO.isReg()) {
1442 return false;
1443 } else if (MO.isImm()) {
1444 int64_t Val = MO.getImm();
1446 O << Val;
// Remaining immediates are printed in hexadecimal, using the narrowest of
// 16, 32 or 64 bits that holds the value as an unsigned integer.
1447 } else if (isUInt<16>(Val)) {
1448 O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
1449 } else if (isUInt<32>(Val)) {
1450 O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
1451 } else {
1452 O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
1453 }
1454 return false;
1455 }
1456 return true;
1457}
1458
1463}
1464
// Emits one analysis remark per resource figure of MF under the name
// "kernel-resource-usage": function name, SGPRs, VGPRs, AGPRs (only when
// hasMAIInsts is set), scratch size, dynamic stack, occupancy, SGPR/VGPR
// spills and, when isModuleEntryFunction is set, the LDS size. Does nothing
// when no remark emitter (ORE) is available.
1465void AMDGPUAsmPrinter::emitResourceUsageRemarks(
1466 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
1467 bool isModuleEntryFunction, bool hasMAIInsts) {
1468 if (!ORE)
1469 return;
1470
1471 const char *Name = "kernel-resource-usage";
1472 const char *Indent = " ";
1473
1474 // If the remark is not specifically enabled, do not output to yaml
1477 return;
1478
// Emits a single remark whose text is "<RemarkLabel>: " followed by Argument,
// which is also attached as a named value under RemarkName.
1479 auto EmitResourceUsageRemark = [&](StringRef RemarkName,
1480 StringRef RemarkLabel, auto Argument) {
1481 // Add an indent for every line besides the line with the kernel name. This
1482 // makes it easier to tell which resource usage go with which kernel since
1483 // the kernel name will always be displayed first.
1484 std::string LabelStr = RemarkLabel.str() + ": ";
1485 if (RemarkName != "FunctionName")
1486 LabelStr = Indent + LabelStr;
1487
1488 ORE->emit([&]() {
1489 return MachineOptimizationRemarkAnalysis(Name, RemarkName,
1491 &MF.front())
1492 << LabelStr << ore::NV(RemarkName, Argument);
1493 });
1494 };
1495
1496 // FIXME: Formatting here is pretty nasty because clang does not accept
1497 // newlines from diagnostics. This forces us to emit multiple diagnostic
1498 // remarks to simulate newlines. If and when clang does accept newlines, this
1499 // formatting should be aggregated into one remark with newlines to avoid
1500 // printing multiple diagnostic location and diag opts.
1501 EmitResourceUsageRemark("FunctionName", "Function Name",
1502 MF.getFunction().getName());
1503 EmitResourceUsageRemark("NumSGPR", "SGPRs",
1504 getMCExprStr(CurrentProgramInfo.NumSGPR));
// The "VGPRs" remark reports NumArchVGPR; AGPRs are reported separately.
1505 EmitResourceUsageRemark("NumVGPR", "VGPRs",
1506 getMCExprStr(CurrentProgramInfo.NumArchVGPR));
1507 if (hasMAIInsts) {
1508 EmitResourceUsageRemark("NumAGPR", "AGPRs",
1509 getMCExprStr(CurrentProgramInfo.NumAccVGPR));
1510 }
1511 EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
1512 getMCExprStr(CurrentProgramInfo.ScratchSize));
// A DynamicCallStack expression that cannot be evaluated to a constant is
// reported as "False", the same as one that evaluates to zero.
1513 int64_t DynStack;
1514 bool DynStackEvaluatable =
1515 CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(DynStack);
1516 StringRef DynamicStackStr =
1517 DynStackEvaluatable && DynStack ? "True" : "False";
1518 EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
1519 EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
1520 getMCExprStr(CurrentProgramInfo.Occupancy));
1521 EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
1522 CurrentProgramInfo.SGPRSpill);
1523 EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
1524 CurrentProgramInfo.VGPRSpill);
1525 if (isModuleEntryFunction)
1526 EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
1527 CurrentProgramInfo.LDSSize);
1528}
#define Success
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter()
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD, const SIProgramInfo &CurrentProgramInfo, CallingConv::ID CC, const GCNSubtarget &ST)
static unsigned getRsrcReg(CallingConv::ID CallConv)
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size)
static uint32_t getFPMode(SIModeRegisterDefaults Mode)
static AsmPrinter * createAMDGPUAsmPrinterPass(TargetMachine &tm, std::unique_ptr< MCStreamer > &&Streamer)
AMDGPU Assembly printer class.
AMDGPU HSA Metadata Streamer.
AMDHSA kernel descriptor MCExpr struct for use in MC layer.
Analyzes how many registers and other resources are used by functions.
AMDHSA kernel descriptor definitions.
MC layer struct for AMDGPUMCKernelCodeT, provides MCExpr functionality where required.
amd_element_byte_size_t
The values used to define the number of bytes to use for the swizzle element size.
@ AMD_ELEMENT_8_BYTES
@ AMD_ELEMENT_16_BYTES
@ AMD_ELEMENT_4_BYTES
#define AMD_HSA_BITS_SET(dst, mask, val)
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID
@ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR
@ AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED
@ AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT
@ AMD_CODE_PROPERTY_IS_PTR64
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_EXTERNAL_VISIBILITY
Definition: Compiler.h:135
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
std::string Name
uint64_t Size
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition: MD5.cpp:55
===- MachineOptimizationRemarkEmitter.h - Opt Diagnostics -*- C++ -*-—===//
const char LLVMTargetMachineRef TM
R600 Assembly printer class.
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS
Definition: SIDefines.h:1046
#define R_0286E8_SPI_TMPRING_SIZE
Definition: SIDefines.h:1184
#define FP_ROUND_MODE_DP(x)
Definition: SIDefines.h:1166
#define C_00B84C_SCRATCH_EN
Definition: SIDefines.h:1082
#define FP_ROUND_ROUND_TO_NEAREST
Definition: SIDefines.h:1158
#define R_0286D0_SPI_PS_INPUT_ADDR
Definition: SIDefines.h:1117
#define R_00B860_COMPUTE_TMPRING_SIZE
Definition: SIDefines.h:1179
#define R_00B428_SPI_SHADER_PGM_RSRC1_HS
Definition: SIDefines.h:1069
#define R_00B328_SPI_SHADER_PGM_RSRC1_ES
Definition: SIDefines.h:1068
#define R_00B528_SPI_SHADER_PGM_RSRC1_LS
Definition: SIDefines.h:1077
#define R_0286CC_SPI_PS_INPUT_ENA
Definition: SIDefines.h:1116
#define R_00B128_SPI_SHADER_PGM_RSRC1_VS
Definition: SIDefines.h:1055
#define FP_DENORM_MODE_DP(x)
Definition: SIDefines.h:1177
#define R_00B848_COMPUTE_PGM_RSRC1
Definition: SIDefines.h:1119
#define R_SPILLED_SGPRS
Definition: SIDefines.h:1198
#define FP_ROUND_MODE_SP(x)
Definition: SIDefines.h:1165
#define FP_DENORM_MODE_SP(x)
Definition: SIDefines.h:1176
#define R_00B228_SPI_SHADER_PGM_RSRC1_GS
Definition: SIDefines.h:1060
#define R_SPILLED_VGPRS
Definition: SIDefines.h:1199
#define S_00B02C_EXTRA_LDS_SIZE(x)
Definition: SIDefines.h:1054
#define R_00B84C_COMPUTE_PGM_RSRC2
Definition: SIDefines.h:1079
#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS
Definition: SIDefines.h:1053
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
void emitFunctionEntryLabel() override
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
const MCSubtargetInfo * getGlobalSTI() const
void emitImplicitDef(const MachineInstr *MI) const override
Targets can override this to customize the output of IMPLICIT_DEF instructions in verbose mode.
std::vector< std::string > DisasmLines
void emitStartOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the start of their fi...
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
std::vector< std::string > HexLines
void emitGlobalVariable(const GlobalVariable *GV) override
Emit the specified global variable to the .s file.
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) override
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
bool runOnMachineFunction(MachineFunction &MF) override
Emit the specified function out to the OutStreamer.
void emitFunctionBodyEnd() override
Targets can override this to emit stuff after the last basic block in the function.
bool doFinalization(Module &M) override
Shut down the asmprinter.
void emitEndOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the end of their file...
AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr< MCStreamer > Streamer)
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
void emitFunctionBodyStart() override
Targets can override this to emit stuff before the first basic block in the function.
void emitBasicBlockStart(const MachineBasicBlock &MBB) override
Targets can override this to emit stuff at the start of a basic block.
AMDGPUTargetStreamer * getTargetStreamer() const
static void printRegOperand(unsigned RegNo, raw_ostream &O, const MCRegisterInfo &MRI)
void setHwStage(unsigned CC, StringRef field, unsigned Val)
unsigned getAddressableLocalMemorySize() const
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
unsigned getWavefrontSize() const
AMDGPUPALMetadata * getPALMetadata()
virtual void EmitDirectiveAMDHSACodeObjectVersion(unsigned COV)
void initializeTargetID(const MCSubtargetInfo &STI)
virtual bool EmitCodeEnd(const MCSubtargetInfo &STI)
virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type)
virtual void EmitDirectiveAMDGCNTarget()
virtual void EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, const AMDGPU::MCKernelDescriptor &KernelDescriptor, uint64_t NextVGPR, uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr)
virtual void EmitAMDKernelCodeT(AMDGPU::AMDGPUMCKernelCodeT &Header)
const std::optional< AMDGPU::IsaInfo::AMDGPUTargetID > & getTargetID() const
virtual bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI, bool TrapEnabled)
static const AMDGPUVariadicMCExpr * createMax(ArrayRef< const MCExpr * > Args, MCContext &Ctx)
Definition: AMDGPUMCExpr.h:70
static const AMDGPUVariadicMCExpr * createAlignTo(const MCExpr *Value, const MCExpr *Align, MCContext &Ctx)
Definition: AMDGPUMCExpr.h:85
static const AMDGPUVariadicMCExpr * createTotalNumVGPR(const MCExpr *NumAGPR, const MCExpr *NumVGPR, MCContext &Ctx)
static const AMDGPUVariadicMCExpr * createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs, const MCExpr *NumVGPRs, const GCNSubtarget &STM, MCContext &Ctx)
Mimics GCNSubtarget::computeOccupancy for MCExpr.
static const AMDGPUVariadicMCExpr * createExtraSGPRs(const MCExpr *VCCUsed, const MCExpr *FlatScrUsed, bool XNACKUsed, MCContext &Ctx)
Allow delayed MCExpr resolve of ExtraSGPRs (in case VCCUsed or FlatScrUsed are unresolvable but neede...
void setXnackSetting(TargetIDSetting NewXnackSetting)
Sets xnack setting to NewXnackSetting.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
This class is intended to be used as a driving class for all asm writers.
Definition: AsmPrinter.h:85
const TargetLoweringObjectFile & getObjFileLowering() const
Return information about object file lowering.
Definition: AsmPrinter.cpp:399
MCSymbol * getSymbol(const GlobalValue *GV) const
Definition: AsmPrinter.cpp:704
virtual void emitGlobalVariable(const GlobalVariable *GV)
Emit the specified global variable to the .s file.
Definition: AsmPrinter.cpp:726
TargetMachine & TM
Target machine description.
Definition: AsmPrinter.h:88
const MCAsmInfo * MAI
Target Asm Printer information.
Definition: AsmPrinter.h:91
MachineFunction * MF
The current machine function.
Definition: AsmPrinter.h:103
virtual void SetupMachineFunction(MachineFunction &MF)
This should be called when a new MachineFunction is being processed from runOnMachineFunction.
void emitFunctionBody()
This method emits the body and trailer for a function.
virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const
Return true if the basic block has exactly one predecessor and the control transfer mechanism between...
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
Definition: AsmPrinter.cpp:450
virtual void emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const
This emits linkage information about GVSym based on GV, if this is supported by the target.
Definition: AsmPrinter.cpp:659
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
Definition: AsmPrinter.cpp:441
unsigned getFunctionNumber() const
Return a unique ID for the current function.
Definition: AsmPrinter.cpp:395
MachineOptimizationRemarkEmitter * ORE
Optimization remark emitter.
Definition: AsmPrinter.h:115
MCContext & OutContext
This is the context for the output file that we are streaming.
Definition: AsmPrinter.h:95
bool doFinalization(Module &M) override
Shut down the asmprinter.
virtual void emitBasicBlockStart(const MachineBasicBlock &MBB)
Targets can override this to emit stuff at the start of a basic block.
void emitVisibility(MCSymbol *Sym, unsigned Visibility, bool IsDefinition=true) const
This emits visibility information about symbol, if this is supported by the target.
std::unique_ptr< MCStreamer > OutStreamer
This is the MCStreamer object for the file we are generating.
Definition: AsmPrinter.h:100
bool isVerbose() const
Return true if assembly output should contain comments.
Definition: AsmPrinter.h:266
void getNameWithPrefix(SmallVectorImpl< char > &Name, const GlobalValue *GV) const
Definition: AsmPrinter.cpp:699
virtual void emitFunctionEntryLabel()
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &OS)
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
Diagnostic information for optimization failures.
Diagnostic information for stack size etc.
DISubprogram * getSubprogram() const
Get the attached subprogram.
Definition: Metadata.cpp:1830
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:264
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:358
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
bool hasGFX90AInsts() const
unsigned computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Return occupancy for the given function.
bool hasMAIInsts() const
Definition: GCNSubtarget.h:809
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:257
bool hasSGPRInitBug() const
bool isTgSplitEnabled() const
Definition: GCNSubtarget.h:600
unsigned getMinNumVGPRs(unsigned WavesPerEU) const
bool isCuModeEnabled() const
Definition: GCNSubtarget.h:604
const AMDGPU::IsaInfo::AMDGPUTargetID & getTargetID() const
Definition: GCNSubtarget.h:293
bool dumpCode() const
Definition: GCNSubtarget.h:504
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:592
bool isWave32() const
unsigned getMaxNumUserSGPRs() const
Definition: GCNSubtarget.h:936
Generation getGeneration() const
Definition: GCNSubtarget.h:308
unsigned getAddressableNumSGPRs() const
unsigned getMaxWaveScratchSize() const
Definition: GCNSubtarget.h:312
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
MaybeAlign getAlign() const
Returns the alignment of the given variable or function.
Definition: GlobalObject.h:80
VisibilityTypes getVisibility() const
Definition: GlobalValue.h:247
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:286
unsigned getAddressSpace() const
Definition: GlobalValue.h:204
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:655
Type * getValueType() const
Definition: GlobalValue.h:295
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
bool hasInitializer() const
Definitions have initializers, declarations don't.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
const DiagnosticHandler * getDiagHandlerPtr() const
getDiagHandlerPtr - Returns const raw pointer of DiagnosticHandler set by setDiagnosticHandler.
MCCodeEmitter * getEmitterPtr() const
Definition: MCAssembler.h:326
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:541
static const MCBinaryExpr * createAdd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:536
static const MCBinaryExpr * createOr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:601
static const MCBinaryExpr * createLOr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:571
static const MCBinaryExpr * createMul(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:591
static const MCBinaryExpr * createGT(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:556
static const MCBinaryExpr * createDiv(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:546
static const MCBinaryExpr * createShl(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:606
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:621
static const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition: MCExpr.cpp:194
Context object for machine code objects.
Definition: MCContext.h:82
const MCObjectFileInfo * getObjectFileInfo() const
Definition: MCContext.h:419
void reportError(SMLoc L, const Twine &Msg)
Definition: MCContext.cpp:1093
Base class for the full range of assembler expressions which are needed for parsing.
Definition: MCExpr.h:35
MCSection * getReadOnlySection() const
MCContext & getContext() const
This represents a section on linux, lots of unix variants and some bare metal systems.
Definition: MCSectionELF.h:26
void ensureMinAlignment(Align MinAlignment)
Makes sure that Alignment is at least MinAlignment.
Definition: MCSection.h:156
MCContext & getContext() const
Definition: MCStreamer.h:297
Generic base class for all target subtargets.
const Triple & getTargetTriple() const
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:40
bool isDefined() const
isDefined - Check if this symbol is defined (i.e., it has an address).
Definition: MCSymbol.h:250
StringRef getName() const
getName - Get the symbol name.
Definition: MCSymbol.h:205
bool isVariable() const
isVariable - Check if this is a variable symbol.
Definition: MCSymbol.h:300
void redefineIfPossible()
Prepare this symbol to be redefined.
Definition: MCSymbol.h:232
MCStreamer & getStreamer()
Definition: MCStreamer.h:101
static const MCUnaryExpr * createNot(const MCExpr *Expr, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition: MCExpr.h:466
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFunction yet, in which case this will return -1.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
void setAlignment(Align A)
setAlignment - Set the alignment of the function.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MCContext & getContext() const
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do so.
const MachineBasicBlock & front() const
Representation of each machine instruction.
Definition: MachineInstr.h:69
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
Diagnostic information for optimization analysis remarks.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:293
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
GCNUserSGPRUsageInfo & getUserSGPRInfo()
SIModeRegisterDefaults getMode() const
unsigned getNumKernargPreloadedSGPRs() const
Represents a location in source code.
Definition: SMLoc.h:23
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:846
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
std::string str() const
str - Get the contents as an std::string.
Definition: StringRef.h:223
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
const Triple & getTargetTriple() const
const MCSubtargetInfo * getMCSubtargetInfo() const
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:382
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
Definition: Twine.h:81
LLVM Value Representation.
Definition: Value.h:74
void print(raw_ostream &O, bool IsForDebug=false) const
Implement operator<< on Value.
Definition: AsmWriter.cpp:5022
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:691
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ LOCAL_ADDRESS
Address space for local memory.
unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI, std::optional< bool > EnableWavefrontSize32)
unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed, bool FlatScrUsed, bool XNACKUsed)
unsigned getSGPREncodingGranule(const MCSubtargetInfo *STI)
unsigned getLdsDwGranularity(const MCSubtargetInfo &ST)
IsaVersion getIsaVersion(StringRef GPU)
bool isCompute(CallingConv::ID cc)
const MCExpr * maskShiftSet(const MCExpr *Val, uint32_t Mask, uint32_t Shift, MCContext &Ctx)
Provided with the MCExpr * Val, uint32 Mask and Shift, will return the masked and left shifted,...
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isGFX90A(const MCSubtargetInfo &STI)
bool hasMAIInsts(const MCSubtargetInfo &STI)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
constexpr std::pair< unsigned, unsigned > getShiftMask(unsigned Value)
Deduce the least significant bit aligned shift and mask values for a binary Complement Value (as they...
unsigned hasKernargPreload(const MCSubtargetInfo &STI)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:144
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ SHT_PROGBITS
Definition: ELF.h:1067
@ STT_AMDGPU_HSA_KERNEL
Definition: ELF.h:1340
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:431
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2400
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
Target & getTheR600Target()
The target for R600 GPUs.
@ DK_ResourceLimit
AsmPrinter * createR600AsmPrinterPass(TargetMachine &TM, std::unique_ptr< MCStreamer > &&Streamer)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
Target & getTheGCNTarget()
The target for GCN GPUs.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1849
@ DS_Error
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
const SIFunctionResourceInfo & getResourceInfo(const Function *F) const
void validate(const MCSubtargetInfo *STI, MCContext &Ctx)
void initDefault(const MCSubtargetInfo *STI, MCContext &Ctx, bool InitMCExpr=true)
static const MCExpr * bits_get(const MCExpr *Src, uint32_t Shift, uint32_t Mask, MCContext &Ctx)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
virtual bool isAnalysisRemarkEnabled(StringRef PassName) const
Return true if analysis remarks are enabled, override to provide different implementation.
Track resource usage for kernels / entry functions.
Definition: SIProgramInfo.h:31
const MCExpr * NumSGPR
Definition: SIProgramInfo.h:70
const MCExpr * ComputePGMRSrc3GFX90A
Definition: SIProgramInfo.h:63
const MCExpr * NumArchVGPR
Definition: SIProgramInfo.h:66
const MCExpr * VGPRBlocks
Definition: SIProgramInfo.h:33
const MCExpr * ScratchBlocks
Definition: SIProgramInfo.h:48
const MCExpr * VCCUsed
Definition: SIProgramInfo.h:90
uint64_t getComputePGMRSrc1(const GCNSubtarget &ST) const
Compute the value of the ComputePGMRsrc1 register.
const MCExpr * FlatUsed
Definition: SIProgramInfo.h:74
uint32_t TrapHandlerEnable
Definition: SIProgramInfo.h:53
const MCExpr * ScratchEnable
Definition: SIProgramInfo.h:51
uint64_t getComputePGMRSrc2() const
Compute the value of the ComputePGMRsrc2 register.
const MCExpr * AccumOffset
Definition: SIProgramInfo.h:68
const MCExpr * NumAccVGPR
Definition: SIProgramInfo.h:67
const MCExpr * DynamicCallStack
Definition: SIProgramInfo.h:87
const MCExpr * SGPRBlocks
Definition: SIProgramInfo.h:34
const MCExpr * NumVGPRsForWavesPerEU
Definition: SIProgramInfo.h:80
const MCExpr * NumVGPR
Definition: SIProgramInfo.h:65
const MCExpr * Occupancy
Definition: SIProgramInfo.h:83
const MCExpr * ScratchSize
Definition: SIProgramInfo.h:44
const MCExpr * NumSGPRsForWavesPerEU
Definition: SIProgramInfo.h:77
void reset(const MachineFunction &MF)
uint64_t getPGMRSrc1(CallingConv::ID CC, const GCNSubtarget &ST) const
static void RegisterAsmPrinter(Target &T, Target::AsmPrinterCtorTy Fn)
RegisterAsmPrinter - Register an AsmPrinter implementation for the given target.