LLVM 18.0.0git
AMDGPUAsmPrinter.cpp
Go to the documentation of this file.
1//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10///
11/// The AMDGPUAsmPrinter is used to print both assembly strings and binary
12/// code. When passed an MCAsmStreamer it prints assembly, and when passed
13/// an MCObjectStreamer it outputs binary code.
14//
15//===----------------------------------------------------------------------===//
16//
17
18#include "AMDGPUAsmPrinter.h"
19#include "AMDGPU.h"
22#include "AMDKernelCodeT.h"
23#include "GCNSubtarget.h"
26#include "R600AsmPrinter.h"
35#include "llvm/MC/MCAssembler.h"
36#include "llvm/MC/MCContext.h"
38#include "llvm/MC/MCStreamer.h"
44
45using namespace llvm;
46using namespace llvm::AMDGPU;
47
48// This should get the default rounding mode from the kernel. We just set the
49// default here, but this could change if the OpenCL rounding mode pragmas are
50// used.
51//
52// The denormal mode here should match what is reported by the OpenCL runtime
53// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
54// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
55//
56// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
57// precision, and leaves single precision to flush all and does not report
58// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
59// CL_FP_DENORM for both.
60//
61// FIXME: It seems some instructions do not support single precision denormals
62// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
63// and sin_f32, cos_f32 on most parts).
64
65// We want to use these instructions, and using fp32 denormals also causes
66// instructions to run at the double precision rate for the device so it's
67// probably best to just report no single precision denormals.
71 FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |
72 FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
73}
74
75static AsmPrinter *
77 std::unique_ptr<MCStreamer> &&Streamer) {
78 return new AMDGPUAsmPrinter(tm, std::move(Streamer));
79}
80
86}
87
89 std::unique_ptr<MCStreamer> Streamer)
90 : AsmPrinter(TM, std::move(Streamer)) {
91 assert(OutStreamer && "AsmPrinter constructed without streamer");
92}
93
95 return "AMDGPU Assembly Printer";
96}
97
99 return TM.getMCSubtargetInfo();
100}
101
103 if (!OutStreamer)
104 return nullptr;
105 return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
106}
107
110}
111
112void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
114
115 // TODO: Which one is called first, emitStartOfAsmFile or
116 // emitFunctionBodyStart?
117 if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
118 initializeTargetID(M);
119
122 return;
123
125
127 HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
128
131}
132
134 // Init target streamer if it has not yet happened
136 initTargetStreamer(M);
137
140
141 // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
142 // Emit HSA Metadata (NT_AMD_HSA_METADATA).
144 HSAMetadataStream->end();
145 bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
146 (void)Success;
147 assert(Success && "Malformed HSA Metadata");
148 }
149}
150
152 const MachineBasicBlock *MBB) const {
154 return false;
155
156 if (MBB->empty())
157 return true;
158
159 // If this is a block implementing a long branch, an expression relative to
160 // the start of the block is needed.
161 // XXX - Is there a smarter way to check this?
162 return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64);
163}
164
167 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
168 const Function &F = MF->getFunction();
169
170 // TODO: Which one is called first, emitStartOfAsmFile or
171 // emitFunctionBodyStart?
173 initializeTargetID(*F.getParent());
174
175 const auto &FunctionTargetID = STM.getTargetID();
176 // Make sure function's xnack settings are compatible with module's
177 // xnack settings.
178 if (FunctionTargetID.isXnackSupported() &&
179 FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
180 FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) {
181 OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) +
182 "' function does not match module xnack setting");
183 return;
184 }
185 // Make sure function's sramecc settings are compatible with module's
186 // sramecc settings.
187 if (FunctionTargetID.isSramEccSupported() &&
188 FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
189 FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {
190 OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) +
191 "' function does not match module sramecc setting");
192 return;
193 }
194
195 if (!MFI.isEntryFunction())
196 return;
197
198 if (STM.isMesaKernel(F) &&
199 (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
200 F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
201 amd_kernel_code_t KernelCode;
202 getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
204 }
205
206 if (STM.isAmdHsaOS())
207 HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
208
209 if (MFI.getNumKernargPreloadedSGPRs() > 0) {
212 }
213}
214
217 if (!MFI.isEntryFunction())
218 return;
219
221 return;
222
223 auto &Streamer = getTargetStreamer()->getStreamer();
224 auto &Context = Streamer.getContext();
225 auto &ObjectFileInfo = *Context.getObjectFileInfo();
226 auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
227
228 Streamer.pushSection();
229 Streamer.switchSection(&ReadOnlySection);
230
231 // CP microcode requires the kernel descriptor to be allocated on 64 byte
232 // alignment.
233 Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
234 ReadOnlySection.ensureMinAlignment(Align(64));
235
236 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
237
238 SmallString<128> KernelName;
239 getNameWithPrefix(KernelName, &MF->getFunction());
241 STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
242 CurrentProgramInfo.NumVGPRsForWavesPerEU,
243 CurrentProgramInfo.NumSGPRsForWavesPerEU -
245 &STM, CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
246 getTargetStreamer()->getTargetID()->isXnackOnOrAny()),
247 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
248 CodeObjectVersion);
249
250 Streamer.popSection();
251}
252
254 Register RegNo = MI->getOperand(0).getReg();
255
258 OS << "implicit-def: "
259 << printReg(RegNo, MF->getSubtarget().getRegisterInfo());
260
261 if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
262 OS << " : SGPR spill to VGPR lane";
263
264 OutStreamer->AddComment(OS.str());
265 OutStreamer->addBlankLine();
266}
267
271 return;
272 }
273
275 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
276 if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
277 SmallString<128> SymbolName;
278 getNameWithPrefix(SymbolName, &MF->getFunction()),
280 SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
281 }
282 if (DumpCodeInstEmitter) {
283 // Disassemble function name label to text.
284 DisasmLines.push_back(MF->getName().str() + ":");
285 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
286 HexLines.push_back("");
287 }
288
290}
291
293 if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
294 // Write a line for the basic block label if it is not only fallthrough.
295 DisasmLines.push_back(
296 (Twine("BB") + Twine(getFunctionNumber())
297 + "_" + Twine(MBB.getNumber()) + ":").str());
298 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
299 HexLines.push_back("");
300 }
302}
303
306 if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
308 Twine(GV->getName()) +
309 ": unsupported initializer for address space");
310 return;
311 }
312
313 // LDS variables aren't emitted in HSA or PAL yet.
315 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
316 return;
317
318 MCSymbol *GVSym = getSymbol(GV);
319
320 GVSym->redefineIfPossible();
321 if (GVSym->isDefined() || GVSym->isVariable())
322 report_fatal_error("symbol '" + Twine(GVSym->getName()) +
323 "' is already defined");
324
325 const DataLayout &DL = GV->getParent()->getDataLayout();
326 uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
327 Align Alignment = GV->getAlign().value_or(Align(4));
328
329 emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
330 emitLinkage(GV, GVSym);
331 auto TS = getTargetStreamer();
332 TS->emitAMDGPULDS(GVSym, Size, Alignment);
333 return;
334 }
335
337}
338
340 CodeObjectVersion = AMDGPU::getCodeObjectVersion(M);
341
343 switch (CodeObjectVersion) {
345 HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV3());
346 break;
348 HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV4());
349 break;
351 HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV5());
352 break;
353 default:
354 report_fatal_error("Unexpected code object version");
355 }
356 }
358}
359
361 // Pad with s_code_end to help tools and guard against instruction prefetch
362 // causing stale data in caches. Arguably this should be done by the linker,
363 // which is why this isn't done for Mesa.
364 const MCSubtargetInfo &STI = *getGlobalSTI();
365 if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
368 OutStreamer->switchSection(getObjFileLowering().getTextSection());
370 }
371
373}
374
375// Print comments that apply to both callable functions and entry points.
//
// Each value is written through OutStreamer->emitRawComment as its own
// comment, in this order: codeLenInByte, NumSgprs, NumVgprs,
// [NumAgprs, TotalNumVgprs], ScratchSize, MemoryBound.
//
// \p NumAGPR is optional: when it holds no value, the NumAgprs and
// TotalNumVgprs comments are omitted and \p TotalNumVGPR is not read.
// \p MFI is dereferenced unconditionally to query isMemoryBound(), so it must
// not be null.
376void AMDGPUAsmPrinter::emitCommonFunctionComments(
377 uint32_t NumVGPR, std::optional<uint32_t> NumAGPR, uint32_t TotalNumVGPR,
378 uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize,
379 const AMDGPUMachineFunction *MFI) {
380 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
381 OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
382 OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
 // The AGPR count and the combined VGPR total are only reported together,
 // and only when the caller supplied an AGPR count.
383 if (NumAGPR) {
384 OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false);
385 OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR),
386 false);
387 }
388 OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
389 OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
390 false);
391}
392
393uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
394 const MachineFunction &MF) const {
396 uint16_t KernelCodeProperties = 0;
397 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
398
399 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
400 KernelCodeProperties |=
401 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
402 }
403 if (UserSGPRInfo.hasDispatchPtr()) {
404 KernelCodeProperties |=
405 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
406 }
407 if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) {
408 KernelCodeProperties |=
409 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
410 }
411 if (UserSGPRInfo.hasKernargSegmentPtr()) {
412 KernelCodeProperties |=
413 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
414 }
415 if (UserSGPRInfo.hasDispatchID()) {
416 KernelCodeProperties |=
417 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
418 }
419 if (UserSGPRInfo.hasFlatScratchInit()) {
420 KernelCodeProperties |=
421 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
422 }
424 KernelCodeProperties |=
425 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
426 }
427
428 if (CurrentProgramInfo.DynamicCallStack &&
429 CodeObjectVersion >= AMDGPU::AMDHSA_COV5)
430 KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK;
431
432 return KernelCodeProperties;
433}
434
435amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
436 const MachineFunction &MF,
437 const SIProgramInfo &PI) const {
439 const Function &F = MF.getFunction();
441
442 amdhsa::kernel_descriptor_t KernelDescriptor;
443 memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor));
444
445 assert(isUInt<32>(PI.ScratchSize));
446 assert(isUInt<32>(PI.getComputePGMRSrc1()));
447 assert(isUInt<32>(PI.getComputePGMRSrc2()));
448
449 KernelDescriptor.group_segment_fixed_size = PI.LDSSize;
450 KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
451
452 Align MaxKernArgAlign;
453 KernelDescriptor.kernarg_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
454
455 KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1();
456 KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2();
457 KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
458
459 assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
460 if (STM.hasGFX90AInsts())
461 KernelDescriptor.compute_pgm_rsrc3 =
462 CurrentProgramInfo.ComputePGMRSrc3GFX90A;
463
465 KernelDescriptor.kernarg_preload =
466 static_cast<uint16_t>(Info->getNumKernargPreloadedSGPRs());
467
468 return KernelDescriptor;
469}
470
472 // Init target streamer lazily on the first function so that previous passes
473 // can set metadata.
475 initTargetStreamer(*MF.getFunction().getParent());
476
477 ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
478 CurrentProgramInfo = SIProgramInfo();
479
481
482 // The starting address of all shader programs must be 256 bytes aligned.
483 // Regular functions just need the basic required instruction alignment.
484 MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
485
487
490 // FIXME: This should be an explicit check for Mesa.
491 if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
492 MCSectionELF *ConfigSection =
493 Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
494 OutStreamer->switchSection(ConfigSection);
495 }
496
497 if (MFI->isModuleEntryFunction()) {
498 getSIProgramInfo(CurrentProgramInfo, MF);
499 }
500
501 if (STM.isAmdPalOS()) {
502 if (MFI->isEntryFunction())
503 EmitPALMetadata(MF, CurrentProgramInfo);
504 else if (MFI->isModuleEntryFunction())
505 emitPALFunctionMetadata(MF);
506 } else if (!STM.isAmdHsaOS()) {
507 EmitProgramInfoSI(MF, CurrentProgramInfo);
508 }
509
510 DumpCodeInstEmitter = nullptr;
511 if (STM.dumpCode()) {
512 // For -dumpcode, get the assembler out of the streamer, even if it does
513 // not really want to let us have it. This only works with -filetype=obj.
514 bool SaveFlag = OutStreamer->getUseAssemblerInfoForParsing();
515 OutStreamer->setUseAssemblerInfoForParsing(true);
516 MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
517 OutStreamer->setUseAssemblerInfoForParsing(SaveFlag);
518 if (Assembler)
519 DumpCodeInstEmitter = Assembler->getEmitterPtr();
520 }
521
522 DisasmLines.clear();
523 HexLines.clear();
525
527
528 emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
529 STM.hasMAIInsts());
530
531 if (isVerbose()) {
532 MCSectionELF *CommentSection =
533 Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
534 OutStreamer->switchSection(CommentSection);
535
536 if (!MFI->isEntryFunction()) {
537 OutStreamer->emitRawComment(" Function info:", false);
539 ResourceUsage->getResourceInfo(&MF.getFunction());
540 emitCommonFunctionComments(
541 Info.NumVGPR,
542 STM.hasMAIInsts() ? Info.NumAGPR : std::optional<uint32_t>(),
543 Info.getTotalNumVGPRs(STM),
544 Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
545 Info.PrivateSegmentSize, getFunctionCodeSize(MF), MFI);
546 return false;
547 }
548
549 OutStreamer->emitRawComment(" Kernel info:", false);
550 emitCommonFunctionComments(
551 CurrentProgramInfo.NumArchVGPR,
552 STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR
553 : std::optional<uint32_t>(),
554 CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
555 CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI);
556
557 OutStreamer->emitRawComment(
558 " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
559 OutStreamer->emitRawComment(
560 " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
561 OutStreamer->emitRawComment(
562 " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
563 " bytes/workgroup (compile time only)", false);
564
565 OutStreamer->emitRawComment(
566 " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
567 OutStreamer->emitRawComment(
568 " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);
569
570 OutStreamer->emitRawComment(
571 " NumSGPRsForWavesPerEU: " +
572 Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
573 OutStreamer->emitRawComment(
574 " NumVGPRsForWavesPerEU: " +
575 Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);
576
577 if (STM.hasGFX90AInsts())
578 OutStreamer->emitRawComment(
579 " AccumOffset: " +
580 Twine((CurrentProgramInfo.AccumOffset + 1) * 4), false);
581
582 OutStreamer->emitRawComment(
583 " Occupancy: " +
584 Twine(CurrentProgramInfo.Occupancy), false);
585
586 OutStreamer->emitRawComment(
587 " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
588
589 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
590 Twine(CurrentProgramInfo.ScratchEnable),
591 false);
592 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
593 Twine(CurrentProgramInfo.UserSGPR),
594 false);
595 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
596 Twine(CurrentProgramInfo.TrapHandlerEnable),
597 false);
598 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
599 Twine(CurrentProgramInfo.TGIdXEnable),
600 false);
601 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
602 Twine(CurrentProgramInfo.TGIdYEnable),
603 false);
604 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
605 Twine(CurrentProgramInfo.TGIdZEnable),
606 false);
607 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
608 Twine(CurrentProgramInfo.TIdIGCompCount),
609 false);
610
611 assert(STM.hasGFX90AInsts() ||
612 CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
613 if (STM.hasGFX90AInsts()) {
614 OutStreamer->emitRawComment(
615 " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
616 Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
617 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))),
618 false);
619 OutStreamer->emitRawComment(
620 " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
621 Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
622 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))),
623 false);
624 }
625 }
626
627 if (DumpCodeInstEmitter) {
628
629 OutStreamer->switchSection(
630 Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));
631
632 for (size_t i = 0; i < DisasmLines.size(); ++i) {
633 std::string Comment = "\n";
634 if (!HexLines[i].empty()) {
635 Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
636 Comment += " ; " + HexLines[i] + "\n";
637 }
638
639 OutStreamer->emitBytes(StringRef(DisasmLines[i]));
640 OutStreamer->emitBytes(StringRef(Comment));
641 }
642 }
643
644 return false;
645}
646
647// TODO: Fold this into emitFunctionBodyStart.
648void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
649 // In the beginning all features are either 'Any' or 'NotSupported',
650 // depending on global target features. This will cover empty modules.
652 *getGlobalSTI(), getGlobalSTI()->getFeatureString(), CodeObjectVersion);
653
654 // If module is empty, we are done.
655 if (M.empty())
656 return;
657
658 // If module is not empty, need to find first 'Off' or 'On' feature
659 // setting per feature from functions in module.
660 for (auto &F : M) {
661 auto &TSTargetID = getTargetStreamer()->getTargetID();
662 if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
663 (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
664 break;
665
667 const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
668 if (TSTargetID->isXnackSupported())
669 if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
670 TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
671 if (TSTargetID->isSramEccSupported())
672 if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
673 TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
674 }
675}
676
677uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
679 const SIInstrInfo *TII = STM.getInstrInfo();
680
681 uint64_t CodeSize = 0;
682
683 for (const MachineBasicBlock &MBB : MF) {
684 for (const MachineInstr &MI : MBB) {
685 // TODO: CodeSize should account for multiple functions.
686
687 // TODO: Should we count size of debug info?
688 if (MI.isDebugInstr())
689 continue;
690
691 CodeSize += TII->getInstSizeInBytes(MI);
692 }
693 }
694
695 return CodeSize;
696}
697
698void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
699 const MachineFunction &MF) {
701 ResourceUsage->getResourceInfo(&MF.getFunction());
703
704 ProgInfo.NumArchVGPR = Info.NumVGPR;
705 ProgInfo.NumAccVGPR = Info.NumAGPR;
706 ProgInfo.NumVGPR = Info.getTotalNumVGPRs(STM);
707 ProgInfo.AccumOffset = alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1;
708 ProgInfo.TgSplit = STM.isTgSplitEnabled();
709 ProgInfo.NumSGPR = Info.NumExplicitSGPR;
710 ProgInfo.ScratchSize = Info.PrivateSegmentSize;
711 ProgInfo.VCCUsed = Info.UsesVCC;
712 ProgInfo.FlatUsed = Info.UsesFlatScratch;
713 ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;
714
715 const uint64_t MaxScratchPerWorkitem =
717 if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) {
718 DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
719 ProgInfo.ScratchSize,
720 MaxScratchPerWorkitem, DS_Error);
721 MF.getFunction().getContext().diagnose(DiagStackSize);
722 }
723
725
726 // The calculations related to SGPR/VGPR blocks are
727 // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
728 // unified.
729 unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
730 &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed,
731 getTargetStreamer()->getTargetID()->isXnackOnOrAny());
732
733 // Check the addressable register limit before we add ExtraSGPRs.
735 !STM.hasSGPRInitBug()) {
736 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
737 if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
738 // This can happen due to a compiler bug or when using inline asm.
741 MF.getFunction(), "addressable scalar registers", ProgInfo.NumSGPR,
742 MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
743 Ctx.diagnose(Diag);
744 ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1;
745 }
746 }
747
748 // Account for the extra SGPRs (VCC, flat scratch, XNACK) computed above.
749 ProgInfo.NumSGPR += ExtraSGPRs;
750
751 const Function &F = MF.getFunction();
752
753 // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
754 // dispatch registers are function args.
755 unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
756
757 if (isShader(F.getCallingConv())) {
758 bool IsPixelShader =
759 F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
760
761 // Calculate the number of VGPR registers based on the SPI input registers
762 uint32_t InputEna = 0;
763 uint32_t InputAddr = 0;
764 unsigned LastEna = 0;
765
766 if (IsPixelShader) {
767 // Note for IsPixelShader:
768 // By this stage, all enabled inputs are tagged in InputAddr as well.
769 // We will use InputAddr to determine whether the input counts against the
770 // vgpr total and only use the InputEnable to determine the last input
771 // that is relevant - if extra arguments are used, then we have to honour
772 // the InputAddr for any intermediate non-enabled inputs.
773 InputEna = MFI->getPSInputEnable();
774 InputAddr = MFI->getPSInputAddr();
775
776 // We only need to consider input args up to the last used arg.
777 assert((InputEna || InputAddr) &&
778 "PSInputAddr and PSInputEnable should "
779 "never both be 0 for AMDGPU_PS shaders");
780 // There are some rare circumstances where InputAddr is non-zero and
781 // InputEna can be set to 0. In this case we default to setting LastEna
782 // to 1.
783 LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1;
784 }
785
786 // FIXME: We should be using the number of registers determined during
787 // calling convention lowering to legalize the types.
788 const DataLayout &DL = F.getParent()->getDataLayout();
789 unsigned PSArgCount = 0;
790 unsigned IntermediateVGPR = 0;
791 for (auto &Arg : F.args()) {
792 unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
793 if (Arg.hasAttribute(Attribute::InReg)) {
794 WaveDispatchNumSGPR += NumRegs;
795 } else {
796 // If this is a PS shader and we're processing the PS Input args (first
797 // 16 VGPR), use the InputEna and InputAddr bits to define how many
798 // VGPRs are actually used.
799 // Any extra VGPR arguments are handled as normal arguments (and
800 // contribute to the VGPR count whether they're used or not).
801 if (IsPixelShader && PSArgCount < 16) {
802 if ((1 << PSArgCount) & InputAddr) {
803 if (PSArgCount < LastEna)
804 WaveDispatchNumVGPR += NumRegs;
805 else
806 IntermediateVGPR += NumRegs;
807 }
808 PSArgCount++;
809 } else {
810 // If there are extra arguments we have to include the allocation for
811 // the non-used (but enabled with InputAddr) input arguments
812 if (IntermediateVGPR) {
813 WaveDispatchNumVGPR += IntermediateVGPR;
814 IntermediateVGPR = 0;
815 }
816 WaveDispatchNumVGPR += NumRegs;
817 }
818 }
819 }
820 ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR);
821 ProgInfo.NumArchVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);
822 ProgInfo.NumVGPR =
823 Info.getTotalNumVGPRs(STM, Info.NumAGPR, ProgInfo.NumArchVGPR);
824 }
825
826 // Adjust number of registers used to meet default/requested minimum/maximum
827 // number of waves per execution unit request.
828 ProgInfo.NumSGPRsForWavesPerEU = std::max(
829 std::max(ProgInfo.NumSGPR, 1u), STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()));
830 ProgInfo.NumVGPRsForWavesPerEU = std::max(
831 std::max(ProgInfo.NumVGPR, 1u), STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()));
832
834 STM.hasSGPRInitBug()) {
835 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
836 if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
837 // This can happen due to a compiler bug or when using inline asm to use
838 // the registers which are usually reserved for vcc etc.
840 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
841 ProgInfo.NumSGPR, MaxAddressableNumSGPRs,
843 Ctx.diagnose(Diag);
844 ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
845 ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
846 }
847 }
848
849 if (STM.hasSGPRInitBug()) {
850 ProgInfo.NumSGPR =
852 ProgInfo.NumSGPRsForWavesPerEU =
854 }
855
856 if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
858 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
859 MFI->getNumUserSGPRs(),
861 Ctx.diagnose(Diag);
862 }
863
864 if (MFI->getLDSSize() >
865 static_cast<unsigned>(STM.getAddressableLocalMemorySize())) {
868 MF.getFunction(), "local memory", MFI->getLDSSize(),
870 Ctx.diagnose(Diag);
871 }
872
874 &STM, ProgInfo.NumSGPRsForWavesPerEU);
876 &STM, ProgInfo.NumVGPRsForWavesPerEU);
877
878 const SIModeRegisterDefaults Mode = MFI->getMode();
879
880 // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
881 // register.
882 ProgInfo.FloatMode = getFPMode(Mode);
883
884 ProgInfo.IEEEMode = Mode.IEEE;
885
886 // Make the clamp modifier return 0 on NaN input.
887 ProgInfo.DX10Clamp = Mode.DX10Clamp;
888
889 unsigned LDSAlignShift;
891 // LDS is allocated in 64 dword blocks.
892 LDSAlignShift = 8;
893 } else {
894 // LDS is allocated in 128 dword blocks.
895 LDSAlignShift = 9;
896 }
897
898 ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
899 ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
900
901 ProgInfo.LDSSize = MFI->getLDSSize();
902 ProgInfo.LDSBlocks =
903 alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
904
905 // Scratch is allocated in 64-dword or 256-dword blocks.
906 unsigned ScratchAlignShift =
907 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
908 // We need to program the hardware with the amount of scratch memory that
909 // is used by the entire wave. ProgInfo.ScratchSize is the amount of
910 // scratch memory used per thread.
911 ProgInfo.ScratchBlocks = divideCeil(
912 ProgInfo.ScratchSize * STM.getWavefrontSize(), 1ULL << ScratchAlignShift);
913
914 if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
915 ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
916 ProgInfo.MemOrdered = 1;
917 }
918
919 // 0 = X, 1 = XY, 2 = XYZ
920 unsigned TIDIGCompCnt = 0;
921 if (MFI->hasWorkItemIDZ())
922 TIDIGCompCnt = 2;
923 else if (MFI->hasWorkItemIDY())
924 TIDIGCompCnt = 1;
925
926 // The private segment wave byte offset is the last of the system SGPRs. We
927 // initially assumed it was allocated, and may have used it. It shouldn't harm
928 // anything to disable it if we know the stack isn't used here. We may still
929 // have emitted code reading it to initialize scratch, but if that's unused
930 // reading garbage should be OK.
931 ProgInfo.ScratchEnable =
932 ProgInfo.ScratchBlocks > 0 || ProgInfo.DynamicCallStack;
933 ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
934 // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
935 ProgInfo.TrapHandlerEnable =
936 STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled();
937 ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
938 ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
939 ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
940 ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
941 ProgInfo.TIdIGCompCount = TIDIGCompCnt;
942 ProgInfo.EXCPEnMSB = 0;
943 // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
944 ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
945 ProgInfo.EXCPEnable = 0;
946
947 if (STM.hasGFX90AInsts()) {
949 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
950 ProgInfo.AccumOffset);
952 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
953 ProgInfo.TgSplit);
954 }
955
956 ProgInfo.Occupancy = STM.computeOccupancy(MF.getFunction(), ProgInfo.LDSSize,
957 ProgInfo.NumSGPRsForWavesPerEU,
958 ProgInfo.NumVGPRsForWavesPerEU);
959}
960
961static unsigned getRsrcReg(CallingConv::ID CallConv) {
962 switch (CallConv) {
963 default: [[fallthrough]];
971 }
972}
973
974void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
975 const SIProgramInfo &CurrentProgramInfo) {
978 unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
979
982
983 OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc1());
984
986 OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc2());
987
989 OutStreamer->emitInt32(
990 STM.getGeneration() >= AMDGPUSubtarget::GFX11
991 ? S_00B860_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
992 : S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
993
994 // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
995 // 0" comment but I don't see a corresponding field in the register spec.
996 } else {
997 OutStreamer->emitInt32(RsrcReg);
998 OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
999 S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
1001 OutStreamer->emitInt32(
1002 STM.getGeneration() >= AMDGPUSubtarget::GFX11
1003 ? S_0286E8_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
1004 : S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
1005 }
1006
1009 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1010 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1011 : CurrentProgramInfo.LDSBlocks;
1012 OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1014 OutStreamer->emitInt32(MFI->getPSInputEnable());
1016 OutStreamer->emitInt32(MFI->getPSInputAddr());
1017 }
1018
1019 OutStreamer->emitInt32(R_SPILLED_SGPRS);
1020 OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
1021 OutStreamer->emitInt32(R_SPILLED_VGPRS);
1022 OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
1023}
1024
1025// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
1026// is AMDPAL. It stores each compute/SPI register setting and other PAL
1027// metadata items into the PALMD::Metadata, combining with any provided by the
1028// frontend as LLVM metadata. Once all functions are written, the PAL metadata
1029// is then written as a single block in the .note section.
//
// MF is the function being described; its calling convention (CC) is the key
// passed to every per-stage setter below. CurrentProgramInfo supplies the
// register counts, mode bits, scratch size and LDS block count that are
// recorded. Two encodings are produced depending on the PAL metadata major
// version: below 3 the packed RSRC1/RSRC2 values are stored, from 3 onwards
// each setting is stored under its own named field.
//
// NOTE(review): `MFI` is used below but its declaration is not visible in this
// listing, and the closing brace that follows the graphics-register section
// has no visible opening statement. Confirm both against the upstream file
// before relying on this text.
1030void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
1031 const SIProgramInfo &CurrentProgramInfo) {
1033 auto CC = MF.getFunction().getCallingConv();
1034 auto MD = getTargetStreamer()->getPALMetadata();
1035
1037 MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1038
1039 // Only set AGPRs for supported devices
1040 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1041 if (STM.hasMAIInsts()) {
1042 MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
1043 }
1044
1045 MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU);
 // Legacy encoding (major version < 3): store the packed register values.
1046 if (MD->getPALMajorVersion() < 3) {
1047 MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC));
1048 if (AMDGPU::isCompute(CC)) {
1049 MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2());
1050 } else {
 // Non-compute stages only write RSRC2 when scratch is in use, and then
 // only with the SCRATCH_EN bit set.
1051 if (CurrentProgramInfo.ScratchBlocks > 0)
1052 MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
1053 }
1054 } else {
 // Version 3 and above: each mode bit becomes a named boolean
 // hardware-stage field.
1055 MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
1056 MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
1057 MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
1058 MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
1059
1060 if (AMDGPU::isCompute(CC)) {
1061 MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable);
1062 MD->setHwStage(CC, ".trap_present",
1063 (bool)CurrentProgramInfo.TrapHandlerEnable);
1064
1065 // EXCPEnMSB?
 // .lds_size is LdsSize scaled by 128 and by sizeof(uint32_t), i.e. a
 // byte count; presumably LdsSize is held in 128-dword granules (confirm).
1066 const unsigned LdsDwGranularity = 128;
1067 MD->setHwStage(CC, ".lds_size",
1068 (unsigned)(CurrentProgramInfo.LdsSize * LdsDwGranularity *
1069 sizeof(uint32_t)));
1070 MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
1071 } else {
1072 MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable);
1073 }
1074 }
1075
1076 // ScratchSize is in bytes, 16 aligned.
1077 MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16));
 // From GFX11 onwards the extra-LDS value is half the LDS block count,
 // rounded up; earlier generations use the block count unchanged.
1079 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1080 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1081 : CurrentProgramInfo.LDSBlocks;
1082 if (MD->getPALMajorVersion() < 3) {
1083 MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1084 MD->setSpiPsInputEna(MFI->getPSInputEnable());
1085 MD->setSpiPsInputAddr(MFI->getPSInputAddr());
1086 } else {
1087 // Graphics registers
 // Scale factor is 256 from GFX11 onwards and 128 before; the product is
 // further multiplied by sizeof(uint32_t) before being stored.
1088 const unsigned ExtraLdsDwGranularity =
1089 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
1090 MD->setGraphicsRegisters(
1091 ".ps_extra_lds_size",
1092 (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));
1093
1094 // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
 // Entry Idx of this table names bit Idx of both masks read below.
1095 static StringLiteral const PsInputFields[] = {
1096 ".persp_sample_ena", ".persp_center_ena",
1097 ".persp_centroid_ena", ".persp_pull_model_ena",
1098 ".linear_sample_ena", ".linear_center_ena",
1099 ".linear_centroid_ena", ".line_stipple_tex_ena",
1100 ".pos_x_float_ena", ".pos_y_float_ena",
1101 ".pos_z_float_ena", ".pos_w_float_ena",
1102 ".front_face_ena", ".ancillary_ena",
1103 ".sample_coverage_ena", ".pos_fixed_pt_ena"};
1104 unsigned PSInputEna = MFI->getPSInputEnable();
1105 unsigned PSInputAddr = MFI->getPSInputAddr();
 // Expand each mask into one boolean per named field.
1106 for (auto [Idx, Field] : enumerate(PsInputFields)) {
1107 MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
1108 (bool)((PSInputEna >> Idx) & 1));
1109 MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
1110 (bool)((PSInputAddr >> Idx) & 1));
1111 }
1112 }
1113 }
1114
1115 // For version 3 and above the wave front size is already set in the metadata
1116 if (MD->getPALMajorVersion() < 3 && STM.isWave32())
1117 MD->setWave32(MF.getFunction().getCallingConv());
1118}
1119
1120void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1121 auto *MD = getTargetStreamer()->getPALMetadata();
1122 const MachineFrameInfo &MFI = MF.getFrameInfo();
1123 StringRef FnName = MF.getFunction().getName();
1124 MD->setFunctionScratchSize(FnName, MFI.getStackSize());
1125
1126 // Set compute registers
1127 MD->setRsrc1(CallingConv::AMDGPU_CS,
1128 CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS));
1129 MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.getComputePGMRSrc2());
1130
1131 // Set optional info
1132 MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
1133 MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1134 MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
1135}
1136
1137// This is supposed to be log2(Size)
1139 switch (Size) {
1140 case 4:
1141 return AMD_ELEMENT_4_BYTES;
1142 case 8:
1143 return AMD_ELEMENT_8_BYTES;
1144 case 16:
1145 return AMD_ELEMENT_16_BYTES;
1146 default:
1147 llvm_unreachable("invalid private_element_size");
1148 }
1149}
1150
1151void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
1152 const SIProgramInfo &CurrentProgramInfo,
1153 const MachineFunction &MF) const {
1154 const Function &F = MF.getFunction();
1155 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1156 F.getCallingConv() == CallingConv::SPIR_KERNEL);
1157
1159 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1160
1162
1164 CurrentProgramInfo.getComputePGMRSrc1() |
1165 (CurrentProgramInfo.getComputePGMRSrc2() << 32);
1167
1168 if (CurrentProgramInfo.DynamicCallStack)
1170
1173 getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
1174
1175 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
1176 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
1177 Out.code_properties |=
1179 }
1180
1181 if (UserSGPRInfo.hasDispatchPtr())
1183
1184 if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5)
1186
1187 if (UserSGPRInfo.hasKernargSegmentPtr())
1189
1190 if (UserSGPRInfo.hasDispatchID())
1192
1193 if (UserSGPRInfo.hasFlatScratchInit())
1195
1196 if (UserSGPRInfo.hasDispatchPtr())
1198
1199 if (STM.isXNACKEnabled())
1201
1202 Align MaxKernArgAlign;
1203 Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
1204 Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1205 Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1206 Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1207 Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1208
1209 // kernarg_segment_alignment is specified as log of the alignment.
1210 // The minimum alignment is 16.
1211 // FIXME: The metadata treats the minimum as 4?
1212 Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
1213}
1214
1216 const char *ExtraCode, raw_ostream &O) {
1217 // First try the generic code, which knows about modifiers like 'c' and 'n'.
1218 if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
1219 return false;
1220
1221 if (ExtraCode && ExtraCode[0]) {
1222 if (ExtraCode[1] != 0)
1223 return true; // Unknown modifier.
1224
1225 switch (ExtraCode[0]) {
1226 case 'r':
1227 break;
1228 default:
1229 return true;
1230 }
1231 }
1232
1233 // TODO: Should be able to support other operand types like globals.
1234 const MachineOperand &MO = MI->getOperand(OpNo);
1235 if (MO.isReg()) {
1238 return false;
1239 } else if (MO.isImm()) {
1240 int64_t Val = MO.getImm();
1242 O << Val;
1243 } else if (isUInt<16>(Val)) {
1244 O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
1245 } else if (isUInt<32>(Val)) {
1246 O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
1247 } else {
1248 O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
1249 }
1250 return false;
1251 }
1252 return true;
1253}
1254
1259}
1260
// Emit the resource statistics held in CurrentProgramInfo for MF as a series
// of optimization-analysis remarks, all under the remark pass name
// "kernel-resource-usage". One remark is emitted per statistic; the first one
// carries the function name and the rest are prefixed with Indent.
//
// Nothing is emitted when ORE is null. The AGPR count is reported only when
// hasMAIInsts is true, and the LDS size only when isModuleEntryFunction is
// true.
//
// NOTE(review): the check guarding the second early return, and one argument
// of the remark constructor, are not visible in this listing; confirm against
// the upstream file.
1261void AMDGPUAsmPrinter::emitResourceUsageRemarks(
1262 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
1263 bool isModuleEntryFunction, bool hasMAIInsts) {
1264 if (!ORE)
1265 return;
1266
1267 const char *Name = "kernel-resource-usage";
1268 const char *Indent = " ";
1269
1270 // If the remark is not specifically enabled, do not output to yaml
1273 return;
1274
 // Helper: emit a single remark "<label>: <value>". RemarkName is also used
 // as the key of the remark's named value.
1275 auto EmitResourceUsageRemark = [&](StringRef RemarkName,
1276 StringRef RemarkLabel, auto Argument) {
1277 // Add an indent for every line besides the line with the kernel name. This
1278 // makes it easier to tell which resource usage go with which kernel since
1279 // the kernel name will always be displayed first.
1280 std::string LabelStr = RemarkLabel.str() + ": ";
1281 if (!RemarkName.equals("FunctionName"))
1282 LabelStr = Indent + LabelStr;
1283
1284 ORE->emit([&]() {
1285 return MachineOptimizationRemarkAnalysis(Name, RemarkName,
1287 &MF.front())
1288 << LabelStr << ore::NV(RemarkName, Argument);
1289 });
1290 };
1291
1292 // FIXME: Formatting here is pretty nasty because clang does not accept
1293 // newlines from diagnostics. This forces us to emit multiple diagnostic
1294 // remarks to simulate newlines. If and when clang does accept newlines, this
1295 // formatting should be aggregated into one remark with newlines to avoid
1296 // printing multiple diagnostic location and diag opts.
1297 EmitResourceUsageRemark("FunctionName", "Function Name",
1298 MF.getFunction().getName());
1299 EmitResourceUsageRemark("NumSGPR", "SGPRs", CurrentProgramInfo.NumSGPR);
 // The "VGPRs" line reports NumArchVGPR; the accumulator registers get their
 // own "AGPRs" line just below.
1300 EmitResourceUsageRemark("NumVGPR", "VGPRs", CurrentProgramInfo.NumArchVGPR);
1301 if (hasMAIInsts)
1302 EmitResourceUsageRemark("NumAGPR", "AGPRs", CurrentProgramInfo.NumAccVGPR);
1303 EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
1304 CurrentProgramInfo.ScratchSize);
 // The dynamic-stack flag is reported as the text "True" or "False".
1305 StringRef DynamicStackStr =
1306 CurrentProgramInfo.DynamicCallStack ? "True" : "False";
1307 EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
1308 EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
1309 CurrentProgramInfo.Occupancy);
1310 EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
1311 CurrentProgramInfo.SGPRSpill);
1312 EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
1313 CurrentProgramInfo.VGPRSpill);
1314 if (isModuleEntryFunction)
1315 EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
1316 CurrentProgramInfo.LDSSize);
1317}
#define Success
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter()
static unsigned getRsrcReg(CallingConv::ID CallConv)
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size)
static uint32_t getFPMode(SIModeRegisterDefaults Mode)
static AsmPrinter * createAMDGPUAsmPrinterPass(TargetMachine &tm, std::unique_ptr< MCStreamer > &&Streamer)
AMDGPU Assembly printer class.
AMDGPU HSA Metadata Streamer.
Analyzes how many registers and other resources are used by functions.
AMDHSA kernel descriptor definitions.
#define AMDHSA_BITS_GET(SRC, MSK)
#define AMDHSA_BITS_SET(DST, MSK, VAL)
amd_element_byte_size_t
The values used to define the number of bytes to use for the swizzle element size.
@ AMD_ELEMENT_8_BYTES
@ AMD_ELEMENT_16_BYTES
@ AMD_ELEMENT_4_BYTES
#define AMD_HSA_BITS_SET(dst, mask, val)
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID
@ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR
@ AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED
@ AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT
@ AMD_CODE_PROPERTY_IS_PTR64
@ AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_EXTERNAL_VISIBILITY
Definition: Compiler.h:135
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
std::string Name
uint64_t Size
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition: MD5.cpp:55
===- MachineOptimizationRemarkEmitter.h - Opt Diagnostics -*- C++ -*-—===//
LLVMContext & Context
const char LLVMTargetMachineRef TM
R600 Assembly printer class.
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS
Definition: SIDefines.h:966
#define S_0286E8_WAVESIZE_PreGFX11(x)
Definition: SIDefines.h:1102
#define R_0286E8_SPI_TMPRING_SIZE
Definition: SIDefines.h:1101
#define S_00B84C_SCRATCH_EN(x)
Definition: SIDefines.h:1000
#define S_0286E8_WAVESIZE_GFX11Plus(x)
Definition: SIDefines.h:1103
#define FP_ROUND_MODE_DP(x)
Definition: SIDefines.h:1084
#define S_00B860_WAVESIZE_GFX11Plus(x)
Definition: SIDefines.h:1099
#define FP_ROUND_ROUND_TO_NEAREST
Definition: SIDefines.h:1076
#define R_0286D0_SPI_PS_INPUT_ADDR
Definition: SIDefines.h:1037
#define R_00B860_COMPUTE_TMPRING_SIZE
Definition: SIDefines.h:1097
#define R_00B428_SPI_SHADER_PGM_RSRC1_HS
Definition: SIDefines.h:989
#define R_00B328_SPI_SHADER_PGM_RSRC1_ES
Definition: SIDefines.h:988
#define S_00B028_SGPRS(x)
Definition: SIDefines.h:968
#define R_00B528_SPI_SHADER_PGM_RSRC1_LS
Definition: SIDefines.h:997
#define R_0286CC_SPI_PS_INPUT_ENA
Definition: SIDefines.h:1036
#define S_00B860_WAVESIZE_PreGFX11(x)
Definition: SIDefines.h:1098
#define S_00B028_VGPRS(x)
Definition: SIDefines.h:967
#define R_00B128_SPI_SHADER_PGM_RSRC1_VS
Definition: SIDefines.h:975
#define FP_DENORM_MODE_DP(x)
Definition: SIDefines.h:1095
#define R_00B848_COMPUTE_PGM_RSRC1
Definition: SIDefines.h:1039
#define R_SPILLED_SGPRS
Definition: SIDefines.h:1114
#define FP_ROUND_MODE_SP(x)
Definition: SIDefines.h:1083
#define FP_DENORM_MODE_SP(x)
Definition: SIDefines.h:1094
#define R_00B228_SPI_SHADER_PGM_RSRC1_GS
Definition: SIDefines.h:980
#define R_SPILLED_VGPRS
Definition: SIDefines.h:1115
#define S_00B02C_EXTRA_LDS_SIZE(x)
Definition: SIDefines.h:974
#define R_00B84C_COMPUTE_PGM_RSRC2
Definition: SIDefines.h:999
#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS
Definition: SIDefines.h:973
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
void emitFunctionEntryLabel() override
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
const MCSubtargetInfo * getGlobalSTI() const
void emitImplicitDef(const MachineInstr *MI) const override
Targets can override this to customize the output of IMPLICIT_DEF instructions in verbose mode.
std::vector< std::string > DisasmLines
void emitStartOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the start of their fi...
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
std::vector< std::string > HexLines
bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const override
Return true if the basic block has exactly one predecessor and the control transfer mechanism between...
void emitGlobalVariable(const GlobalVariable *GV) override
Emit the specified global variable to the .s file.
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) override
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
bool runOnMachineFunction(MachineFunction &MF) override
Emit the specified function out to the OutStreamer.
void emitFunctionBodyEnd() override
Targets can override this to emit stuff after the last basic block in the function.
bool doFinalization(Module &M) override
Shut down the asmprinter.
void emitEndOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the end of their file...
AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr< MCStreamer > Streamer)
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
void emitFunctionBodyStart() override
Targets can override this to emit stuff before the first basic block in the function.
void emitBasicBlockStart(const MachineBasicBlock &MBB) override
Targets can override this to emit stuff at the start of a basic block.
AMDGPUTargetStreamer * getTargetStreamer() const
static void printRegOperand(unsigned RegNo, raw_ostream &O, const MCRegisterInfo &MRI)
void setEntryPoint(unsigned CC, StringRef Name)
unsigned getAddressableLocalMemorySize() const
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
unsigned getWavefrontSize() const
AMDGPUPALMetadata * getPALMetadata()
virtual void EmitAMDKernelCodeT(const amd_kernel_code_t &Header)
virtual bool EmitCodeEnd(const MCSubtargetInfo &STI)
virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type)
virtual void EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr, unsigned CodeObjectVersion)
virtual void EmitDirectiveAMDGCNTarget()
const std::optional< AMDGPU::IsaInfo::AMDGPUTargetID > & getTargetID() const
virtual bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI)
void initializeTargetID(const MCSubtargetInfo &STI, unsigned CodeObjectVersion)
void setXnackSetting(TargetIDSetting NewXnackSetting)
Sets xnack setting to NewXnackSetting.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
This class represents an incoming formal argument to a Function.
Definition: Argument.h:28
This class is intended to be used as a driving class for all asm writers.
Definition: AsmPrinter.h:85
const TargetLoweringObjectFile & getObjFileLowering() const
Return information about object file lowering.
Definition: AsmPrinter.cpp:381
MCSymbol * getSymbol(const GlobalValue *GV) const
Definition: AsmPrinter.cpp:671
virtual void emitGlobalVariable(const GlobalVariable *GV)
Emit the specified global variable to the .s file.
Definition: AsmPrinter.cpp:693
TargetMachine & TM
Target machine description.
Definition: AsmPrinter.h:88
MachineFunction * MF
The current machine function.
Definition: AsmPrinter.h:103
virtual void SetupMachineFunction(MachineFunction &MF)
This should be called when a new MachineFunction is being processed from runOnMachineFunction.
void emitFunctionBody()
This method emits the body and trailer for a function.
virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const
Return true if the basic block has exactly one predecessor and the control transfer mechanism between...
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
Definition: AsmPrinter.cpp:431
virtual void emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const
This emits linkage information about GVSym based on GV, if this is supported by the target.
Definition: AsmPrinter.cpp:626
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
Definition: AsmPrinter.cpp:423
unsigned getFunctionNumber() const
Return a unique ID for the current function.
Definition: AsmPrinter.cpp:377
MachineOptimizationRemarkEmitter * ORE
Optimization remark emitter.
Definition: AsmPrinter.h:115
MCContext & OutContext
This is the context for the output file that we are streaming.
Definition: AsmPrinter.h:95
bool doFinalization(Module &M) override
Shut down the asmprinter.
virtual void emitBasicBlockStart(const MachineBasicBlock &MBB)
Targets can override this to emit stuff at the start of a basic block.
void emitVisibility(MCSymbol *Sym, unsigned Visibility, bool IsDefinition=true) const
This emits visibility information about symbol, if this is supported by the target.
std::unique_ptr< MCStreamer > OutStreamer
This is the MCStreamer object for the file we are generating.
Definition: AsmPrinter.h:100
bool isVerbose() const
Return true if assembly output should contain comments.
Definition: AsmPrinter.h:270
void getNameWithPrefix(SmallVectorImpl< char > &Name, const GlobalValue *GV) const
Definition: AsmPrinter.cpp:666
virtual void emitFunctionEntryLabel()
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &OS)
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
Diagnostic information for stack size etc.
DISubprogram * getSubprogram() const
Get the attached subprogram.
Definition: Metadata.cpp:1727
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:239
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:320
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
bool hasGFX90AInsts() const
unsigned computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Return occupancy for the given function.
bool hasMAIInsts() const
Definition: GCNSubtarget.h:757
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:235
bool hasSGPRInitBug() const
Definition: GCNSubtarget.h:996
bool isTgSplitEnabled() const
Definition: GCNSubtarget.h:572
unsigned getMinNumVGPRs(unsigned WavesPerEU) const
bool isCuModeEnabled() const
Definition: GCNSubtarget.h:576
const AMDGPU::IsaInfo::AMDGPUTargetID & getTargetID() const
Definition: GCNSubtarget.h:271
bool dumpCode() const
Definition: GCNSubtarget.h:476
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:564
bool isWave32() const
unsigned getMaxNumUserSGPRs() const
Definition: GCNSubtarget.h:863
Generation getGeneration() const
Definition: GCNSubtarget.h:286
unsigned getAddressableNumSGPRs() const
unsigned getMaxWaveScratchSize() const
Definition: GCNSubtarget.h:290
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
MaybeAlign getAlign() const
Returns the alignment of the given variable or function.
Definition: GlobalObject.h:79
VisibilityTypes getVisibility() const
Definition: GlobalValue.h:244
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:273
unsigned getAddressSpace() const
Definition: GlobalValue.h:201
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:652
Type * getValueType() const
Definition: GlobalValue.h:292
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
bool hasInitializer() const
Definitions have initializers, declarations don't.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
const DiagnosticHandler * getDiagHandlerPtr() const
getDiagHandlerPtr - Returns const raw pointer of DiagnosticHandler set by setDiagnosticHandler.
MCCodeEmitter * getEmitterPtr() const
Definition: MCAssembler.h:325
Context object for machine code objects.
Definition: MCContext.h:76
void reportError(SMLoc L, const Twine &Msg)
Definition: MCContext.cpp:1059
MCContext & getContext() const
This represents a section on linux, lots of unix variants and some bare metal systems.
Definition: MCSectionELF.h:26
Generic base class for all target subtargets.
const Triple & getTargetTriple() const
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:41
bool isDefined() const
isDefined - Check if this symbol is defined (i.e., it has an address).
Definition: MCSymbol.h:251
StringRef getName() const
getName - Get the symbol name.
Definition: MCSymbol.h:206
bool isVariable() const
isVariable - Check if this is a variable symbol.
Definition: MCSymbol.h:301
void redefineIfPossible()
Prepare this symbol to be redefined.
Definition: MCSymbol.h:233
MCStreamer & getStreamer()
Definition: MCStreamer.h:101
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
void setAlignment(Align A)
setAlignment - Set the alignment of the function.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
Representation of each machine instruction.
Definition: MachineInstr.h:68
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:543
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
Diagnostic information for optimization analysis remarks.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:254
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
This class keeps track of the SPI_PS_INPUT_ADDR config register, which tells the hardware which inter...
GCNUserSGPRUsageInfo & getUserSGPRInfo()
SIModeRegisterDefaults getMode() const
unsigned getNumKernargPreloadedSGPRs() const
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:857
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
std::string str() const
str - Get the contents as an std::string.
Definition: StringRef.h:222
bool equals(StringRef RHS) const
equals - Check for string equality, this is more efficient than compare() when the relative ordering ...
Definition: StringRef.h:164
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:78
const Triple & getTargetTriple() const
const MCSubtargetInfo * getMCSubtargetInfo() const
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:364
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:672
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ LOCAL_ADDRESS
Address space for local memory.
Definition: AMDGPU.h:395
unsigned getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs, std::optional< bool > EnableWavefrontSize32)
unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed, bool FlatScrUsed, bool XNACKUsed)
unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs)
void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, const MCSubtargetInfo *STI)
unsigned getCodeObjectVersion(const Module &M)
IsaVersion getIsaVersion(StringRef GPU)
bool isCompute(CallingConv::ID cc)
bool isGFX90A(const MCSubtargetInfo &STI)
bool hasMAIInsts(const MCSubtargetInfo &STI)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
unsigned hasKernargPreload(const MCSubtargetInfo &STI)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:194
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:185
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:197
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:203
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:188
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:191
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:141
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:215
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:210
@ SHT_PROGBITS
Definition: ELF.h:1000
@ STT_AMDGPU_HSA_KERNEL
Definition: ELF.h:1268
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1685
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:414
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2338
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
Target & getTheR600Target()
The target for R600 GPUs.
@ DK_ResourceLimit
AsmPrinter * createR600AsmPrinterPass(TargetMachine &TM, std::unique_ptr< MCStreamer > &&Streamer)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:313
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
Target & getTheGCNTarget()
The target for GCN GPUs.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1854
@ DS_Error
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
AMD Kernel Code Object (amd_kernel_code_t).
uint16_t workitem_vgpr_count
Number of vector registers used by each work-item.
uint32_t code_properties
Code properties.
uint8_t kernarg_segment_alignment
The maximum byte alignment of variables used by the kernel in the specified memory segment.
uint32_t workgroup_group_segment_byte_size
The amount of group segment memory required by a work-group in bytes.
uint16_t wavefront_sgpr_count
Number of scalar registers used by a wavefront.
uint32_t workitem_private_segment_byte_size
The amount of memory required for the combined private, spill and arg segments for a work-item in bytes...
uint64_t kernarg_segment_byte_size
The size in bytes of the kernarg segment that holds the values of the arguments to the kernel.
uint64_t compute_pgm_resource_registers
Shader program settings for CS.
const SIFunctionResourceInfo & getResourceInfo(const Function *F) const
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
virtual bool isAnalysisRemarkEnabled(StringRef PassName) const
Return true if analysis remarks are enabled, override to provide different implementation.
Track resource usage for kernels / entry functions.
Definition: SIProgramInfo.h:25
uint64_t getPGMRSrc1(CallingConv::ID CC) const
uint32_t NumSGPRsForWavesPerEU
Definition: SIProgramInfo.h:70
uint32_t TrapHandlerEnable
Definition: SIProgramInfo.h:46
uint64_t getComputePGMRSrc2() const
Compute the value of the ComputePGMRsrc2 register.
uint32_t NumVGPRsForWavesPerEU
Definition: SIProgramInfo.h:73
uint64_t ComputePGMRSrc3GFX90A
Definition: SIProgramInfo.h:56
uint64_t getComputePGMRSrc1() const
Compute the value of the ComputePGMRsrc1 register.
static void RegisterAsmPrinter(Target &T, Target::AsmPrinterCtorTy Fn)
RegisterAsmPrinter - Register an AsmPrinter implementation for the given target.