LLVM 23.0.0git
AMDGPUAsmPrinter.cpp
Go to the documentation of this file.
1//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10///
11/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
12/// code. When passed an MCAsmStreamer it prints assembly and when passed
13/// an MCObjectStreamer it outputs binary code.
14//
15//===----------------------------------------------------------------------===//
16//
17
18#include "AMDGPUAsmPrinter.h"
19#include "AMDGPU.h"
23#include "AMDGPUTargetMachine.h"
24#include "GCNSubtarget.h"
29#include "R600AsmPrinter.h"
35#include "llvm/ADT/StringSet.h"
43#include "llvm/MC/MCAssembler.h"
44#include "llvm/MC/MCContext.h"
46#include "llvm/MC/MCStreamer.h"
47#include "llvm/MC/MCValue.h"
54
55using namespace llvm;
56using namespace llvm::AMDGPU;
57
58// This should get the default rounding mode from the kernel. We just set the
59// default here, but this could change if the OpenCL rounding mode pragmas are
60// used.
61//
62// The denormal mode here should match what is reported by the OpenCL runtime
63// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
64// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
65//
66// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
67// precision, and leaves single precision to flush all and does not report
68// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
69// CL_FP_DENORM for both.
70//
71// FIXME: It seems some instructions do not support single precision denormals
72// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
73// and sin_f32, cos_f32 on most parts).
74
75// We want to use these instructions, and using fp32 denormals also causes
76// instructions to run at the double precision rate for the device so it's
77// probably best to just report no single precision denormals.
84
85static AsmPrinter *
87 std::unique_ptr<MCStreamer> &&Streamer) {
88 return new AMDGPUAsmPrinter(tm, std::move(Streamer));
89}
90
98
99namespace {
100class AMDGPUAsmPrinterHandler : public AsmPrinterHandler {
101protected:
102 AMDGPUAsmPrinter *Asm;
103
104public:
105 AMDGPUAsmPrinterHandler(AMDGPUAsmPrinter *A) : Asm(A) {}
106
107 void beginFunction(const MachineFunction *MF) override {}
108
109 void endFunction(const MachineFunction *MF) override { Asm->endFunction(MF); }
110
111 void endModule() override {}
112};
113} // End anonymous namespace
114
116 std::unique_ptr<MCStreamer> Streamer)
117 : AsmPrinter(TM, std::move(Streamer)) {
118 assert(OutStreamer && "AsmPrinter constructed without streamer");
119}
120
122 return "AMDGPU Assembly Printer";
123}
124
126 return &TM.getMCSubtargetInfo();
127}
128
130 if (!OutStreamer)
131 return nullptr;
132 return static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
133}
134
138
139void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
141
142 // TODO: Which one is called first, emitStartOfAsmFile or
143 // emitFunctionBodyStart?
144 if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
145 initializeTargetID(M);
146
149 return;
150
152
155 CodeObjectVersion);
156 HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
157 }
158
161}
162
164 // Init target streamer if it has not yet happened
166 initTargetStreamer(M);
167
168 if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
170
171 // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
172 // Emit HSA Metadata (NT_AMD_HSA_METADATA).
173 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
174 HSAMetadataStream->end();
175 bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
176 (void)Success;
177 assert(Success && "Malformed HSA Metadata");
178 }
179}
180
182 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
183 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
184 const Function &F = MF->getFunction();
185
186 // TODO: We're checking this late, would be nice to check it earlier.
187 if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
189 STM.getCPU() + " is only available on code object version 6 or better");
190 }
191
192 // TODO: Which one is called first, emitStartOfAsmFile or
193 // emitFunctionBodyStart?
194 if (!getTargetStreamer()->getTargetID())
195 initializeTargetID(*F.getParent());
196
197 const auto &FunctionTargetID = STM.getTargetID();
198 // Make sure function's xnack settings are compatible with module's
199 // xnack settings.
200 if (FunctionTargetID.isXnackSupported() &&
201 FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
202 FunctionTargetID.getXnackSetting() !=
203 getTargetStreamer()->getTargetID()->getXnackSetting()) {
204 OutContext.reportError(
205 {}, "xnack setting of '" + Twine(MF->getName()) +
206 "' function does not match module xnack setting");
207 return;
208 }
209 // Make sure function's sramecc settings are compatible with module's
210 // sramecc settings.
211 if (FunctionTargetID.isSramEccSupported() &&
212 FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
213 FunctionTargetID.getSramEccSetting() !=
214 getTargetStreamer()->getTargetID()->getSramEccSetting()) {
215 OutContext.reportError(
216 {}, "sramecc setting of '" + Twine(MF->getName()) +
217 "' function does not match module sramecc setting");
218 return;
219 }
220
221 if (!MFI.isEntryFunction())
222 return;
223
224 if (STM.isMesaKernel(F) &&
225 (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
226 F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
227 AMDGPUMCKernelCodeT KernelCode;
228 getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
229 KernelCode.validate(&STM, MF->getContext());
231 }
232
233 if (STM.isAmdHsaOS())
234 HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
235}
236
237/// Set bits in a kernel descriptor MCExpr field:
238/// return ((Dst & ~Mask) | (Value << Shift))
239static const MCExpr *setBits(const MCExpr *Dst, const MCExpr *Value,
240 uint32_t Mask, uint32_t Shift, MCContext &Ctx) {
241 const auto *Shft = MCConstantExpr::create(Shift, Ctx);
242 const auto *Msk = MCConstantExpr::create(Mask, Ctx);
243 Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
245 Ctx);
246 return Dst;
247}
248
250 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
251 if (!MFI.isEntryFunction())
252 return;
253
254 assert(TM.getTargetTriple().getOS() == Triple::AMDHSA);
255
256 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
257 MCContext &Ctx = MF->getContext();
258
260 getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo);
261
262 // Compute inst_pref_size using MCExpr label subtraction for exact code
263 // size. At this point .Lfunc_end has been emitted (by the base AsmPrinter)
264 // right after the function code, so (Lfunc_end - func_sym) gives the
265 // exact function code size in bytes.
266 if (STM.hasInstPrefSize()) {
267 const MCExpr *CodeSizeExpr = MCBinaryExpr::createSub(
270
271 uint32_t Mask, Shift, Width, CacheLineSize;
272 STM.getInstPrefSizeArgs(Mask, Shift, Width, CacheLineSize);
273 const MCExpr *InstPrefSize =
274 AMDGPUMCExpr::createInstPrefSize(CodeSizeExpr, Ctx);
276 setBits(KD.compute_pgm_rsrc3, InstPrefSize, Mask, Shift, Ctx);
277 }
278
279 auto &Streamer = getTargetStreamer()->getStreamer();
280 auto &Context = Streamer.getContext();
281 auto &ObjectFileInfo = *Context.getObjectFileInfo();
282 auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
283
284 Streamer.pushSection();
285 Streamer.switchSection(&ReadOnlySection);
286
287 // CP microcode requires the kernel descriptor to be allocated on 64 byte
288 // alignment.
289 Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
290 ReadOnlySection.ensureMinAlignment(Align(64));
291
292 SmallString<128> KernelName;
293 getNameWithPrefix(KernelName, &MF->getFunction());
295 STM, KernelName, KD, CurrentProgramInfo.NumVGPRsForWavesPerEU,
297 CurrentProgramInfo.NumSGPRsForWavesPerEU,
299 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
300 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Context),
301 Context),
302 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
303
304 Streamer.popSection();
305}
306
308 Register RegNo = MI->getOperand(0).getReg();
309
311 raw_svector_ostream OS(Str);
312 OS << "implicit-def: "
313 << printReg(RegNo, MF->getSubtarget().getRegisterInfo());
314
315 if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
316 OS << " : SGPR spill to VGPR lane";
317
318 OutStreamer->AddComment(OS.str());
319 OutStreamer->addBlankLine();
320}
321
323 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
325 return;
326 }
327
328 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
329 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
330 if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
331 SmallString<128> SymbolName;
332 getNameWithPrefix(SymbolName, &MF->getFunction()),
335 }
336 if (DumpCodeInstEmitter) {
337 // Disassemble function name label to text.
338 DisasmLines.push_back(MF->getName().str() + ":");
339 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
340 HexLines.emplace_back("");
341 }
342
344}
345
347 if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
348 // Write a line for the basic block label if it is not only fallthrough.
349 DisasmLines.push_back((Twine("BB") + Twine(getFunctionNumber()) + "_" +
350 Twine(MBB.getNumber()) + ":")
351 .str());
352 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
353 HexLines.emplace_back("");
354 }
356}
357
360 if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
361 OutContext.reportError({},
362 Twine(GV->getName()) +
363 ": unsupported initializer for address space");
364 return;
365 }
366
367 const Triple::OSType OS = TM.getTargetTriple().getOS();
368 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
370 return;
371 // With object linking, LDS definitions should have been externalized
372 // by earlier passes (e.g. LDS lowering, named barrier lowering).
373 // Only declarations reach here, emitted as SHN_AMDGPU_LDS symbols
374 // so the linker can assign their offsets.
375 assert(GV->isDeclaration() &&
376 "LDS definitions should have been externalized when object "
377 "linking is enabled");
378 }
379
380 MCSymbol *GVSym = getSymbol(GV);
381
382 GVSym->redefineIfPossible();
383 if (GVSym->isDefined() || GVSym->isVariable())
384 report_fatal_error("symbol '" + Twine(GVSym->getName()) +
385 "' is already defined");
386
387 const DataLayout &DL = GV->getDataLayout();
389 Align Alignment = GV->getAlign().value_or(Align(4));
390
391 emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
392 emitLinkage(GV, GVSym);
393 auto *TS = getTargetStreamer();
394 TS->emitAMDGPULDS(GVSym, Size, Alignment);
395 return;
396 }
397
399}
400
402 CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);
403
404 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
405 switch (CodeObjectVersion) {
407 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV4>();
408 break;
410 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV5>();
411 break;
413 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV6>();
414 break;
415 default:
416 reportFatalUsageError("unsupported code object version");
417 }
418
419 addAsmPrinterHandler(std::make_unique<AMDGPUAsmPrinterHandler>(this));
420 }
421
423}
424
425/// Mimics GCNSubtarget::computeOccupancy for MCExpr.
426///
427/// Remove dependency on GCNSubtarget and depend only on the necessary values
428/// for said occupancy computation. Should match computeOccupancy implementation
429/// without passing \p STM on.
430const AMDGPUMCExpr *createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs,
431 const MCExpr *NumVGPRs,
432 unsigned DynamicVGPRBlockSize,
433 const GCNSubtarget &STM, MCContext &Ctx) {
434 unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(STM);
435 unsigned Granule = IsaInfo::getVGPRAllocGranule(STM, DynamicVGPRBlockSize);
436 unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(STM);
437
438 // Bake the per-function SGPR budget into the operands so the late-evaluated
439 // MCExpr stays arithmetic. The trap reservation in particular is implicit on
440 // amdhsa and lives on STM, not on the assembler's MCSubtargetInfo.
441 unsigned SGPRTotal = IsaInfo::getTotalNumSGPRs(STM);
442 unsigned SGPRGranule = IsaInfo::getSGPRAllocGranule(STM);
443 unsigned SGPRTrapReserve = STM.hasTrapHandler() ? IsaInfo::TRAP_NUM_SGPRS : 0;
444
445 auto CreateExpr = [&Ctx](unsigned Value) {
446 return MCConstantExpr::create(Value, Ctx);
447 };
448
449 // Zero SGPR count when SGPRs don't limit occupancy, so the MCExpr skips the
450 // SGPR term without having to test the generation itself.
451 const MCExpr *SGPRArg =
452 IsaInfo::isSGPROccupancyLimited(STM) ? NumSGPRs : CreateExpr(0);
453
455 {CreateExpr(MaxWaves), CreateExpr(Granule),
456 CreateExpr(TargetTotalNumVGPRs),
457 CreateExpr(InitOcc), CreateExpr(SGPRTotal),
458 CreateExpr(SGPRGranule),
459 CreateExpr(SGPRTrapReserve), SGPRArg, NumVGPRs},
460 Ctx);
461}
462
463void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
464 if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv()))
465 return;
466
468 const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
469 MCSymbol *FnSym = TM.getSymbol(&F);
470
471 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
472 int64_t Val;
473 if (Value->evaluateAsAbsolute(Val)) {
474 Res = Val;
475 return true;
476 }
477 return false;
478 };
479
480 const uint64_t MaxScratchPerWorkitem =
482 MCSymbol *ScratchSizeSymbol =
483 RI.getSymbol(FnSym->getName(), RIK::RIK_PrivateSegSize, OutContext);
484 uint64_t ScratchSize;
485 if (ScratchSizeSymbol->isVariable() &&
486 TryGetMCExprValue(ScratchSizeSymbol->getVariableValue(), ScratchSize) &&
487 ScratchSize > MaxScratchPerWorkitem) {
488 DiagnosticInfoStackSize DiagStackSize(F, ScratchSize, MaxScratchPerWorkitem,
489 DS_Error);
490 F.getContext().diagnose(DiagStackSize);
491 }
492
493 // Validate addressable scalar registers (i.e., prior to added implicit
494 // SGPRs).
495 MCSymbol *NumSGPRSymbol =
496 RI.getSymbol(FnSym->getName(), RIK::RIK_NumSGPR, OutContext);
498 !STM.hasSGPRInitBug()) {
499 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
500 uint64_t NumSgpr;
501 if (NumSGPRSymbol->isVariable() &&
502 TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
503 NumSgpr > MaxAddressableNumSGPRs) {
504 F.getContext().diagnose(DiagnosticInfoResourceLimit(
505 F, "addressable scalar registers", NumSgpr, MaxAddressableNumSGPRs,
507 return;
508 }
509 }
510
511 MCSymbol *VCCUsedSymbol =
512 RI.getSymbol(FnSym->getName(), RIK::RIK_UsesVCC, OutContext);
513 MCSymbol *FlatUsedSymbol =
514 RI.getSymbol(FnSym->getName(), RIK::RIK_UsesFlatScratch, OutContext);
515 uint64_t VCCUsed, FlatUsed, NumSgpr;
516
517 if (NumSGPRSymbol->isVariable() && VCCUsedSymbol->isVariable() &&
518 FlatUsedSymbol->isVariable() &&
519 TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
520 TryGetMCExprValue(VCCUsedSymbol->getVariableValue(), VCCUsed) &&
521 TryGetMCExprValue(FlatUsedSymbol->getVariableValue(), FlatUsed)) {
522
523 // Recomputes NumSgprs + implicit SGPRs but all symbols should now be
524 // resolvable.
525 NumSgpr += IsaInfo::getNumExtraSGPRs(
526 STM, VCCUsed, FlatUsed,
527 getTargetStreamer()->getTargetID()->isXnackOnOrAny());
529 STM.hasSGPRInitBug()) {
530 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
531 if (NumSgpr > MaxAddressableNumSGPRs) {
532 F.getContext().diagnose(DiagnosticInfoResourceLimit(
533 F, "scalar registers", NumSgpr, MaxAddressableNumSGPRs, DS_Error,
535 return;
536 }
537 }
538
539 MCSymbol *NumVgprSymbol =
540 RI.getSymbol(FnSym->getName(), RIK::RIK_NumVGPR, OutContext);
541 MCSymbol *NumAgprSymbol =
542 RI.getSymbol(FnSym->getName(), RIK::RIK_NumAGPR, OutContext);
543 uint64_t NumVgpr, NumAgpr;
544
545 MachineModuleInfo &MMI =
547 MachineFunction *MF = MMI.getMachineFunction(F);
548 if (MF && NumVgprSymbol->isVariable() && NumAgprSymbol->isVariable() &&
549 TryGetMCExprValue(NumVgprSymbol->getVariableValue(), NumVgpr) &&
550 TryGetMCExprValue(NumAgprSymbol->getVariableValue(), NumAgpr)) {
551 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
552 unsigned MaxWaves = MFI.getMaxWavesPerEU();
553 uint64_t TotalNumVgpr =
554 getTotalNumVGPRs(STM.hasGFX90AInsts(), NumAgpr, NumVgpr);
555 uint64_t NumVGPRsForWavesPerEU =
556 std::max({TotalNumVgpr, (uint64_t)1,
557 (uint64_t)STM.getMinNumVGPRs(
558 MaxWaves, MFI.getDynamicVGPRBlockSize())});
559 uint64_t NumSGPRsForWavesPerEU = std::max(
560 {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
561 const MCExpr *OccupancyExpr = createOccupancy(
562 STM.getOccupancyWithWorkGroupSizes(*MF).second,
563 MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
564 MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext),
566 uint64_t Occupancy;
567
568 const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
569 F, "amdgpu-waves-per-eu", {0, 0}, true);
570
571 if (TryGetMCExprValue(OccupancyExpr, Occupancy) && Occupancy < MinWEU) {
572 DiagnosticInfoOptimizationFailure Diag(
573 F, F.getSubprogram(),
574 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
575 "'" +
576 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
577 ", final occupancy is " + Twine(Occupancy));
578 F.getContext().diagnose(Diag);
579 return;
580 }
581 }
582 }
583}
584
585static void appendTypeEncoding(std::string &Enc, Type *Ty, const DataLayout &DL,
586 bool IsReturnType) {
587 if (Ty->isVoidTy()) {
588 Enc += 'v';
589 return;
590 }
591 unsigned Bits = DL.getTypeSizeInBits(Ty);
592 // Zero-sized non-void types (e.g. `{}` or `[0 x i8]`) consume no ABI
593 // registers. For returns, emit the same no-result marker as void so the
594 // parameter encoding still has an explicit return-type prefix.
595 if (Bits == 0) {
596 if (IsReturnType)
597 Enc += 'v';
598 return;
599 }
600 if (Bits <= 32)
601 Enc += 'i';
602 else if (Bits <= 64)
603 Enc += 'l';
604 else
605 Enc.append(divideCeil(Bits, 32), 'i');
606}
607
608static std::string computeTypeId(const FunctionType *FTy,
609 const DataLayout &DL) {
610 std::string Enc;
611 appendTypeEncoding(Enc, FTy->getReturnType(), DL, /*IsReturnType=*/true);
612 for (Type *ParamTy : FTy->params())
613 appendTypeEncoding(Enc, ParamTy, DL, /*IsReturnType=*/false);
614 return Enc;
615}
616
617void AMDGPUAsmPrinter::collectCallEdge(const MachineInstr &MI) {
619 return;
620 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
621 const MachineOperand *Callee =
622 TII->getNamedOperand(MI, AMDGPU::OpName::callee);
623 if (!Callee || !Callee->isGlobal())
624 return;
625 DirectCallEdges.insert(
626 {getSymbol(&MF->getFunction()), getSymbol(Callee->getGlobal())});
627}
628
629void AMDGPUAsmPrinter::emitAMDGPUInfo(Module &M) {
631 return;
632
633 const NamedMDNode *LDSMD = M.getNamedMetadata("amdgpu.lds.uses");
634 bool HasLDSUses = LDSMD && LDSMD->getNumOperands() > 0;
635
636 const NamedMDNode *BarMD = M.getNamedMetadata("amdgpu.named_barrier.uses");
637 bool HasNamedBarriers = BarMD && BarMD->getNumOperands() > 0;
638
639 // Collect address-taken functions (with type IDs) and indirect call sites.
640 DenseMap<const Function *, std::string> AddrTakenTypeIds;
641 using IndirectCallInfo = std::pair<const Function *, std::string>;
643
644 for (const Function &F : M) {
645 bool IsKernel = AMDGPU::isKernel(F.getCallingConv());
646
647 if (!IsKernel && F.hasAddressTaken(/*PutOffender=*/nullptr,
648 /*IgnoreCallbackUses=*/false,
649 /*IgnoreAssumeLikeCalls=*/true,
650 /*IgnoreLLVMUsed=*/true)) {
651 AddrTakenTypeIds[&F] =
652 computeTypeId(F.getFunctionType(), M.getDataLayout());
653 }
654
655 if (F.isDeclaration())
656 continue;
657
658 StringSet<> SeenTypeIds;
659 for (const BasicBlock &BB : F) {
660 for (const Instruction &I : BB) {
661 const auto *CB = dyn_cast<CallBase>(&I);
662 if (!CB || !CB->isIndirectCall())
663 continue;
664 std::string TId =
665 computeTypeId(CB->getFunctionType(), M.getDataLayout());
666 if (SeenTypeIds.insert(TId).second)
667 IndirectCalls.push_back({&F, std::move(TId)});
668 }
669 }
670 }
671
672 if (FunctionInfos.empty() && DirectCallEdges.empty() && !HasLDSUses &&
673 !HasNamedBarriers && AddrTakenTypeIds.empty() && IndirectCalls.empty())
674 return;
675
676 AMDGPU::InfoSectionData Data;
677 Data.Funcs = std::move(FunctionInfos);
678
679 for (auto &[F, TypeId] : AddrTakenTypeIds) {
680 MCSymbol *Sym = getSymbol(F);
681 Data.TypeIds.push_back({Sym, TypeId});
682 }
683
684 for (auto &[CallerSym, CalleeSym] : DirectCallEdges)
685 Data.Calls.push_back({CallerSym, CalleeSym});
686 DirectCallEdges.clear();
687
688 if (HasLDSUses) {
689 for (const MDNode *N : LDSMD->operands()) {
690 auto *Func = mdconst::extract<Function>(N->getOperand(0));
691 auto *LdsVar = mdconst::extract<GlobalVariable>(N->getOperand(1));
692 Data.Uses.push_back({getSymbol(Func), getSymbol(LdsVar)});
693 }
694 }
695
696 if (HasNamedBarriers) {
697 for (const MDNode *N : BarMD->operands()) {
698 auto *BarVar = mdconst::extract<GlobalVariable>(N->getOperand(0));
699 MCSymbol *BarSym = getSymbol(BarVar);
700 for (unsigned I = 1, E = N->getNumOperands(); I < E; ++I) {
701 auto *Func = mdconst::extract<Function>(N->getOperand(I));
702 Data.Uses.push_back({getSymbol(Func), BarSym});
703 }
704 }
705 }
706
707 for (auto &[Caller, Enc] : IndirectCalls) {
708 MCSymbol *CallerSym = getSymbol(Caller);
709 Data.IndirectCalls.push_back({CallerSym, Enc});
710 }
711
713}
714
716 // Pad with s_code_end to help tools and guard against instruction prefetch
717 // causing stale data in caches. Arguably this should be done by the linker,
718 // which is why this isn't done for Mesa.
719 // Don't do it if there is no code.
720 const MCSubtargetInfo &STI = *getGlobalSTI();
721 if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
725 if (TextSect->hasInstructions()) {
726 OutStreamer->switchSection(TextSect);
728 }
729 }
730
731 // Emit the unified .amdgpu.info section (per-function resources, call graph,
732 // LDS/named-barrier use edges, indirect calls, and address-taken type IDs).
733 emitAMDGPUInfo(M);
734
735 // Assign expressions which can only be resolved when all other functions are
736 // known.
737 RI.finalize(OutContext);
738
739 // Switch section and emit all GPR maximums within the processed module.
740 OutStreamer->pushSection();
741 MCSectionELF *MaxGPRSection =
742 OutContext.getELFSection(".AMDGPU.gpr_maximums", ELF::SHT_PROGBITS, 0);
743 OutStreamer->switchSection(MaxGPRSection);
745 RI.getMaxVGPRSymbol(OutContext), RI.getMaxAGPRSymbol(OutContext),
746 RI.getMaxSGPRSymbol(OutContext), RI.getMaxNamedBarrierSymbol(OutContext));
747 OutStreamer->popSection();
748
749 // In the object-linking pipeline per-function resource MCExprs reference
750 // external callee symbols that cannot be evaluated here, so cross-TU limit
751 // checks would silently no-op for every non-leaf function. Defer resource
752 // sanity checking to the linker, which re-validates against the aggregated
753 // call graph in the combined .amdgpu.info metadata.
755 for (Function &F : M.functions())
756 validateMCResourceInfo(F);
757 }
758
759 RI.reset();
760
762}
763
764SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
766 raw_svector_ostream OSS(Str);
767 auto &Streamer = getTargetStreamer()->getStreamer();
768 auto &Context = Streamer.getContext();
769 const MCExpr *New = foldAMDGPUMCExpr(Value, Context);
770 printAMDGPUMCExpr(New, OSS, &MAI);
771 return Str;
772}
773
774// Print comments that apply to both callable functions and entry points.
775void AMDGPUAsmPrinter::emitCommonFunctionComments(
776 const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,
777 const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
778 const AMDGPUMachineFunctionInfo *MFI) {
779 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
780 OutStreamer->emitRawComment(" TotalNumSgprs: " + getMCExprStr(NumSGPR),
781 false);
782 OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false);
783 if (NumAGPR && TotalNumVGPR) {
784 OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false);
785 OutStreamer->emitRawComment(" TotalNumVgprs: " + getMCExprStr(TotalNumVGPR),
786 false);
787 }
788 OutStreamer->emitRawComment(" ScratchSize: " + getMCExprStr(ScratchSize),
789 false);
790 OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
791 false);
792}
793
794const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
795 const MachineFunction &MF) const {
796 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
797 MCContext &Ctx = MF.getContext();
798 uint16_t KernelCodeProperties = 0;
799 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
800
801 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
802 KernelCodeProperties |=
803 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
804 }
805 if (UserSGPRInfo.hasDispatchPtr()) {
806 KernelCodeProperties |=
807 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
808 }
809 if (UserSGPRInfo.hasQueuePtr()) {
810 KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
811 }
812 if (UserSGPRInfo.hasKernargSegmentPtr()) {
813 KernelCodeProperties |=
814 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
815 }
816 if (UserSGPRInfo.hasDispatchID()) {
817 KernelCodeProperties |=
818 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
819 }
820 if (UserSGPRInfo.hasFlatScratchInit()) {
821 KernelCodeProperties |=
822 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
823 }
824 if (UserSGPRInfo.hasPrivateSegmentSize()) {
825 KernelCodeProperties |=
826 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
827 }
828 if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
829 KernelCodeProperties |=
830 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
831 }
832
833 // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
834 // un-evaluatable at this point so it cannot be conditionally checked here.
835 // Instead, we'll directly shift the possibly unknown MCExpr into its place
836 // and bitwise-or it into KernelCodeProperties.
837 const MCExpr *KernelCodePropExpr =
838 MCConstantExpr::create(KernelCodeProperties, Ctx);
839 const MCExpr *OrValue = MCConstantExpr::create(
840 amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx);
841 OrValue = MCBinaryExpr::createShl(CurrentProgramInfo.DynamicCallStack,
842 OrValue, Ctx);
843 KernelCodePropExpr = MCBinaryExpr::createOr(KernelCodePropExpr, OrValue, Ctx);
844
845 return KernelCodePropExpr;
846}
847
848MCKernelDescriptor
849AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
850 const SIProgramInfo &PI) const {
851 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
852 const Function &F = MF.getFunction();
853 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
854 MCContext &Ctx = MF.getContext();
855
856 MCKernelDescriptor KernelDescriptor;
857
858 KernelDescriptor.group_segment_fixed_size =
860 KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
861
862 Align MaxKernArgAlign;
863 KernelDescriptor.kernarg_size = MCConstantExpr::create(
864 STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);
865
866 KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx);
867 KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(STM, Ctx);
868 KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
869
870 int64_t PGM_Rsrc3 = 1;
871 bool EvaluatableRsrc3 =
872 CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGM_Rsrc3);
873 (void)PGM_Rsrc3;
874 (void)EvaluatableRsrc3;
876 STM.hasGFX90AInsts() || STM.hasGFX1250Insts() || !EvaluatableRsrc3 ||
877 static_cast<uint64_t>(PGM_Rsrc3) == 0);
878 KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3;
879
880 KernelDescriptor.kernarg_preload = MCConstantExpr::create(
881 AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
882 Ctx);
883
884 return KernelDescriptor;
885}
886
888 // Init target streamer lazily on the first function so that previous passes
889 // can set metadata.
891 initTargetStreamer(*MF.getFunction().getParent());
892
893 ResourceUsage =
895 CurrentProgramInfo.reset(MF);
896
897 const AMDGPUMachineFunctionInfo *MFI =
898 MF.getInfo<AMDGPUMachineFunctionInfo>();
899 MCContext &Ctx = MF.getContext();
900
901 // The starting address of all shader programs must be 256 bytes aligned.
902 // Regular functions just need the basic required instruction alignment.
903 MF.ensureAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
904
906
907 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
909 // FIXME: This should be an explicit check for Mesa.
910 if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
911 MCSectionELF *ConfigSection =
912 Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
913 OutStreamer->switchSection(ConfigSection);
914 }
915
916 RI.gatherResourceInfo(MF, *ResourceUsage, OutContext);
917
920 *ResourceUsage;
921 FunctionInfos.push_back(
922 {/*NumSGPR=*/static_cast<uint32_t>(RU.NumExplicitSGPR),
923 /*NumArchVGPR=*/static_cast<uint32_t>(RU.NumVGPR),
924 /*NumAccVGPR=*/static_cast<uint32_t>(RU.NumAGPR),
925 /*PrivateSegmentSize=*/static_cast<uint32_t>(RU.PrivateSegmentSize),
926 /*UsesVCC=*/RU.UsesVCC,
927 /*UsesFlatScratch=*/RU.UsesFlatScratch,
928 /*HasDynStack=*/RU.HasDynamicallySizedStack,
929 /*Sym=*/getSymbol(&MF.getFunction())});
930 }
931
932 if (MFI->isModuleEntryFunction()) {
933 getSIProgramInfo(CurrentProgramInfo, MF);
934 }
935
936 if (STM.isAmdPalOS()) {
937 if (MFI->isEntryFunction())
938 EmitPALMetadata(MF, CurrentProgramInfo);
939 else if (MFI->isModuleEntryFunction())
940 emitPALFunctionMetadata(MF);
941 } else if (!STM.isAmdHsaOS()) {
942 EmitProgramInfoSI(MF, CurrentProgramInfo);
943 }
944
945 DumpCodeInstEmitter = nullptr;
946 if (STM.dumpCode()) {
947 // For -dumpcode, get the assembler out of the streamer. This only works
948 // with -filetype=obj.
949 MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
950 if (Assembler)
951 DumpCodeInstEmitter = Assembler->getEmitterPtr();
952 }
953
954 DisasmLines.clear();
955 HexLines.clear();
957
959
960 emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
961 STM.hasMAIInsts());
962
963 {
966 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext),
967 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR, OutContext),
968 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumSGPR, OutContext),
969 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumNamedBarrier,
970 OutContext),
971 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
972 OutContext),
973 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesVCC, OutContext),
974 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesFlatScratch,
975 OutContext),
976 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasDynSizedStack,
977 OutContext),
978 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasRecursion,
979 OutContext),
980 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasIndirectCall,
981 OutContext));
982 }
983
984 // Emit _dvgpr$ symbol when appropriate.
985 emitDVgprSymbol(MF);
986
987 if (isVerbose()) {
988 MCSectionELF *CommentSection =
989 Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
990 OutStreamer->switchSection(CommentSection);
991
992 if (!MFI->isEntryFunction()) {
994 OutStreamer->emitRawComment(" Function info:", false);
995
996 emitCommonFunctionComments(
997 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext)
998 ->getVariableValue(),
999 STM.hasMAIInsts() ? RI.getSymbol(CurrentFnSym->getName(),
1000 RIK::RIK_NumAGPR, OutContext)
1001 ->getVariableValue()
1002 : nullptr,
1003 RI.createTotalNumVGPRs(MF, Ctx),
1004 RI.createTotalNumSGPRs(
1005 MF,
1006 MF.getSubtarget<GCNSubtarget>().getTargetID().isXnackOnOrAny(),
1007 Ctx),
1008 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
1009 OutContext)
1010 ->getVariableValue(),
1011 CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
1012 return false;
1013 }
1014
1015 OutStreamer->emitRawComment(" Kernel info:", false);
1016 emitCommonFunctionComments(
1017 CurrentProgramInfo.NumArchVGPR,
1018 STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
1019 CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
1020 CurrentProgramInfo.ScratchSize,
1021 CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
1022
1023 OutStreamer->emitRawComment(
1024 " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
1025 OutStreamer->emitRawComment(
1026 " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
1027 OutStreamer->emitRawComment(
1028 " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
1029 " bytes/workgroup (compile time only)",
1030 false);
1031
1032 OutStreamer->emitRawComment(
1033 " SGPRBlocks: " + getMCExprStr(CurrentProgramInfo.SGPRBlocks), false);
1034
1035 OutStreamer->emitRawComment(
1036 " VGPRBlocks: " + getMCExprStr(CurrentProgramInfo.VGPRBlocks), false);
1037
1038 OutStreamer->emitRawComment(
1039 " NumSGPRsForWavesPerEU: " +
1040 getMCExprStr(CurrentProgramInfo.NumSGPRsForWavesPerEU),
1041 false);
1042 OutStreamer->emitRawComment(
1043 " NumVGPRsForWavesPerEU: " +
1044 getMCExprStr(CurrentProgramInfo.NumVGPRsForWavesPerEU),
1045 false);
1046
1047 if (STM.hasGFX90AInsts()) {
1048 const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd(
1049 CurrentProgramInfo.AccumOffset, MCConstantExpr::create(1, Ctx), Ctx);
1050 AdjustedAccum = MCBinaryExpr::createMul(
1051 AdjustedAccum, MCConstantExpr::create(4, Ctx), Ctx);
1052 OutStreamer->emitRawComment(
1053 " AccumOffset: " + getMCExprStr(AdjustedAccum), false);
1054 }
1055
1056 if (STM.hasGFX1250Insts())
1057 OutStreamer->emitRawComment(
1058 " NamedBarCnt: " + getMCExprStr(CurrentProgramInfo.NamedBarCnt),
1059 false);
1060
1061 OutStreamer->emitRawComment(
1062 " Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false);
1063
1064 OutStreamer->emitRawComment(
1065 " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
1066
1067 OutStreamer->emitRawComment(
1068 " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
1069 getMCExprStr(CurrentProgramInfo.ScratchEnable),
1070 false);
1071 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
1072 Twine(CurrentProgramInfo.UserSGPR),
1073 false);
1074 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
1075 Twine(CurrentProgramInfo.TrapHandlerEnable),
1076 false);
1077 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
1078 Twine(CurrentProgramInfo.TGIdXEnable),
1079 false);
1080 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
1081 Twine(CurrentProgramInfo.TGIdYEnable),
1082 false);
1083 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
1084 Twine(CurrentProgramInfo.TGIdZEnable),
1085 false);
1086 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
1087 Twine(CurrentProgramInfo.TIdIGCompCount),
1088 false);
1089
1090 [[maybe_unused]] int64_t PGMRSrc3;
1092 STM.hasGFX90AInsts() || STM.hasGFX1250Insts() ||
1093 (CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGMRSrc3) &&
1094 static_cast<uint64_t>(PGMRSrc3) == 0));
1095 if (STM.hasGFX90AInsts()) {
1096 OutStreamer->emitRawComment(
1097 " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
1098 getMCExprStr(MCKernelDescriptor::bits_get(
1099 CurrentProgramInfo.ComputePGMRSrc3,
1100 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
1101 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
1102 false);
1103 OutStreamer->emitRawComment(
1104 " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
1105 getMCExprStr(MCKernelDescriptor::bits_get(
1106 CurrentProgramInfo.ComputePGMRSrc3,
1107 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
1108 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
1109 false);
1110 }
1111 }
1112
1113 if (DumpCodeInstEmitter) {
1114
1115 OutStreamer->switchSection(
1116 Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));
1117
1118 for (size_t i = 0; i < DisasmLines.size(); ++i) {
1119 std::string Comment = "\n";
1120 if (!HexLines[i].empty()) {
1121 Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
1122 Comment += " ; " + HexLines[i] + "\n";
1123 }
1124
1125 OutStreamer->emitBytes(StringRef(DisasmLines[i]));
1126 OutStreamer->emitBytes(StringRef(Comment));
1127 }
1128 }
1129
1130 return false;
1131}
1132
1133 // When appropriate, add a _dvgpr$ symbol, with the value of the function
1134 // symbol, plus an offset encoding one less than the number of VGPR blocks used
1135 // by the function in bits 5..3 of the symbol value. A "VGPR block" can be
1136 // either 16 VGPRs (for a max of 128), or 32 VGPRs (for a max of 256). This is
1137 // used by a front-end to have functions that are chained rather than called,
1138 // and a dispatcher that dynamically resizes the VGPR count before dispatching
1139 // to a function.
//
// The block count is derived from CurrentProgramInfo.NumVGPRsForWavesPerEU. If
// that expression already folds to an absolute value, the encoding is emitted
// as a constant after a range check against IsaInfo::MaxDynamicVGPRBlocks;
// otherwise an equivalent symbolic MCExpr is built instead.
//
// NOTE(review): this listing skips embedded line numbers 1141, 1143, 1159,
// 1162 and 1180, so the definition of MFI, the tail of the guarding condition,
// the opening of the diagnostic call and part of the shift expression are not
// visible here. Confirm against the complete file before editing the logic.
1140 void AMDGPUAsmPrinter::emitDVgprSymbol(MachineFunction &MF) {
// NOTE(review): the second operand of this '&&' is on a line missing from the
// listing.
1142 if (MFI.isDynamicVGPREnabled() &&
1144 MCContext &Ctx = MF.getContext();
1145 unsigned BlockSize = MFI.getDynamicVGPRBlockSize();
1146
1147 const MCExpr *EncodedBlocks;
1148 MCValue NumVGPRs;
// Constant path: only taken when the VGPR count evaluates to an absolute value.
1149 if (CurrentProgramInfo.NumVGPRsForWavesPerEU->evaluateAsRelocatable(
1150 NumVGPRs, nullptr) &&
1151 NumVGPRs.isAbsolute()) {
1152
1153 // Calculate number of VGPR blocks.
1154 // Treat 0 VGPRs as 1 VGPR to avoid underflowing.
1155 unsigned NumBlocks =
1156 divideCeil(std::max(unsigned(NumVGPRs.getConstant()), 1U), BlockSize);
1157
// Too many blocks: a diagnostic is built from the message below and the
// function returns without emitting the symbol.
1158 if (NumBlocks > AMDGPU::IsaInfo::MaxDynamicVGPRBlocks) {
1160 {}, "DVGPR block count " + Twine(NumBlocks) +
1161 " exceeds maximum of " +
1163 " for __dvgpr$ symbol for '" +
1164 Twine(CurrentFnSym->getName()) + "'");
1165 return;
1166 }
// (NumBlocks - 1) is placed starting at bit 3 of the symbol value.
1167 unsigned EncodedNumBlocks = (NumBlocks - 1) << 3;
1168 EncodedBlocks = MCConstantExpr::create(EncodedNumBlocks, Ctx);
1169 } else {
1170 // Value not yet available so build a symbolic MCExpr:
1171 // ((alignTo(max(NumVGPRs, 1), BlockSize) / BlockSize - 1) << 3
1172 const MCExpr *One = MCConstantExpr::create(1, Ctx);
1173 const MCExpr *BlockSizeConst = MCConstantExpr::create(BlockSize, Ctx);
1174 const MCExpr *MaxVGPRs = AMDGPUMCExpr::createMax(
1175 {CurrentProgramInfo.NumVGPRsForWavesPerEU, One}, Ctx);
1176 const MCExpr *NumBlocks = MCBinaryExpr::createDiv(
1177 AMDGPUMCExpr::createAlignTo(MaxVGPRs, BlockSizeConst, Ctx),
1178 BlockSizeConst, Ctx);
// NOTE(review): the first operand of this expression (listing line 1180) is
// missing; per the formula above it presumably subtracts one and shifts by 3.
1179 EncodedBlocks =
1181 MCConstantExpr::create(3, Ctx), Ctx);
1182 }
1183
1184 // Add to function symbol to create _dvgpr$ symbol.
1185 const MCExpr *DVgprFuncVal = MCBinaryExpr::createAdd(
1186 MCSymbolRefExpr::create(CurrentFnSym, Ctx), EncodedBlocks, Ctx);
1187 MCSymbol *DVgprFuncSym =
1188 Ctx.getOrCreateSymbol(Twine("_dvgpr$") + CurrentFnSym->getName());
1189 OutStreamer->emitAssignment(DVgprFuncSym, DVgprFuncVal);
// Give the new symbol the same visibility and linkage as the function itself.
1190 emitVisibility(DVgprFuncSym, MF.getFunction().getVisibility());
1191 emitLinkage(&MF.getFunction(), DVgprFuncSym);
1192 }
1193}
1194
1195// TODO: Fold this into emitFunctionBodyStart.
1196void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
1197 // In the beginning all features are either 'Any' or 'NotSupported',
1198 // depending on global target features. This will cover empty modules.
1200 getGlobalSTI()->getFeatureString());
1201
1202 // If module is empty, we are done.
1203 if (M.empty())
1204 return;
1205
1206 // If module is not empty, need to find first 'Off' or 'On' feature
1207 // setting per feature from functions in module.
1208 for (auto &F : M) {
1209 auto &TSTargetID = getTargetStreamer()->getTargetID();
1210 if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
1211 (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
1212 break;
1213
1214 const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
1215 const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
1216 if (TSTargetID->isXnackSupported())
1217 if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
1218 TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
1219 if (TSTargetID->isSramEccSupported())
1220 if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
1221 TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
1222 }
1223}
1224
1225// AccumOffset computed for the MCExpr equivalent of:
1226// alignTo(std::max(1, NumVGPR), 4) / 4 - 1;
1227static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) {
1228 const MCExpr *ConstFour = MCConstantExpr::create(4, Ctx);
1229 const MCExpr *ConstOne = MCConstantExpr::create(1, Ctx);
1230
1231 // Can't be lower than 1 for subsequent alignTo.
1232 const MCExpr *MaximumTaken =
1233 AMDGPUMCExpr::createMax({ConstOne, NumVGPR}, Ctx);
1234
1235 // Practically, it's computing divideCeil(MaximumTaken, 4).
1236 const MCExpr *DivCeil = MCBinaryExpr::createDiv(
1237 AMDGPUMCExpr::createAlignTo(MaximumTaken, ConstFour, Ctx), ConstFour,
1238 Ctx);
1239
1240 return MCBinaryExpr::createSub(DivCeil, ConstOne, Ctx);
1241}
1242
// Fills \p ProgInfo with the program settings for \p MF. Register counts,
// scratch size and usage flags are taken as references to the function's
// resource-info symbols (so they stay symbolic MCExprs); block counts and
// per-wave limits are derived from them. Mode bits, LDS sizes, work-group /
// work-item ID enables and COMPUTE_PGM_RSRC3 fields come from the subtarget
// and the SIMachineFunctionInfo. Resource-limit and occupancy problems are
// reported through the LLVMContext's diagnose().
//
// NOTE(review): this listing skips a number of embedded line numbers (1267,
// 1270, 1298, 1324, 1332, 1335, 1352, 1363, 1371, 1373, 1380, 1387, 1404,
// 1408, 1411, 1461, 1486-1487, 1523), so several statements below appear
// truncated. Confirm against the complete file before changing any logic.
1243 void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
1244 const MachineFunction &MF) {
1245 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1246 MCContext &Ctx = MF.getContext();
1247
// Wraps an integer constant in an MCConstantExpr.
1248 auto CreateExpr = [&Ctx](int64_t Value) {
1249 return MCConstantExpr::create(Value, Ctx);
1250 };
1251
// Tries to fold Value to an absolute integer; Res is written only on success.
1252 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
1253 int64_t Val;
1254 if (Value->evaluateAsAbsolute(Val)) {
1255 Res = Val;
1256 return true;
1257 }
1258 return false;
1259 };
1260
// Returns a reference to this function's resource-info symbol of kind RIK.
1261 auto GetSymRefExpr =
1262 [&](MCResourceInfo::ResourceInfoKind RIK) -> const MCExpr * {
1263 MCSymbol *Sym = RI.getSymbol(CurrentFnSym->getName(), RIK, OutContext);
1264 return MCSymbolRefExpr::create(Sym, Ctx);
1265 };
1266
1268 ProgInfo.NumArchVGPR = GetSymRefExpr(RIK::RIK_NumVGPR);
1269 ProgInfo.NumAccVGPR = GetSymRefExpr(RIK::RIK_NumAGPR);
// NOTE(review): the start of this statement (listing line 1270) is missing;
// presumably it computes ProgInfo.NumVGPR from the two counts above.
1271 ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
1272
1273 ProgInfo.AccumOffset = computeAccumOffset(ProgInfo.NumArchVGPR, Ctx);
1274 ProgInfo.TgSplit = STM.isTgSplitEnabled();
1275 ProgInfo.NumSGPR = GetSymRefExpr(RIK::RIK_NumSGPR);
1276 ProgInfo.ScratchSize = GetSymRefExpr(RIK::RIK_PrivateSegSize);
1277 ProgInfo.VCCUsed = GetSymRefExpr(RIK::RIK_UsesVCC);
1278 ProgInfo.FlatUsed = GetSymRefExpr(RIK::RIK_UsesFlatScratch);
// The call stack is treated as dynamic if the function has a dynamically
// sized stack or recursion.
1279 ProgInfo.DynamicCallStack =
1280 MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack),
1281 GetSymRefExpr(RIK::RIK_HasRecursion), Ctx);
1282
// NamedBarCnt = alignTo(NumNamedBarrier, 4) / 4.
1283 const MCExpr *BarBlkConst = MCConstantExpr::create(4, Ctx);
1284 const MCExpr *AlignToBlk = AMDGPUMCExpr::createAlignTo(
1285 GetSymRefExpr(RIK::RIK_NumNamedBarrier), BarBlkConst, Ctx);
1286 ProgInfo.NamedBarCnt = MCBinaryExpr::createDiv(AlignToBlk, BarBlkConst, Ctx);
1287
1288 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1289
1290 // The calculations related to SGPR/VGPR blocks are
1291 // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
1292 // unified.
1293 const MCExpr *ExtraSGPRs = AMDGPUMCExpr::createExtraSGPRs(
1294 ProgInfo.VCCUsed, ProgInfo.FlatUsed,
1295 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);
1296
1297 // Check the addressable register limit before we add ExtraSGPRs.
// NOTE(review): the opening of this 'if' (listing line 1298) is missing.
1299 !STM.hasSGPRInitBug()) {
1300 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
1301 uint64_t NumSgpr;
1302 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
1303 NumSgpr > MaxAddressableNumSGPRs) {
1304 // This can happen due to a compiler bug or when using inline asm.
1305 LLVMContext &Ctx = MF.getFunction().getContext();
1306 Ctx.diagnose(DiagnosticInfoResourceLimit(
1307 MF.getFunction(), "addressable scalar registers", NumSgpr,
1308 MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit));
// After diagnosing, clamp the count to one below the addressable maximum.
1309 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1);
1310 }
1311 }
1312
1313 // Account for extra SGPRs and VGPRs reserved for debugger use.
1314 ProgInfo.NumSGPR = MCBinaryExpr::createAdd(ProgInfo.NumSGPR, ExtraSGPRs, Ctx);
1315
1316 const Function &F = MF.getFunction();
1317
1318 // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
1319 // dispatch registers as function args.
1320 unsigned WaveDispatchNumSGPR = MFI->getNumWaveDispatchSGPRs(),
1321 WaveDispatchNumVGPR = MFI->getNumWaveDispatchVGPRs();
1322
1323 if (WaveDispatchNumSGPR) {
// NOTE(review): the assignment target / createMax call opener (listing line
// 1324) is missing; the visible operands are the current NumSGPR and
// WaveDispatchNumSGPR + ExtraSGPRs.
1325 {ProgInfo.NumSGPR,
1326 MCBinaryExpr::createAdd(CreateExpr(WaveDispatchNumSGPR), ExtraSGPRs,
1327 Ctx)},
1328 Ctx);
1329 }
1330
1331 if (WaveDispatchNumVGPR) {
// NOTE(review): listing lines 1332 and 1335 are missing from the two
// statements below.
1333 {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
1334
1336 ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
1337 }
1338
1339 // Adjust number of registers used to meet default/requested minimum/maximum
1340 // number of waves per execution unit request.
1341 unsigned MaxWaves = MFI->getMaxWavesPerEU();
1342 ProgInfo.NumSGPRsForWavesPerEU =
1343 AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, CreateExpr(1ul),
1344 CreateExpr(STM.getMinNumSGPRs(MaxWaves))},
1345 Ctx);
1346 ProgInfo.NumVGPRsForWavesPerEU =
1347 AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul),
1348 CreateExpr(STM.getMinNumVGPRs(
1349 MaxWaves, MFI->getDynamicVGPRBlockSize()))},
1350 Ctx);
1351
// Second limit check, this time on the count that includes ExtraSGPRs.
// NOTE(review): the opening of this 'if' (listing line 1352) is missing.
1353 STM.hasSGPRInitBug()) {
1354 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
1355 uint64_t NumSgpr;
1356 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
1357 NumSgpr > MaxAddressableNumSGPRs) {
1358 // This can happen due to a compiler bug or when using inline asm to use
1359 // the registers which are usually reserved for vcc etc.
1360 LLVMContext &Ctx = MF.getFunction().getContext();
1361 Ctx.diagnose(DiagnosticInfoResourceLimit(
1362 MF.getFunction(), "scalar registers", NumSgpr, MaxAddressableNumSGPRs,
1364 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs);
1365 ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs);
1366 }
1367 }
1368
// NOTE(review): the right-hand sides of both assignments (listing lines 1371
// and 1373) are missing; presumably a fixed SGPR count for the init-bug
// workaround.
1369 if (STM.hasSGPRInitBug()) {
1370 ProgInfo.NumSGPR =
1372 ProgInfo.NumSGPRsForWavesPerEU =
1374 }
1375
// Diagnose user SGPR and LDS usage above what the subtarget allows.
1376 if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
1377 LLVMContext &Ctx = MF.getFunction().getContext();
1378 Ctx.diagnose(DiagnosticInfoResourceLimit(
1379 MF.getFunction(), "user SGPRs", MFI->getNumUserSGPRs(),
1381 }
1382
1383 if (MFI->getLDSSize() > STM.getAddressableLocalMemorySize()) {
1384 LLVMContext &Ctx = MF.getFunction().getContext();
1385 Ctx.diagnose(DiagnosticInfoResourceLimit(
1386 MF.getFunction(), "local memory", MFI->getLDSSize(),
1388 }
1389 // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
1390 // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
1391 auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,
1392 unsigned Granule) {
1393 const MCExpr *OneConst = CreateExpr(1ul);
1394 const MCExpr *GranuleConst = CreateExpr(Granule);
1395 const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax({NumGPR, OneConst}, Ctx);
1396 const MCExpr *AlignToGPR =
1397 AMDGPUMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
1398 const MCExpr *DivGPR =
1399 MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);
1400 const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
1401 return SubGPR;
1402 };
1403 // GFX10+ will always allocate 128 SGPRs and this field must be 0
// NOTE(review): the 'if' condition (listing line 1404) and the granule
// arguments (1408, 1411) are missing below.
1405 ProgInfo.SGPRBlocks = CreateExpr(0ul);
1406 } else {
1407 ProgInfo.SGPRBlocks = GetNumGPRBlocks(ProgInfo.NumSGPRsForWavesPerEU,
1409 }
1410 ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,
1412
1413 const SIModeRegisterDefaults Mode = MFI->getMode();
1414
1415 // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
1416 // register.
1417 ProgInfo.FloatMode = getFPMode(Mode);
1418
1419 ProgInfo.IEEEMode = Mode.IEEE;
1420
1421 // Make the clamp modifier return 0 on NaN input.
1422 ProgInfo.DX10Clamp = Mode.DX10Clamp;
1423
// Shift that converts the LDS byte size into LDSBlocks (used below), selected
// from the subtarget's LDS dword granularity.
1424 unsigned LDSAlignShift = 8;
1425 switch (getLdsDwGranularity(STM)) {
1426 case 512:
1427 case 320:
1428 LDSAlignShift = 11;
1429 break;
1430 case 128:
1431 LDSAlignShift = 9;
1432 break;
1433 case 64:
1434 LDSAlignShift = 8;
1435 break;
1436 default:
1437 llvm_unreachable("invald LDS block size");
1438 }
1439
1440 ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
1441 ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
1442
1443 ProgInfo.LDSSize = MFI->getLDSSize();
// Round the byte size up to a whole number of blocks.
1444 ProgInfo.LDSBlocks =
1445 alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
1446
1447 // The MCExpr equivalent of divideCeil.
1448 auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {
1449 const MCExpr *Ceil =
1450 AMDGPUMCExpr::createAlignTo(Numerator, Denominator, Ctx);
1451 return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx);
1452 };
1453
1454 // Scratch is allocated in 64-dword or 256-dword blocks.
1455 unsigned ScratchAlignShift =
1456 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
1457 // We need to program the hardware with the amount of scratch memory that
1458 // is used by the entire wave. ProgInfo.ScratchSize is the amount of
1459 // scratch memory used per thread.
// NOTE(review): the numerator's opening (listing line 1461) is missing;
// presumably ScratchSize multiplied by the wavefront size.
1460 ProgInfo.ScratchBlocks = DivideCeil(
1462 CreateExpr(STM.getWavefrontSize()), Ctx),
1463 CreateExpr(1ULL << ScratchAlignShift));
1464
// WgpMode is the inverse of CU mode on subtargets that support WGP.
1465 if (STM.supportsWGP()) {
1466 ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
1467 }
1468
1469 if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
1470 ProgInfo.MemOrdered = 1;
1471 ProgInfo.FwdProgress = !F.hasFnAttribute("amdgpu-no-fwd-progress");
1472 }
1473
1474 // 0 = X, 1 = XY, 2 = XYZ
1475 unsigned TIDIGCompCnt = 0;
1476 if (MFI->hasWorkItemIDZ())
1477 TIDIGCompCnt = 2;
1478 else if (MFI->hasWorkItemIDY())
1479 TIDIGCompCnt = 1;
1480
1481 // The private segment wave byte offset is the last of the system SGPRs. We
1482 // initially assumed it was allocated, and may have used it. It shouldn't harm
1483 // anything to disable it if we know the stack isn't used here. We may still
1484 // have emitted code reading it to initialize scratch, but if that's unused
1485 // reading garbage should be OK.
// NOTE(review): the start of this statement (listing lines 1486-1487) is
// missing; presumably it sets ProgInfo.ScratchEnable from a comparison
// against 0 combined with DynamicCallStack.
1488 MCConstantExpr::create(0, Ctx), Ctx),
1489 ProgInfo.DynamicCallStack, Ctx);
1490
1491 ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
1492 // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
1493 ProgInfo.TrapHandlerEnable = STM.isAmdHsaOS() ? 0 : STM.hasTrapHandler();
1494 ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
1495 ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
1496 ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
1497 ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
1498 ProgInfo.TIdIGCompCount = TIDIGCompCnt;
1499 ProgInfo.EXCPEnMSB = 0;
1500 // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
1501 ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
1502 ProgInfo.EXCPEnable = 0;
1503
// GFX90A: store ACCUM_OFFSET and TG_SPLIT in COMPUTE_PGM_RSRC3.
1504 if (STM.hasGFX90AInsts()) {
1505 ProgInfo.ComputePGMRSrc3 =
1506 setBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset,
1507 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
1508 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT, Ctx);
1509 ProgInfo.ComputePGMRSrc3 =
1510 setBits(ProgInfo.ComputePGMRSrc3, CreateExpr(ProgInfo.TgSplit),
1511 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
1512 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT, Ctx);
1513 }
1514
// GFX1250: store the named barrier count in COMPUTE_PGM_RSRC3.
1515 if (STM.hasGFX1250Insts())
1516 ProgInfo.ComputePGMRSrc3 =
1517 setBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt,
1518 amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT,
1519 amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT, Ctx);
1520
// NOTE(review): one argument line of this call (listing line 1523) is missing.
1521 ProgInfo.Occupancy = createOccupancy(
1522 STM.computeOccupancy(F, ProgInfo.LDSSize).second,
1524 MFI->getDynamicVGPRBlockSize(), STM, Ctx);
1525
// If the occupancy can be evaluated now and is below the minimum requested by
// the "amdgpu-waves-per-eu" attribute, report an optimization failure.
1526 const auto [MinWEU, MaxWEU] =
1527 AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
1528 uint64_t Occupancy;
1529 if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {
1530 DiagnosticInfoOptimizationFailure Diag(
1531 F, F.getSubprogram(),
1532 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
1533 "'" +
1534 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
1535 ", final occupancy is " + Twine(Occupancy));
1536 F.getContext().diagnose(Diag);
1537 }
1538}
1539
// Returns the resource register identifier for \p CallConv. The result is
// what EmitProgramInfoSI emits ahead of the register value.
//
// NOTE(review): the case labels and return statements (listing lines
// 1544-1557) are missing from this view; only the 'default' label, which falls
// through to the first case, is visible. Confirm the mapping against the
// complete file.
1540 static unsigned getRsrcReg(CallingConv::ID CallConv) {
1541 switch (CallConv) {
1542 default:
1543 [[fallthrough]];
1558 }
1559}
1560
// Writes the program info for \p MF to the output stream as a sequence of
// 32-bit values. Values held as MCExprs are emitted as plain integers when
// they fold to a constant and as expressions otherwise. Compute calling
// conventions emit COMPUTE_PGM_RSRC1/RSRC2 and the scratch wave size; other
// calling conventions emit the RSRC1 register id, the packed VGPR/SGPR block
// counts and the scratch wave size. Pixel shaders additionally emit extra LDS
// size and PS input enable/address, and every function ends with the spilled
// SGPR and VGPR counts.
//
// NOTE(review): listing lines 1572, 1585, 1590, 1594, 1622, 1642, 1647 and
// 1649 are missing from this view; judging by the visible
// "emitInt32(<register id>)" / value pairs they presumably emit the register
// identifiers that precede each value. Confirm against the complete file.
1561 void AMDGPUAsmPrinter::EmitProgramInfoSI(
1562 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo) {
1563 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1564 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1565 unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
1566 MCContext &Ctx = MF.getContext();
1567
1568 // (((Value) & Mask) << Shift)
1569 auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {
1570 const MCExpr *msk = MCConstantExpr::create(Mask, Ctx);
1571 const MCExpr *shft = MCConstantExpr::create(Shift, Ctx);
// NOTE(review): the start of the return expression (listing line 1572) is
// missing.
1573 shft, Ctx);
1574 };
1575
// Emits Value as a Size-byte integer if it folds to a constant, otherwise as
// an expression.
1576 auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) {
1577 int64_t Val;
1578 if (Value->evaluateAsAbsolute(Val))
1579 OutStreamer->emitIntValue(static_cast<uint64_t>(Val), Size);
1580 else
1581 OutStreamer->emitValue(Value, Size);
1582 };
1583
1584 if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
1586
1587 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx),
1588 /*Size=*/4);
1589
1591 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(STM, Ctx),
1592 /*Size=*/4);
1593
1595
1596 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1597 // appropriate generation.
1598 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1599 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1600 /*Mask=*/0x3FFFF, /*Shift=*/12),
1601 /*Size=*/4);
1602 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1603 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1604 /*Mask=*/0x7FFF, /*Shift=*/12),
1605 /*Size=*/4);
1606 } else {
1607 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1608 /*Mask=*/0x1FFF, /*Shift=*/12),
1609 /*Size=*/4);
1610 }
1611
1612 // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
1613 // 0" comment but I don't see a corresponding field in the register spec.
1614 } else {
1615 OutStreamer->emitInt32(RsrcReg);
1616
// VGPR block count is masked to 6 bits at bit 0, SGPR block count to 4 bits
// at bit 6.
1617 const MCExpr *GPRBlocks = MCBinaryExpr::createOr(
1618 SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0),
1619 SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6),
1620 MF.getContext());
1621 EmitResolvedOrExpr(GPRBlocks, /*Size=*/4);
1623
1624 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1625 // appropriate generation.
1626 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1627 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1628 /*Mask=*/0x3FFFF, /*Shift=*/12),
1629 /*Size=*/4);
1630 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1631 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1632 /*Mask=*/0x7FFF, /*Shift=*/12),
1633 /*Size=*/4);
1634 } else {
1635 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1636 /*Mask=*/0x1FFF, /*Shift=*/12),
1637 /*Size=*/4);
1638 }
1639 }
1640
1641 if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
// On GFX11+ the extra LDS size is half the LDS block count, rounded up.
1643 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1644 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1645 : CurrentProgramInfo.LDSBlocks;
1646 OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1648 OutStreamer->emitInt32(MFI->getPSInputEnable());
1650 OutStreamer->emitInt32(MFI->getPSInputAddr());
1651 }
1652
// Spill counts are emitted for every calling convention.
1653 OutStreamer->emitInt32(R_SPILLED_SGPRS);
1654 OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
1655 OutStreamer->emitInt32(R_SPILLED_VGPRS);
1656 OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
1657}
1658
1659// Helper function to add common PAL Metadata 3.0+
1661 const SIProgramInfo &CurrentProgramInfo,
1662 CallingConv::ID CC, const GCNSubtarget &ST,
1663 unsigned DynamicVGPRBlockSize) {
1664 if (ST.hasFeature(AMDGPU::FeatureDX10ClampAndIEEEMode))
1665 MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
1666
1667 MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
1668 MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
1669 MD->setHwStage(CC, ".forward_progress", (bool)CurrentProgramInfo.FwdProgress);
1670
1671 if (AMDGPU::isCompute(CC)) {
1672 MD->setHwStage(CC, ".trap_present",
1673 (bool)CurrentProgramInfo.TrapHandlerEnable);
1674 MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
1675
1676 if (DynamicVGPRBlockSize != 0)
1677 MD->setComputeRegisters(".dynamic_vgpr_en", true);
1678 }
1679
1681 CC, ".lds_size",
1682 (unsigned)(CurrentProgramInfo.LdsSize * getLdsDwGranularity(ST) *
1683 sizeof(uint32_t)));
1684}
1685
1686 // This is the equivalent of EmitProgramInfoSI above, but for when the OS type
1687 // is AMDPAL. It stores each compute/SPI register setting and other PAL
1688 // metadata items into the PALMD::Metadata, combining with any provided by the
1689 // frontend as LLVM metadata. Once all functions are written, the PAL metadata
1690 // is then written as a single block in the .note section.
//
// PAL metadata major versions below 3 receive raw RSRC1/RSRC2 register values;
// version 3 and above receive named hardware-stage / graphics-register fields.
//
// NOTE(review): listing lines 1705, 1707, 1731 and 1748 are missing from this
// view, so the dynamic-VGPR condition, the saved-count value, the last
// argument to EmitPALMetadataCommon and the argument to the pixel-shader
// setRsrc2 call are not visible. Confirm against the complete file.
1691 void AMDGPUAsmPrinter::EmitPALMetadata(
1692 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo) {
1693 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1694 auto CC = MF.getFunction().getCallingConv();
1695 auto *MD = getTargetStreamer()->getPALMetadata();
1696 auto &Ctx = MF.getContext();
1697
1698 MD->setEntryPoint(CC, MF.getFunction().getName());
1699 MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx);
1700
1701 // For targets that support dynamic VGPRs, set the number of saved dynamic
1702 // VGPRs (if any) in the PAL metadata.
1703 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1704 if (MFI->isDynamicVGPREnabled() &&
1706 MD->setHwStage(CC, ".dynamic_vgpr_saved_count",
1708
1709 // Only set AGPRs for supported devices
1710 if (STM.hasMAIInsts()) {
1711 MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
1712 }
1713
1714 MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx);
1715 if (MD->getPALMajorVersion() < 3) {
1716 MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM, Ctx), Ctx);
1717 if (AMDGPU::isCompute(CC)) {
1718 MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(STM, Ctx), Ctx);
1719 } else {
// Non-compute: RSRC2 only carries SCRATCH_EN, set when ScratchBlocks > 0.
1720 const MCExpr *HasScratchBlocks =
1721 MCBinaryExpr::createGT(CurrentProgramInfo.ScratchBlocks,
1722 MCConstantExpr::create(0, Ctx), Ctx);
1723 auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN);
1724 MD->setRsrc2(CC, maskShiftSet(HasScratchBlocks, Mask, Shift, Ctx), Ctx);
1725 }
1726 } else {
1727 MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
1728 MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,
1729 CurrentProgramInfo.ScratchEnable);
1730 EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM,
1732 }
1733
1734 // ScratchSize is in bytes, 16 aligned.
1735 MD->setScratchSize(
1736 CC,
1737 AMDGPUMCExpr::createAlignTo(CurrentProgramInfo.ScratchSize,
1738 MCConstantExpr::create(16, Ctx), Ctx),
1739 Ctx);
1740
1741 if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
// On GFX11+ the extra LDS size is half the LDS block count, rounded up.
1742 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1743 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1744 : CurrentProgramInfo.LDSBlocks;
1745 if (MD->getPALMajorVersion() < 3) {
1746 MD->setRsrc2(
1747 CC,
1749 Ctx);
1750 MD->setSpiPsInputEna(MFI->getPSInputEnable());
1751 MD->setSpiPsInputAddr(MFI->getPSInputAddr());
1752 } else {
1753 // Graphics registers
1754 const unsigned ExtraLdsDwGranularity =
1755 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
1756 MD->setGraphicsRegisters(
1757 ".ps_extra_lds_size",
1758 (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));
1759
1760 // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
// The array index of each field name is the bit position tested in the
// PS input enable / address masks below.
1761 static StringLiteral const PsInputFields[] = {
1762 ".persp_sample_ena", ".persp_center_ena",
1763 ".persp_centroid_ena", ".persp_pull_model_ena",
1764 ".linear_sample_ena", ".linear_center_ena",
1765 ".linear_centroid_ena", ".line_stipple_tex_ena",
1766 ".pos_x_float_ena", ".pos_y_float_ena",
1767 ".pos_z_float_ena", ".pos_w_float_ena",
1768 ".front_face_ena", ".ancillary_ena",
1769 ".sample_coverage_ena", ".pos_fixed_pt_ena"};
1770 unsigned PSInputEna = MFI->getPSInputEnable();
1771 unsigned PSInputAddr = MFI->getPSInputAddr();
1772 for (auto [Idx, Field] : enumerate(PsInputFields)) {
1773 MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
1774 (bool)((PSInputEna >> Idx) & 1));
1775 MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
1776 (bool)((PSInputAddr >> Idx) & 1));
1777 }
1778 }
1779 }
1780
1781 // For version 3 and above the wave front size is already set in the metadata
1782 if (MD->getPALMajorVersion() < 3 && STM.isWave32())
1783 MD->setWave32(MF.getFunction().getCallingConv());
1784}
1785
1786void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1787 auto *MD = getTargetStreamer()->getPALMetadata();
1788 const MachineFrameInfo &MFI = MF.getFrameInfo();
1789 StringRef FnName = MF.getFunction().getName();
1790 MD->setFunctionScratchSize(FnName, MFI.getStackSize());
1791 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1792 MCContext &Ctx = MF.getContext();
1793
1794 if (MD->getPALMajorVersion() < 3) {
1795 // Set compute registers
1796 MD->setRsrc1(
1798 CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST, Ctx), Ctx);
1799 MD->setRsrc2(CallingConv::AMDGPU_CS,
1800 CurrentProgramInfo.getComputePGMRSrc2(ST, Ctx), Ctx);
1801 } else {
1803 MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST,
1804 MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize());
1805 }
1806
1807 // Set optional info
1808 MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
1809 MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1810 MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
1811}
1812
1813// This is supposed to be log2(Size)
1815 switch (Size) {
1816 case 4:
1817 return AMD_ELEMENT_4_BYTES;
1818 case 8:
1819 return AMD_ELEMENT_8_BYTES;
1820 case 16:
1821 return AMD_ELEMENT_16_BYTES;
1822 default:
1823 llvm_unreachable("invalid private_element_size");
1824 }
1825}
1826
// Fills \p Out for the kernel \p MF from \p CurrentProgramInfo, the subtarget
// and the function's user-SGPR usage. Asserts that the function uses the
// AMDGPU_KERNEL or SPIR_KERNEL calling convention. After resetting Out to its
// defaults, it sets the register counts, private and group segment sizes, the
// kernarg segment size and the kernarg alignment.
//
// NOTE(review): listing lines 1840, 1842, 1844, 1848, 1853, 1857, 1860, 1863,
// 1866, 1869, 1872 and 1875 are missing from this view, so the assignment
// targets for the RSRC1/RSRC2 values, the element-size field and the body of
// every user-SGPR 'if' below are not visible. Confirm against the complete
// file.
1827 void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
1828 const SIProgramInfo &CurrentProgramInfo,
1829 const MachineFunction &MF) const {
1830 const Function &F = MF.getFunction();
1831 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1832 F.getCallingConv() == CallingConv::SPIR_KERNEL);
1833
1834 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1835 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1836 MCContext &Ctx = MF.getContext();
1837
1838 Out.initDefault(STM, Ctx, /*InitMCExpr=*/false);
1839
1841 CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx);
1843 CurrentProgramInfo.getComputePGMRSrc2(STM, Ctx);
1845
1846 Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack;
1847
1849 getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
1850
// One check per user SGPR kind; the statements they guard are on lines
// missing from this listing.
1851 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
1852 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
1854 }
1855
1856 if (UserSGPRInfo.hasDispatchPtr())
1858
1859 if (UserSGPRInfo.hasQueuePtr())
1861
1862 if (UserSGPRInfo.hasKernargSegmentPtr())
1864
1865 if (UserSGPRInfo.hasDispatchID())
1867
1868 if (UserSGPRInfo.hasFlatScratchInit())
1870
1871 if (UserSGPRInfo.hasPrivateSegmentSize())
1873
1874 if (STM.isXNACKEnabled())
1876
// MaxKernArgAlign is filled in by getKernArgSegmentSize and used below.
1877 Align MaxKernArgAlign;
1878 Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
1879 Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1880 Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1881 Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1882 Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1883
1884 // kernarg_segment_alignment is specified as log of the alignment.
1885 // The minimum alignment is 16.
1886 // FIXME: The metadata treats the minimum as 4?
1887 Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
1888}
1889
1891 const char *ExtraCode, raw_ostream &O) {
 // Returns false once the operand has been printed and true when the modifier
 // or the operand kind is not supported.
1892 // First try the generic code, which knows about modifiers like 'c' and 'n'.
1893 if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
1894 return false;
1895
 // Only the single-character modifier 'r' is accepted here; a longer string
 // or any other character returns true.
1896 if (ExtraCode && ExtraCode[0]) {
1897 if (ExtraCode[1] != 0)
1898 return true; // Unknown modifier.
1899
1900 switch (ExtraCode[0]) {
1901 case 'r':
1902 break;
1903 default:
1904 return true;
1905 }
1906 }
1907
1908 // TODO: Should be able to support other operand types like globals.
1909 const MachineOperand &MO = MI->getOperand(OpNo);
1910 if (MO.isReg()) {
1912 *MF->getSubtarget().getRegisterInfo());
1913 return false;
1914 }
 // Immediates: the first branch prints the value in decimal; the remaining
 // branches print hex using the narrowest of 16, 32 or 64 bits that holds it.
1915 if (MO.isImm()) {
1916 int64_t Val = MO.getImm();
1918 O << Val;
1919 } else if (isUInt<16>(Val)) {
1920 O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
1921 } else if (isUInt<32>(Val)) {
1922 O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
1923 } else {
1924 O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
1925 }
1926 return false;
1927 }
 // Any other operand kind is not handled.
1928 return true;
1929}
1930
1938
/// Emit the "kernel-resource-usage" analysis remarks for \p MF.
///
/// One remark is emitted per figure: function name, SGPR and VGPR counts,
/// scratch size, dynamic stack, occupancy, and SGPR/VGPR spills. AGPRs are
/// reported only when \p hasMAIInsts is set and the LDS size only when
/// \p isModuleEntryFunction is set. Nothing is emitted when ORE is null or the
/// remark is not enabled.
1939void AMDGPUAsmPrinter::emitResourceUsageRemarks(
1940 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
1941 bool isModuleEntryFunction, bool hasMAIInsts) {
1942 if (!ORE)
1943 return;
1944
1945 const char *Name = "kernel-resource-usage";
1946 const char *Indent = " ";
1947
1948 // If the remark is not specifically enabled, do not output to yaml
1950 if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(Name))
1951 return;
1952
1953 // Currently non-kernel functions have no resources to emit.
1955 return;
1956
1957 auto EmitResourceUsageRemark = [&](StringRef RemarkName,
1958 StringRef RemarkLabel, auto Argument) {
1959 // Add an indent for every line besides the line with the kernel name. This
1960 // makes it easier to tell which resource usages go with which kernel since
1961 // the kernel name will always be displayed first.
1962 std::string LabelStr = RemarkLabel.str() + ": ";
1963 if (RemarkName != "FunctionName")
1964 LabelStr = Indent + LabelStr;
1965
1966 ORE->emit([&]() {
1967 return MachineOptimizationRemarkAnalysis(Name, RemarkName,
1969 &MF.front())
1970 << LabelStr << ore::NV(RemarkName, Argument);
1971 });
1972 };
1973
1974 // FIXME: Formatting here is pretty nasty because clang does not accept
1975 // newlines from diagnostics. This forces us to emit multiple diagnostic
1976 // remarks to simulate newlines. If and when clang does accept newlines, this
1977 // formatting should be aggregated into one remark with newlines to avoid
1978 // printing multiple diagnostic location and diag opts.
1979 EmitResourceUsageRemark("FunctionName", "Function Name",
1980 MF.getFunction().getName());
1981 EmitResourceUsageRemark("NumSGPR", "TotalSGPRs",
1982 getMCExprStr(CurrentProgramInfo.NumSGPR));
1983 EmitResourceUsageRemark("NumVGPR", "VGPRs",
1984 getMCExprStr(CurrentProgramInfo.NumArchVGPR));
1985 if (hasMAIInsts) {
1986 EmitResourceUsageRemark("NumAGPR", "AGPRs",
1987 getMCExprStr(CurrentProgramInfo.NumAccVGPR));
1988 }
1989 EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
1990 getMCExprStr(CurrentProgramInfo.ScratchSize));
 // Report "True" only when DynamicCallStack evaluates to a non-zero absolute
 // value; an expression that cannot be evaluated is reported as "False".
1991 int64_t DynStack;
1992 bool DynStackEvaluatable =
1993 CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(DynStack);
1994 StringRef DynamicStackStr =
1995 DynStackEvaluatable && DynStack ? "True" : "False";
1996 EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
1997 EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
1998 getMCExprStr(CurrentProgramInfo.Occupancy));
1999 EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
2000 CurrentProgramInfo.SGPRSpill);
2001 EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
2002 CurrentProgramInfo.VGPRSpill);
2003 if (isModuleEntryFunction)
2004 EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
2005 CurrentProgramInfo.LDSSize);
2006}
2007
// Definition of the pass's static ID member, followed by its registration
// under the argument string "amdgpu-asm-printer". The two trailing `false`
// arguments are the macro's `cfg` and `analysis` parameters.
2008char AMDGPUAsmPrinter::ID = 0;
2009
2010INITIALIZE_PASS(AMDGPUAsmPrinter, "amdgpu-asm-printer",
2011 "AMDGPU Assembly Printer", false, false)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD, const SIProgramInfo &CurrentProgramInfo, CallingConv::ID CC, const GCNSubtarget &ST, unsigned DynamicVGPRBlockSize)
const AMDGPUMCExpr * createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs, const MCExpr *NumVGPRs, unsigned DynamicVGPRBlockSize, const GCNSubtarget &STM, MCContext &Ctx)
Mimics GCNSubtarget::computeOccupancy for MCExpr.
static unsigned getRsrcReg(CallingConv::ID CallConv)
LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter()
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size)
static const MCExpr * setBits(const MCExpr *Dst, const MCExpr *Value, uint32_t Mask, uint32_t Shift, MCContext &Ctx)
Set bits in a kernel descriptor MCExpr field: return ((Dst & ~Mask) | (Value << Shift))
static uint32_t getFPMode(SIModeRegisterDefaults Mode)
static std::string computeTypeId(const FunctionType *FTy, const DataLayout &DL)
static const MCExpr * computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx)
static void appendTypeEncoding(std::string &Enc, Type *Ty, const DataLayout &DL, bool IsReturnType)
static AsmPrinter * createAMDGPUAsmPrinterPass(TargetMachine &tm, std::unique_ptr< MCStreamer > &&Streamer)
AMDGPU Assembly printer class.
AMDGPU HSA Metadata Streamer.
AMDHSA kernel descriptor MCExpr struct for use in MC layer.
MC infrastructure to propagate the function level resource usage info.
Analyzes how many registers and other resources are used by functions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
AMDHSA kernel descriptor definitions.
MC layer struct for AMDGPUMCKernelCodeT, provides MCExpr functionality where required.
amd_element_byte_size_t
The values used to define the number of bytes to use for the swizzle element size.
@ AMD_ELEMENT_8_BYTES
@ AMD_ELEMENT_16_BYTES
@ AMD_ELEMENT_4_BYTES
#define AMD_HSA_BITS_SET(dst, mask, val)
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID
@ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR
@ AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED
@ AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT
@ AMD_CODE_PROPERTY_IS_PTR64
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_ABI
Definition Compiler.h:215
#define LLVM_EXTERNAL_VISIBILITY
Definition Compiler.h:132
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
===- MachineOptimizationRemarkEmitter.h - Opt Diagnostics -*- C++ -*-—===//
OptimizedStructLayoutField Field
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition PassSupport.h:56
R600 Assembly printer class.
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS
Definition SIDefines.h:1358
#define R_0286E8_SPI_TMPRING_SIZE
Definition SIDefines.h:1500
#define FP_ROUND_MODE_DP(x)
Definition SIDefines.h:1482
#define C_00B84C_SCRATCH_EN
Definition SIDefines.h:1394
#define FP_ROUND_ROUND_TO_NEAREST
Definition SIDefines.h:1474
#define R_0286D0_SPI_PS_INPUT_ADDR
Definition SIDefines.h:1433
#define R_00B860_COMPUTE_TMPRING_SIZE
Definition SIDefines.h:1495
#define R_00B428_SPI_SHADER_PGM_RSRC1_HS
Definition SIDefines.h:1381
#define R_00B328_SPI_SHADER_PGM_RSRC1_ES
Definition SIDefines.h:1380
#define R_00B528_SPI_SHADER_PGM_RSRC1_LS
Definition SIDefines.h:1389
#define R_0286CC_SPI_PS_INPUT_ENA
Definition SIDefines.h:1432
#define R_00B128_SPI_SHADER_PGM_RSRC1_VS
Definition SIDefines.h:1367
#define FP_DENORM_MODE_DP(x)
Definition SIDefines.h:1493
#define R_00B848_COMPUTE_PGM_RSRC1
Definition SIDefines.h:1435
#define R_SPILLED_SGPRS
Definition SIDefines.h:1514
#define FP_ROUND_MODE_SP(x)
Definition SIDefines.h:1481
#define FP_DENORM_MODE_SP(x)
Definition SIDefines.h:1492
#define R_00B228_SPI_SHADER_PGM_RSRC1_GS
Definition SIDefines.h:1372
#define R_SPILLED_VGPRS
Definition SIDefines.h:1515
#define S_00B02C_EXTRA_LDS_SIZE(x)
Definition SIDefines.h:1366
#define R_00B84C_COMPUTE_PGM_RSRC2
Definition SIDefines.h:1391
#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS
Definition SIDefines.h:1365
StringSet - A set-like wrapper for the StringMap.
static const int BlockSize
Definition TarWriter.cpp:33
static cl::opt< unsigned > CacheLineSize("cache-line-size", cl::init(0), cl::Hidden, cl::desc("Use this to override the target cache line size when " "specified by the user."))
void emitFunctionEntryLabel() override
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
const MCSubtargetInfo * getGlobalSTI() const
void emitImplicitDef(const MachineInstr *MI) const override
Targets can override this to customize the output of IMPLICIT_DEF instructions in verbose mode.
std::vector< std::string > DisasmLines
void emitStartOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the start of their fi...
void endFunction(const MachineFunction *MF)
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
std::vector< std::string > HexLines
void emitGlobalVariable(const GlobalVariable *GV) override
Emit the specified global variable to the .s file.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - This function should be overridden by passes that need analysis information to do t...
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) override
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
bool doFinalization(Module &M) override
doFinalization - Virtual method overridden by subclasses to do any necessary clean up after all passes...
void emitEndOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the end of their file...
AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr< MCStreamer > Streamer)
bool doInitialization(Module &M) override
doInitialization - Virtual method overridden by subclasses to do any necessary initialization before ...
void emitFunctionBodyStart() override
Targets can override this to emit stuff before the first basic block in the function.
void emitBasicBlockStart(const MachineBasicBlock &MBB) override
Targets can override this to emit stuff at the start of a basic block.
AMDGPUTargetStreamer * getTargetStreamer() const
static void printRegOperand(MCRegister Reg, raw_ostream &O, const MCRegisterInfo &MRI)
AMDGPU target specific MCExpr operations.
static const AMDGPUMCExpr * createInstPrefSize(const MCExpr *CodeSizeBytes, MCContext &Ctx)
Create an expression for instruction prefetch size computation: min(divideCeil(CodeSizeBytes,...
static const AMDGPUMCExpr * createMax(ArrayRef< const MCExpr * > Args, MCContext &Ctx)
static const AMDGPUMCExpr * createTotalNumVGPR(const MCExpr *NumAGPR, const MCExpr *NumVGPR, MCContext &Ctx)
static const AMDGPUMCExpr * create(VariantKind Kind, ArrayRef< const MCExpr * > Args, MCContext &Ctx)
static const AMDGPUMCExpr * createExtraSGPRs(const MCExpr *VCCUsed, const MCExpr *FlatScrUsed, bool XNACKUsed, MCContext &Ctx)
Allow delayed MCExpr resolve of ExtraSGPRs (in case VCCUsed or FlatScrUsed are unresolvable but neede...
static const AMDGPUMCExpr * createAlignTo(const MCExpr *Value, const MCExpr *Align, MCContext &Ctx)
void setHwStage(unsigned CC, StringRef field, unsigned Val)
void updateHwStageMaximum(unsigned CC, StringRef field, unsigned Val)
void setComputeRegisters(StringRef field, unsigned Val)
std::pair< unsigned, unsigned > getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
unsigned getAddressableLocalMemorySize() const
Return the maximum number of bytes of LDS that can be allocated to a single workgroup.
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
unsigned getWavefrontSize() const
virtual void EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, const AMDGPU::MCKernelDescriptor &KernelDescriptor, const MCExpr *NextVGPR, const MCExpr *NextSGPR, const MCExpr *ReserveVCC, const MCExpr *ReserveFlatScr)
virtual void emitAMDGPUInfo(const AMDGPU::InfoSectionData &Data)
AMDGPUPALMetadata * getPALMetadata()
virtual void EmitDirectiveAMDHSACodeObjectVersion(unsigned COV)
virtual void EmitMCResourceInfo(const MCSymbol *NumVGPR, const MCSymbol *NumAGPR, const MCSymbol *NumExplicitSGPR, const MCSymbol *NumNamedBarrier, const MCSymbol *PrivateSegmentSize, const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch, const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall)
virtual bool EmitCodeEnd(const MCSubtargetInfo &STI)
virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type)
void initializeTargetID(const MCSubtargetInfo &STI, StringRef FeatureString)
virtual void EmitAMDKernelCodeT(AMDGPU::AMDGPUMCKernelCodeT &Header)
const std::optional< AMDGPU::IsaInfo::AMDGPUTargetID > & getTargetID() const
virtual void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR, const MCSymbol *MaxSGPR, const MCSymbol *MaxNamedBarrier)
void setXnackSetting(TargetIDSetting NewXnackSetting)
Sets xnack setting to NewXnackSetting.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
Collects and handles AsmPrinter objects required to build debug or EH information.
This class is intended to be used as a driving class for all asm writers.
Definition AsmPrinter.h:91
const TargetLoweringObjectFile & getObjFileLowering() const
Return information about object file lowering.
MCSymbol * getSymbol(const GlobalValue *GV) const
virtual void emitGlobalVariable(const GlobalVariable *GV)
Emit the specified global variable to the .s file.
TargetMachine & TM
Target machine description.
Definition AsmPrinter.h:94
MachineFunction * MF
The current machine function.
Definition AsmPrinter.h:109
virtual void SetupMachineFunction(MachineFunction &MF)
This should be called when a new MachineFunction is being processed from runOnMachineFunction.
void emitFunctionBody()
This method emits the body and trailer for a function.
virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const
Return true if the basic block has exactly one predecessor and the control transfer mechanism between...
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
virtual void emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const
This emits linkage information about GVSym based on GV, if this is supported by the target.
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
unsigned getFunctionNumber() const
Return a unique ID for the current function.
MachineOptimizationRemarkEmitter * ORE
Optimization remark emitter.
Definition AsmPrinter.h:121
AsmPrinter(TargetMachine &TM, std::unique_ptr< MCStreamer > Streamer, char &ID=AsmPrinter::ID)
MCSymbol * CurrentFnSym
The symbol for the current function.
Definition AsmPrinter.h:128
MachineModuleInfo * MMI
This is a pointer to the current MachineModuleInfo.
Definition AsmPrinter.h:112
MCContext & OutContext
This is the context for the output file that we are streaming.
Definition AsmPrinter.h:101
bool doFinalization(Module &M) override
Shut down the asmprinter.
virtual void emitBasicBlockStart(const MachineBasicBlock &MBB)
Targets can override this to emit stuff at the start of a basic block.
void emitVisibility(MCSymbol *Sym, unsigned Visibility, bool IsDefinition=true) const
This emits visibility information about symbol, if this is supported by the target.
std::unique_ptr< MCStreamer > OutStreamer
This is the MCStreamer object for the file we are generating.
Definition AsmPrinter.h:106
const MCAsmInfo & MAI
Target Asm Printer information.
Definition AsmPrinter.h:97
bool isVerbose() const
Return true if assembly output should contain comments.
Definition AsmPrinter.h:310
MCSymbol * getFunctionEnd() const
Definition AsmPrinter.h:320
void getNameWithPrefix(SmallVectorImpl< char > &Name, const GlobalValue *GV) const
virtual void emitFunctionEntryLabel()
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
void addAsmPrinterHandler(std::unique_ptr< AsmPrinterHandler > Handler)
virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &OS)
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
bool empty() const
Definition DenseMap.h:173
DISubprogram * getSubprogram() const
Get the attached subprogram.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:354
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
unsigned getMinNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize) const
bool isTgSplitEnabled() const
bool hasInstPrefSize() const
bool isCuModeEnabled() const
const AMDGPU::IsaInfo::AMDGPUTargetID & getTargetID() const
std::pair< unsigned, unsigned > computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
bool isWave32() const
bool supportsWGP() const
void getInstPrefSizeArgs(uint32_t &Mask, uint32_t &Shift, uint32_t &Width, uint32_t &CacheLineSize) const
unsigned getMaxNumUserSGPRs() const
Generation getGeneration() const
unsigned getAddressableNumSGPRs() const
unsigned getMaxWaveScratchSize() const
bool hasPrivateSegmentBuffer() const
VisibilityTypes getVisibility() const
LLVM_ABI bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition Globals.cpp:337
unsigned getAddressSpace() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:141
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
bool hasInitializer() const
Definitions have initializers, declarations don't.
MaybeAlign getAlign() const
Returns the alignment of the given variable.
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
Definition Globals.cpp:569
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
MCCodeEmitter * getEmitterPtr() const
static const MCBinaryExpr * createAdd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:342
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:347
static const MCBinaryExpr * createOr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:407
static const MCBinaryExpr * createLOr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:377
static const MCBinaryExpr * createMul(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:397
static const MCBinaryExpr * createGT(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:362
static const MCBinaryExpr * createDiv(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:352
static const MCBinaryExpr * createShl(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:412
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:427
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Context object for machine code objects.
Definition MCContext.h:83
const MCObjectFileInfo * getObjectFileInfo() const
Definition MCContext.h:413
LLVM_ABI void reportError(SMLoc L, const Twine &Msg)
LLVM_ABI MCSymbol * getOrCreateSymbol(const Twine &Name)
Lookup the symbol inside with the specified Name.
Base class for the full range of assembler expressions which are needed for parsing.
Definition MCExpr.h:34
LLVM_ABI bool evaluateAsRelocatable(MCValue &Res, const MCAssembler *Asm) const
Try to evaluate the expression to a relocatable value, i.e.
Definition MCExpr.cpp:450
MCSection * getReadOnlySection() const
MCSection * getTextSection() const
MCContext & getContext() const
This represents a section on linux, lots of unix variants and some bare metal systems.
Instances of this class represent a uniqued identifier for a section in the current translation unit.
Definition MCSection.h:573
void ensureMinAlignment(Align MinAlignment)
Makes sure that Alignment is at least MinAlignment.
Definition MCSection.h:661
bool hasInstructions() const
Definition MCSection.h:669
MCContext & getContext() const
Definition MCStreamer.h:326
Generic base class for all target subtargets.
const Triple & getTargetTriple() const
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:213
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
bool isDefined() const
isDefined - Check if this symbol is defined (i.e., it has an address).
Definition MCSymbol.h:233
StringRef getName() const
getName - Get the symbol name.
Definition MCSymbol.h:188
bool isVariable() const
isVariable - Check if this is a variable symbol.
Definition MCSymbol.h:267
void redefineIfPossible()
Prepare this symbol to be redefined.
Definition MCSymbol.h:212
const MCExpr * getVariableValue() const
Get the expression of the variable symbol.
Definition MCSymbol.h:270
MCStreamer & getStreamer()
Definition MCStreamer.h:103
static const MCUnaryExpr * createNot(const MCExpr *Expr, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:272
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
MCContext & getContext() const
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
Representation of each machine instruction.
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
Diagnostic information for optimization analysis remarks.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
LLVM_ABI unsigned getNumOperands() const
iterator_range< op_iterator > operands()
Definition Metadata.h:1845
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
Wrapper class representing virtual and physical registers.
Definition Register.h:20
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
GCNUserSGPRUsageInfo & getUserSGPRInfo()
SIModeRegisterDefaults getMode() const
unsigned getScratchReservedForDynamicVGPRs() const
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void push_back(const T &Elt)
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
std::pair< typename Base::iterator, bool > insert(StringRef key)
Definition StringSet.h:39
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:445
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
LLVM Value Representation.
Definition Value.h:75
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:319
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
A raw_ostream that writes to an SmallVector or SmallString.
StringRef str() const
Return a StringRef for the vector contents.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ LOCAL_ADDRESS
Address space for local memory.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
unsigned getSGPRAllocGranule(const MCSubtargetInfo &STI)
bool isSGPROccupancyLimited(const MCSubtargetInfo &STI)
unsigned getTotalNumSGPRs(const MCSubtargetInfo &STI)
unsigned getVGPREncodingGranule(const MCSubtargetInfo &STI, std::optional< bool > EnableWavefrontSize32)
static constexpr unsigned MaxDynamicVGPRBlocks
Maximum number of VGPR blocks that can be allocated in dynamic VGPR mode.
unsigned getSGPREncodingGranule(const MCSubtargetInfo &STI)
unsigned getTotalNumVGPRs(const MCSubtargetInfo &STI)
unsigned getMaxWavesPerEU(const MCSubtargetInfo &STI)
unsigned getNumExtraSGPRs(const MCSubtargetInfo &STI, bool VCCUsed, bool FlatScrUsed, bool XNACKUsed)
unsigned getVGPRAllocGranule(const MCSubtargetInfo &STI, unsigned DynamicVGPRBlockSize, std::optional< bool > EnableWavefrontSize32)
int32_t getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR)
void printAMDGPUMCExpr(const MCExpr *Expr, raw_ostream &OS, const MCAsmInfo *MAI)
LLVM_READNONE constexpr bool isModuleEntryFunctionCC(CallingConv::ID CC)
unsigned getLdsDwGranularity(const MCSubtargetInfo &ST)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
const MCExpr * maskShiftSet(const MCExpr *Val, uint32_t Mask, uint32_t Shift, MCContext &Ctx)
Provided with the MCExpr * Val, uint32 Mask and Shift, will return the masked and left shifted,...
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isGFX90A(const MCSubtargetInfo &STI)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool hasMAIInsts(const MCSubtargetInfo &STI)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
const MCExpr * foldAMDGPUMCExpr(const MCExpr *Expr, MCContext &Ctx)
bool isGFX10Plus(const MCSubtargetInfo &STI)
constexpr std::pair< unsigned, unsigned > getShiftMask(unsigned Value)
Deduce the least significant bit aligned shift and mask values for a binary Complement Value (as they...
unsigned hasKernargPreload(const MCSubtargetInfo &STI)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ SHT_PROGBITS
Definition ELF.h:1150
@ STT_AMDGPU_HSA_KERNEL
Definition ELF.h:1433
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract(Y &&MD)
Extract a Value from Metadata.
Definition Metadata.h:668
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1668
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
Target & getTheR600Target()
The target for R600 GPUs.
@ DK_ResourceLimit
AsmPrinter * createR600AsmPrinterPass(TargetMachine &TM, std::unique_ptr< MCStreamer > &&Streamer)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:547
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
@ Success
The lock was released successfully.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
Target & getTheGCNTarget()
The target for GCN GPUs.
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1916
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:177
Implement std::hash so that hash_code can be used in STL containers.
Definition BitVector.h:860
#define N
AMDGPUResourceUsageAnalysisImpl::SIFunctionResourceInfo FunctionResourceInfo
void initDefault(const MCSubtargetInfo &STI, MCContext &Ctx, bool InitMCExpr=true)
void validate(const MCSubtargetInfo *STI, MCContext &Ctx)
static const MCExpr * bits_get(const MCExpr *Src, uint32_t Shift, uint32_t Mask, MCContext &Ctx)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Track resource usage for kernels / entry functions.
const MCExpr * NumSGPR
const MCExpr * NumArchVGPR
const MCExpr * VGPRBlocks
const MCExpr * ScratchBlocks
const MCExpr * ComputePGMRSrc3
const MCExpr * getComputePGMRSrc1(const GCNSubtarget &ST, MCContext &Ctx) const
Compute the value of the ComputePGMRsrc1 register.
const MCExpr * VCCUsed
const MCExpr * FlatUsed
const MCExpr * NamedBarCnt
const MCExpr * ScratchEnable
const MCExpr * AccumOffset
const MCExpr * NumAccVGPR
const MCExpr * DynamicCallStack
const MCExpr * SGPRBlocks
const MCExpr * NumVGPRsForWavesPerEU
const MCExpr * NumVGPR
const MCExpr * Occupancy
const MCExpr * ScratchSize
const MCExpr * NumSGPRsForWavesPerEU
const MCExpr * getComputePGMRSrc2(const GCNSubtarget &ST, MCContext &Ctx) const
Compute the value of the ComputePGMRsrc2 register.
static void RegisterAsmPrinter(Target &T, Target::AsmPrinterCtorTy Fn)
RegisterAsmPrinter - Register an AsmPrinter implementation for the given target.