//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for SI+ GPUs.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUIGroupLP.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPURegBankSelect.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "GCNVOPDUtils.h"
#include "R600.h"
#include "R600MachineFunctionInfo.h"
#include "R600TargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/Internalize.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Utils.h"
#include <optional>

using namespace llvm;
using namespace llvm::PatternMatch;

namespace {
class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
public:
  SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
    : RegisterRegAllocBase(N, D, C) {}
};

class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
public:
  VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
    : RegisterRegAllocBase(N, D, C) {}
};

static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
                              const TargetRegisterClass &RC) {
  return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
}

static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
                              const TargetRegisterClass &RC) {
  return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
}

/// -{sgpr|vgpr}-regalloc=... command line option.
static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }

/// A dummy default pass factory indicates whether the register allocator is
/// overridden on the command line.
static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;

static SGPRRegisterRegAlloc
defaultSGPRRegAlloc("default",
                    "pick SGPR register allocator based on -O option",
                    useDefaultRegisterAllocator);

static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<SGPRRegisterRegAlloc>>
SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
             cl::desc("Register allocator to use for SGPRs"));

static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<VGPRRegisterRegAlloc>>
VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
             cl::desc("Register allocator to use for VGPRs"));
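
// Example (hypothetical invocation): allocate SGPRs with the greedy allocator
// while keeping the fast allocator for VGPRs:
//   llc -mtriple=amdgcn -sgpr-regalloc=greedy -vgpr-regalloc=fast ...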

static void initializeDefaultSGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = SGPRRegAlloc;
    SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
  }
}

static void initializeDefaultVGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = VGPRRegAlloc;
    VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
  }
}

static FunctionPass *createBasicSGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createGreedySGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateSGPRs);
}

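// Note: the trailing bool of createFastRegisterAllocator is ClearVirtRegs.
// The SGPR run below passes false so virtual registers survive for the
// subsequent VGPR allocation; the VGPR variant passes true to finish the
// rewrite.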
static FunctionPass *createFastSGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

static FunctionPass *createBasicVGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createGreedyVGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createFastVGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateVGPRs, true);
}

static SGPRRegisterRegAlloc basicRegAllocSGPR(
    "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
static SGPRRegisterRegAlloc greedyRegAllocSGPR(
    "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);

static SGPRRegisterRegAlloc fastRegAllocSGPR(
    "fast", "fast register allocator", createFastSGPRRegisterAllocator);

static VGPRRegisterRegAlloc basicRegAllocVGPR(
    "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
static VGPRRegisterRegAlloc greedyRegAllocVGPR(
    "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);

static VGPRRegisterRegAlloc fastRegAllocVGPR(
    "fast", "fast register allocator", createFastVGPRRegisterAllocator);
} // end anonymous namespace

static cl::opt<bool>
EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
                        cl::desc("Run early if-conversion"),
                        cl::init(false));

static cl::opt<bool>
OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
                 cl::desc("Run pre-RA exec mask optimizations"),
                 cl::init(true));

static cl::opt<bool>
    LowerCtorDtor("amdgpu-lower-global-ctor-dtor",
                  cl::desc("Lower GPU ctor / dtors to globals on the device."),
                  cl::init(true), cl::Hidden);

// Option to disable vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
    "amdgpu-load-store-vectorizer",
    cl::desc("Enable load store vectorizer"),
    cl::init(true),
    cl::Hidden);

// Option to control global loads scalarization
static cl::opt<bool> ScalarizeGlobal(
    "amdgpu-scalarize-global-loads",
    cl::desc("Enable global load scalarization"),
    cl::init(true),
    cl::Hidden);

// Option to run internalize pass.
static cl::opt<bool> InternalizeSymbols(
    "amdgpu-internalize-symbols",
    cl::desc("Enable elimination of non-kernel functions and unused globals"),
    cl::init(false),
    cl::Hidden);

// Option to inline all early.
static cl::opt<bool> EarlyInlineAll(
    "amdgpu-early-inline-all",
    cl::desc("Inline all functions early"),
    cl::init(false),
    cl::Hidden);

static cl::opt<bool> RemoveIncompatibleFunctions(
    "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
    cl::desc("Enable removal of functions when they "
             "use features not supported by the target GPU"),
    cl::init(true));

static cl::opt<bool> EnableSDWAPeephole(
    "amdgpu-sdwa-peephole",
    cl::desc("Enable SDWA peepholer"),
    cl::init(true));

static cl::opt<bool> EnableDPPCombine(
    "amdgpu-dpp-combine",
    cl::desc("Enable DPP combiner"),
    cl::init(true));

// Enable address space based alias analysis.
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
    cl::desc("Enable AMDGPU Alias Analysis"),
    cl::init(true));

// Option to run the late CFG structurizer.
static cl::opt<bool, true> LateCFGStructurize(
    "amdgpu-late-structurize",
    cl::desc("Enable late CFG structurization"),
    cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
    cl::Hidden);

// Disable structurizer-based control-flow lowering in order to test convergence
// control tokens. This should eventually be replaced by the wave-transform.
static cl::opt<bool, true> DisableStructurizer(
    "amdgpu-disable-structurizer",
    cl::desc("Disable structurizer for experiments; produces unusable code"),
    cl::location(AMDGPUTargetMachine::DisableStructurizer), cl::ReallyHidden);

// Enable library call simplifications.
static cl::opt<bool> EnableLibCallSimplify(
    "amdgpu-simplify-libcall",
    cl::desc("Enable amdgpu library simplifications"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableLowerKernelArguments(
    "amdgpu-ir-lower-kernel-arguments",
    cl::desc("Lower kernel argument loads in IR pass"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableRegReassign(
    "amdgpu-reassign-regs",
    cl::desc("Enable register reassign optimizations on gfx10+"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> OptVGPRLiveRange(
    "amdgpu-opt-vgpr-liverange",
    cl::desc("Enable VGPR liverange optimizations for if-else structure"),
    cl::init(true), cl::Hidden);

static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
    "amdgpu-atomic-optimizer-strategy",
    cl::desc("Select DPP or Iterative strategy for scan"),
    cl::init(ScanOptions::Iterative),
    cl::values(
        clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"),
        clEnumValN(ScanOptions::Iterative, "Iterative",
                   "Use Iterative approach for scan"),
        clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));
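
// Example (hypothetical invocation): force the DPP-based scan lowering with
//   llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP ...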

// Enable Mode register optimization
static cl::opt<bool> EnableSIModeRegisterPass(
    "amdgpu-mode-register",
    cl::desc("Enable mode register pass"),
    cl::init(true),
    cl::Hidden);

// Enable GFX11.5+ s_singleuse_vdst insertion
static cl::opt<bool>
    EnableInsertSingleUseVDST("amdgpu-enable-single-use-vdst",
                              cl::desc("Enable s_singleuse_vdst insertion"),
                              cl::init(false), cl::Hidden);

// Enable GFX11+ s_delay_alu insertion
static cl::opt<bool>
    EnableInsertDelayAlu("amdgpu-enable-delay-alu",
                         cl::desc("Enable s_delay_alu insertion"),
                         cl::init(true), cl::Hidden);

// Enable GFX11+ VOPD
static cl::opt<bool>
    EnableVOPD("amdgpu-enable-vopd",
               cl::desc("Enable VOPD, dual issue of VALU in wave32"),
               cl::init(true), cl::Hidden);

// Option is used in lit tests to prevent deadcoding of patterns inspected.
static cl::opt<bool>
EnableDCEInRA("amdgpu-dce-in-ra",
              cl::init(true), cl::Hidden,
              cl::desc("Enable machine DCE inside regalloc"));

static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
                                           cl::desc("Adjust wave priority"),
                                           cl::init(false), cl::Hidden);

static cl::opt<bool> EnableScalarIRPasses(
    "amdgpu-scalar-ir-passes",
    cl::desc("Enable scalar IR passes"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableStructurizerWorkarounds(
    "amdgpu-enable-structurizer-workarounds",
    cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool, true> EnableLowerModuleLDS(
    "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
    cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePreRAOptimizations(
    "amdgpu-enable-pre-ra-optimizations",
    cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePromoteKernelArguments(
    "amdgpu-enable-promote-kernel-arguments",
    cl::desc("Enable promotion of flat kernel pointer arguments to global"),
    cl::Hidden, cl::init(true));

static cl::opt<bool> EnableImageIntrinsicOptimizer(
    "amdgpu-enable-image-intrinsic-optimizer",
    cl::desc("Enable image intrinsic optimizer pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool>
    EnableLoopPrefetch("amdgpu-loop-prefetch",
                       cl::desc("Enable loop data prefetch on AMDGPU"),
                       cl::Hidden, cl::init(false));

static cl::opt<bool> EnableMaxIlpSchedStrategy(
    "amdgpu-enable-max-ilp-scheduling-strategy",
    cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
    cl::Hidden, cl::init(false));

static cl::opt<bool> EnableRewritePartialRegUses(
    "amdgpu-enable-rewrite-partial-reg-uses",
    cl::desc("Enable rewrite partial reg uses pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableHipStdPar(
    "amdgpu-enable-hipstdpar",
    cl::desc("Enable HIP Standard Parallelism Offload support"),
    cl::init(false), cl::Hidden);

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
  // Register the target
  RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());

  PassRegistry *PR = PassRegistry::getPassRegistry();
  // The long run of initialize*Pass(*PR) calls that registers every AMDGPU
  // backend pass with the PassRegistry is elided in this listing.
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return std::make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
      C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
      new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
  return DAG;
}

static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  auto *DAG = new GCNIterativeScheduler(
      C, GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
  return new GCNIterativeScheduler(
      C, GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}

static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  auto *DAG = new GCNIterativeScheduler(C, GCNIterativeScheduler::SCHEDULE_ILP);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);

static MachineSchedRegistry
GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                             "Run GCN scheduler to maximize occupancy",
                             createGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
    GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
                           createGCNMaxILPMachineScheduler);

static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
    "gcn-iterative-max-occupancy-experimental",
    "Run GCN scheduler to maximize occupancy (experimental)",
    createIterativeGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry GCNMinRegSchedRegistry(
    "gcn-iterative-minreg",
    "Run GCN iterative scheduler for minimal register usage (experimental)",
    createMinRegScheduler);

static MachineSchedRegistry GCNILPSchedRegistry(
    "gcn-iterative-ilp",
    "Run GCN iterative scheduler for ILP scheduling (experimental)",
    createIterativeILPMachineScheduler);

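// These registries plug into the generic -misched machinery; for example, a
// hypothetical invocation such as
//   llc -mtriple=amdgcn -misched=gcn-max-ilp foo.ll
// selects the max-ILP strategy registered above.
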
static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat. 160-bit non-integral fat buffer pointers that include a 128-bit
  // buffer descriptor and a 32-bit offset, which are indexed by 32-bit values
  // (address space 7), and 128-bit non-integral buffer resources (address
  // space 8) which cannot be non-trivially accessed by LLVM memory operations
  // like getelementptr.
  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
         "-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-"
         "v32:32-v48:64-v96:"
         "128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-"
         "G1-ni:7:8:9";
}
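
// Reading the layout string above: "p:64:64" makes flat pointers 64-bit,
// "p5:32:32" makes private (scratch) pointers 32-bit, "A5" places allocas in
// address space 5, and "ni:7:8:9" marks the buffer-related address spaces as
// non-integral.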

static LLVM_READNONE StringRef getGPUOrDefault(const Triple &TT,
                                               StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  // Need to default to a target with flat support for HSA.
  if (TT.getArch() == Triple::amdgcn)
    return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";

  return "r600";
}

static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         const TargetOptions &Options,
                                         std::optional<Reloc::Model> RM,
                                         std::optional<CodeModel::Model> CM,
                                         CodeGenOptLevel OptLevel)
    : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                        FS, Options, getEffectiveRelocModel(RM),
                        getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
      TLOF(createTLOF(getTargetTriple())) {
  initAsmInfo();
  if (TT.getArch() == Triple::amdgcn) {
    if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
    else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
  }
}

bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
bool AMDGPUTargetMachine::EnableFunctionCalls = false;
bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;
bool AMDGPUTargetMachine::DisableStructurizer = false;

AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;

StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.isValid() ? FSAttr.getValueAsString()
                          : getTargetFeatureString();
}

/// Predicate for Internalize pass.
static bool mustPreserveGV(const GlobalValue &GV) {
  if (const Function *F = dyn_cast<Function>(&GV))
    return F->isDeclaration() || F->getName().starts_with("__asan_") ||
           F->getName().starts_with("__sanitizer_") ||
           AMDGPU::isEntryFunctionCC(F->getCallingConv());

  GV.removeDeadConstantUsers();
  return !GV.use_empty();
}

void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
  AAM.registerFunctionAnalysis<AMDGPUAA>();
}

static Expected<ScanOptions>
parseAMDGPUAtomicOptimizerStrategy(StringRef Params) {
  if (Params.empty())
    return ScanOptions::Iterative;
  Params.consume_front("strategy=");
  auto Result = StringSwitch<std::optional<ScanOptions>>(Params)
                    .Case("dpp", ScanOptions::DPP)
                    .Cases("iterative", "", ScanOptions::Iterative)
                    .Case("none", ScanOptions::None)
                    .Default(std::nullopt);
  if (Result)
    return *Result;
  return make_error<StringError>("invalid parameter", inconvertibleErrorCode());
}
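
// With the new pass manager, this parser consumes the parameter string of a
// pass specification such as (hypothetical):
//   opt -passes='amdgpu-atomic-optimizer<strategy=dpp>' ...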

void AMDGPUTargetMachine::registerPassBuilderCallbacks(
    PassBuilder &PB, bool PopulateClassToPassNames) {

#define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
#include "llvm/Passes/TargetPassRegistry.inc"

  PB.registerPipelineStartEPCallback(
      [](ModulePassManager &PM, OptimizationLevel Level) {
        FunctionPassManager FPM;
        FPM.addPass(AMDGPUUseNativeCallsPass());
        if (EnableLibCallSimplify && Level != OptimizationLevel::O0)
          FPM.addPass(AMDGPUSimplifyLibCallsPass());
        PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
        if (EnableHipStdPar)
          PM.addPass(HipStdParAcceleratorCodeSelectionPass());
      });

  PB.registerPipelineEarlySimplificationEPCallback(
      [](ModulePassManager &PM, OptimizationLevel Level) {
        PM.addPass(AMDGPUPrintfRuntimeBindingPass());

        if (Level == OptimizationLevel::O0)
          return;

        PM.addPass(AMDGPUUnifyMetadataPass());

        if (InternalizeSymbols) {
          PM.addPass(InternalizePass(mustPreserveGV));
          PM.addPass(GlobalDCEPass());
        }

        if (EarlyInlineAll && !EnableFunctionCalls)
          PM.addPass(AMDGPUAlwaysInlinePass());
      });

  PB.registerCGSCCOptimizerLateEPCallback(
      [this](CGSCCPassManager &PM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FunctionPassManager FPM;

        // Add promote kernel arguments pass to the opt pipeline right before
        // infer address spaces which is needed to do actual address space
        // rewriting.
        if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
            EnablePromoteKernelArguments)
          FPM.addPass(AMDGPUPromoteKernelArgumentsPass());

        // Add infer address spaces pass to the opt pipeline after inlining
        // but before SROA to increase SROA opportunities.
        FPM.addPass(InferAddressSpacesPass());

        // This should run after inlining to have any chance of doing
        // anything, and before other cleanup optimizations.
        FPM.addPass(AMDGPULowerKernelAttributesPass());

        if (Level != OptimizationLevel::O0) {
          // Promote alloca to vector before SROA and loop unroll. If we
          // manage to eliminate allocas before unroll we may choose to unroll
          // less.
          FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
        }

        PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
      });

  PB.registerFullLinkTimeOptimizationLastEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        // We want to support the -lto-partitions=N option as "best effort".
        // For that, we need to lower LDS earlier in the pipeline before the
        // module is partitioned for codegen.
        if (EnableLowerModuleLDS)
          PM.addPass(AMDGPULowerModuleLDSPass(*this));
      });
}

int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
  return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
          AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
          AddrSpace == AMDGPUAS::REGION_ADDRESS)
             ? -1
             : 0;
}
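
// Example: an IR-level `ptr addrspace(5) null` (a private/scratch pointer)
// therefore materializes as the all-ones bit pattern -1, while flat and
// global null pointers remain 0.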

bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
                                              unsigned DestAS) const {
  return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
         AMDGPU::isFlatGlobalAddrSpace(DestAS);
}
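
// For instance, an addrspacecast between the global (1) and flat (0) address
// spaces is a no-op on amdgcn: both use the same 64-bit representation.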

unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
  const auto *LD = dyn_cast<LoadInst>(V);
  if (!LD)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;

  // It must be a generic pointer loaded.
  assert(V->getType()->isPointerTy() &&
         V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);

  const auto *Ptr = LD->getPointerOperand();
  if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
  // For a generic pointer loaded from the constant memory, it could be assumed
  // as a global pointer since the constant memory is only populated on the
  // host side. As implied by the offload programming model, only global
  // pointers could be referenced on the host side.
  return AMDGPUAS::GLOBAL_ADDRESS;
}

std::pair<const Value *, unsigned>
AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
  if (auto *II = dyn_cast<IntrinsicInst>(V)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_is_shared:
      return std::pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
    case Intrinsic::amdgcn_is_private:
      return std::pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
    default:
      break;
    }
    return std::pair(nullptr, -1);
  }
  // Check the global pointer predication based on
  // (!is_shared(p) && !is_private(p)). Note that logic 'and' is commutative and
  // the order of 'is_shared' and 'is_private' is not significant.
  Value *Ptr;
  if (match(
          const_cast<Value *>(V),
          m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
                  m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
                      m_Deferred(Ptr))))))
    return std::pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);

  return std::pair(nullptr, -1);
}
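
// The matched pattern corresponds to IR of the form (hypothetical example):
//   %s  = call i1 @llvm.amdgcn.is.shared(ptr %p)
//   %pv = call i1 @llvm.amdgcn.is.private(ptr %p)
//   %ns = xor i1 %s, true
//   %np = xor i1 %pv, true
//   %g  = and i1 %ns, %np
// i.e. where %g is known true, %p may be treated as a global pointer.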

unsigned
AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
  switch (Kind) {
  case PseudoSourceValue::Stack:
  case PseudoSourceValue::FixedStack:
    return AMDGPUAS::PRIVATE_ADDRESS;
  case PseudoSourceValue::ConstantPool:
  case PseudoSourceValue::GOT:
  case PseudoSourceValue::JumpTable:
  case PseudoSourceValue::GlobalValueCallEntry:
  case PseudoSourceValue::ExternalSymbolCallEntry:
    return AMDGPUAS::CONSTANT_ADDRESS;
  }
  return AMDGPUAS::FLAT_ADDRESS;
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   const TargetOptions &Options,
                                   std::optional<Reloc::Model> RM,
                                   std::optional<CodeModel::Model> CM,
                                   CodeGenOptLevel OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const TargetSubtargetInfo *
GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
  }

  I->setScalarizeGlobalBehavior(ScalarizeGlobal);

  return I.get();
}
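
// Note: two functions whose "target-cpu" or "target-features" attributes
// differ (say, one adding "+wavefrontsize32") receive distinct GCNSubtarget
// instances from the SubtargetMap above, keyed by GPU name plus feature
// string.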

TargetTransformInfo
GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
  return TargetTransformInfo(GCNTTIImpl(this, F));
}

//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//

std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
  return getStandardCSEConfigForOpt(TM->getOptLevel());
}

namespace {

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) {
    // It is necessary to know the register usage of the entire call graph. We
    // allow calls without EnableAMDGPUFunctionCalls if they are marked
    // noinline, so this is always required.
    setRequiresCodeGenSCCOrder(true);
    substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
  }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  ScheduleDAGInstrs *
  createPostMachineScheduler(MachineSchedContext *C) const override {
    ScheduleDAGMI *DAG = new GCNPostScheduleDAGMILive(
        C, std::make_unique<PostGenericScheduler>(C),
        /*RemoveKillFlags=*/true);
    const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
    if (ST.shouldClusterStores())
      DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
    DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
    DAG->addMutation(
        createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
    if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
      DAG->addMutation(createVOPDPairingMutation());
    return DAG;
  }

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  bool addIRTranslator() override;
  void addPreLegalizeMachineIR() override;
  bool addLegalizeMachineIR() override;
  void addPreRegBankSelect() override;
  bool addRegBankSelect() override;
  void addPreGlobalInstructionSelect() override;
  bool addGlobalInstructionSelect() override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  FunctionPass *createSGPRAllocPass(bool Optimized);
  FunctionPass *createVGPRAllocPass(bool Optimized);
  FunctionPass *createRegAllocPass(bool Optimized) override;

  bool addRegAssignAndRewriteFast() override;
  bool addRegAssignAndRewriteOptimized() override;

  void addPreRegAlloc() override;
  bool addPreRewrite() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // end anonymous namespace

AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
  // Exceptions and StackMaps are not supported, so these passes will never do
  // anything.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  // Garbage collection is not supported.
  disablePass(&GCLoweringID);
  disablePass(&ShadowStackGCLoweringID);
}

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOptLevel::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
    addPass(createLoopDataPrefetchPass());
  addPass(createSeparateConstOffsetFromGEPPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  Triple::ArchType Arch = TM.getTargetTriple().getArch();
  if (RemoveIncompatibleFunctions && Arch == Triple::amdgcn)
    addPass(createAMDGPURemoveIncompatibleFunctionsPass(&TM));

  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  addPass(createAMDGPUPrintfRuntimeBinding());
  if (LowerCtorDtor)
    addPass(createAMDGPUCtorDtorLoweringLegacyPass());

  if (isPassEnabled(EnableImageIntrinsicOptimizer))
    addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  if (Arch == Triple::r600)
    addPass(createR600OpenCLImageTypeLoweringPass());

  // Replace OpenCL enqueued block function pointers with global variables.
  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());

  // Runs before PromoteAlloca so the latter can account for function uses
  if (EnableLowerModuleLDS) {
    addPass(createAMDGPULowerModuleLDSLegacyPass(&TM));
  }

  // AMDGPUAttributor infers lack of llvm.amdgcn.lds.kernel.id calls, so run
  // after their introduction
  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addPass(createAMDGPUAttributorLegacyPass());

  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addPass(createInferAddressSpacesPass());

  // Run atomic optimizer before Atomic Expand
  if ((TM.getTargetTriple().getArch() == Triple::amdgcn) &&
      (TM.getOptLevel() >= CodeGenOptLevel::Less) &&
      (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
    addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy));
  }

  addPass(createAtomicExpandLegacyPass());

  if (TM.getOptLevel() > CodeGenOptLevel::None) {
    addPass(createAMDGPUPromoteAlloca());

    if (isPassEnabled(EnableScalarIRPasses))
      addStraightLineScalarOptimizationPasses();

    if (EnableAMDGPUAliasAnalysis) {
      addPass(createAMDGPUAAWrapperPass());
      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
      }));
    }

    if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
      // TODO: May want to move later or split into an early and late one.
      addPass(createAMDGPUCodeGenPreparePass());
    }

    // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
    // have expanded.
    if (TM.getOptLevel() > CodeGenOptLevel::Less)
      addPass(createLICMPass());
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (isPassEnabled(EnableScalarIRPasses))
    addEarlyCSEOrGVNPass();
}

void AMDGPUPassConfig::addCodeGenPrepare() {
  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
    // FIXME: This pass adds 2 hacky attributes that can be replaced with an
    // analysis, and should be removed.
    addPass(createAMDGPUAnnotateKernelFeaturesPass());
  }

  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
      EnableLowerKernelArguments)
    addPass(createAMDGPULowerKernelArgumentsPass());

  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
    // This lowering has been placed after codegenprepare to take advantage of
    // address mode matching (which is why it isn't put with the LDS lowerings).
    // It could be placed anywhere before uniformity annotations (an analysis
    // that it changes by splitting up fat pointers into their components)
    // but has been put before switch lowering and CFG flattening so that those
    // passes can run on the more optimized control flow this pass creates in
    // many cases.
    //
    // FIXME: This should ideally be put after the LoadStoreVectorizer.
    // However, due to some annoying facts about ResourceUsageAnalysis,
    // (especially as exercised in the resource-usage-dead-function test),
    // we need all the function passes codegenprepare all the way through
    // said resource usage analysis to run on the call graph produced
    // before codegenprepare runs (because codegenprepare will knock some
    // nodes out of the graph, which leads to function-level passes not
    // being run on them, which causes crashes in the resource usage analysis).
    addPass(createAMDGPULowerBufferFatPointersPass());
    // In accordance with the above FIXME, manually force all the
    // function-level passes into a CGSCCPassManager.
    addPass(new DummyCGSCCPass());
  }

  TargetPassConfig::addCodeGenPrepare();

  if (isPassEnabled(EnableLoadStoreVectorizer))
    addPass(createLoadStoreVectorizerPass());

  // LowerSwitch may introduce unreachable blocks that can cause unexpected
  // behavior for subsequent passes. Placing it here lets those blocks be
  // cleaned up by the UnreachableBlockElim pass inserted next in the flow.
  addPass(createLowerSwitchPass());
}

bool AMDGPUPassConfig::addPreISel() {
  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

llvm::ScheduleDAGInstrs *
AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG = createGenericSchedLive(C);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

MachineFunctionInfo *R600TargetMachine::createMachineFunctionInfo(
    BumpPtrAllocator &Allocator, const Function &F,
    const TargetSubtargetInfo *STI) const {
  return R600MachineFunctionInfo::create<R600MachineFunctionInfo>(
      Allocator, F, static_cast<const R600Subtarget *>(STI));
}

//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
    MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);

  if (EnableMaxIlpSchedStrategy)
    return createGCNMaxILPMachineScheduler(C);

  return createGCNMaxOccupancyMachineScheduler(C);
}

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createAMDGPULateCodeGenPreparePass());

  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createSinkingPass());

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(&AMDGPUUnifyDivergentExitNodesID);
  if (!LateCFGStructurize && !DisableStructurizer) {
    if (EnableStructurizerWorkarounds) {
      addPass(createFixIrreduciblePass());
      addPass(createUnifyLoopExitsPass());
    }
    addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
  }
  addPass(createAMDGPUAnnotateUniformValues());
  if (!LateCFGStructurize && !DisableStructurizer) {
    addPass(createSIAnnotateControlFlowPass());
    // TODO: Move this right after structurizeCFG to avoid extra divergence
    // analysis. This depends on stopping SIAnnotateControlFlow from making
    // control flow modifications.
    addPass(createAMDGPURewriteUndefForPHILegacyPass());
  }
  addPass(createLCSSAPass());

  if (TM->getOptLevel() > CodeGenOptLevel::Less)
    addPass(&AMDGPUPerfHintAnalysisID);

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  if (EnableDPPCombine)
    addPass(&GCNDPPCombineID);
  addPass(&SILoadStoreOptimizerID);
  if (isPassEnabled(EnableSDWAPeephole)) {
    addPass(&SIPeepholeSDWAID);
    addPass(&EarlyMachineLICMID);
    addPass(&MachineCSEID);
    addPass(&SIFoldOperandsID);
  }
  addPass(&DeadMachineInstructionElimID);
  addPass(createSIShrinkInstructionsPass());
}

bool GCNPassConfig::addILPOpts() {
  if (EnableEarlyIfConversion)
    addPass(&EarlyIfConverterID);

  TargetPassConfig::addILPOpts();
  return false;
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(&SIFixSGPRCopiesID);
  addPass(createSILowerI1CopiesPass());
  return false;
}

bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator(getOptLevel()));
  return false;
}

void GCNPassConfig::addPreLegalizeMachineIR() {
  bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
  addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
  addPass(new Localizer());
}

bool GCNPassConfig::addLegalizeMachineIR() {
  addPass(new Legalizer());
  return false;
}

void GCNPassConfig::addPreRegBankSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
  addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
  addPass(createAMDGPUGlobalISelDivergenceLoweringPass());
}

bool GCNPassConfig::addRegBankSelect() {
  addPass(new AMDGPURegBankSelect());
  return false;
}

void GCNPassConfig::addPreGlobalInstructionSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
  addPass(createAMDGPURegBankCombiner(IsOptNone));
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  addPass(new InstructionSelect(getOptLevel()));
  return false;
}

void GCNPassConfig::addPreRegAlloc() {
  if (LateCFGStructurize) {
    addPass(createAMDGPUMachineCFGStructurizerPass());
  }
}

void GCNPassConfig::addFastRegAlloc() {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID);

  insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);

  TargetPassConfig::addFastRegAlloc();
}

void GCNPassConfig::addOptimizedRegAlloc() {
  // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
  // instructions that cause scheduling barriers.
  insertPass(&MachineSchedulerID, &SIWholeQuadModeID);

  if (OptExecMaskPreRA)
    insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);

  if (EnableRewritePartialRegUses)
    insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);

  if (isPassEnabled(EnablePreRAOptimizations))
    insertPass(&MachineSchedulerID, &GCNPreRAOptimizationsID);

  // This is not an essential optimization and it has a noticeable impact on
  // compilation time, so we only enable it from O2.
  if (TM->getOptLevel() > CodeGenOptLevel::Less)
    insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);

  // FIXME: when an instruction has a Killed operand, and the instruction is
  // inside a bundle, it seems only the BUNDLE instruction appears as the Kills
  // of the register in LiveVariables; this would trigger a failure in the
  // verifier. We should fix it and enable the verifier.
  if (OptVGPRLiveRange)
    insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID);
  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID);

  if (EnableDCEInRA)
    insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);

  TargetPassConfig::addOptimizedRegAlloc();
}

bool GCNPassConfig::addPreRewrite() {
  addPass(&SILowerWWMCopiesID);
  if (EnableRegReassign)
    addPass(&GCNNSAReassignID);
  return true;
}

FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
                  initializeDefaultSGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyRegisterAllocator(onlyAllocateSGPRs);

  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
                  initializeDefaultVGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyVGPRRegisterAllocator();

  return createFastVGPRRegisterAllocator();
}

FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
  llvm_unreachable("should not be used");
}

static const char RegAllocOptNotSupportedMessage[] =
    "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";

bool GCNPassConfig::addRegAssignAndRewriteFast() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(&GCNPreRALongBranchRegID);

  addPass(createSGPRAllocPass(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);
  addPass(&SIPreAllocateWWMRegsID);

  addPass(createVGPRAllocPass(false));

  addPass(&SILowerWWMCopiesID);
  return true;
}

bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(&GCNPreRALongBranchRegID);

  addPass(createSGPRAllocPass(true));

  // Commit allocated register changes. This is mostly necessary because too
  // many things rely on the use lists of the physical registers, such as the
  // verifier. This is only necessary with allocators which use LiveIntervals,
  // since FastRegAlloc does the replacements itself.
  addPass(createVirtRegRewriter(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);
  addPass(&SIPreAllocateWWMRegsID);

  addPass(createVGPRAllocPass(true));

  addPreRewrite();
  addPass(&VirtRegRewriterID);

  addPass(&AMDGPUMarkLastScratchLoadID);

  return true;
}

void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIOptimizeExecMaskingID);
  TargetPassConfig::addPostRegAlloc();
}

void GCNPassConfig::addPreSched2() {
  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createSIShrinkInstructionsPass());
  addPass(&SIPostRABundlerID);
}

void GCNPassConfig::addPreEmitPass() {
  if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
    addPass(&GCNCreateVOPDID);
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());

  addPass(createSIModeRegisterPass());

  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIInsertHardClausesID);

  addPass(&SILateBranchLoweringPassID);
  if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less))
    addPass(createAMDGPUSetWavePriorityPass());
  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIPreEmitPeepholeID);
  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee to be able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);

  if (isPassEnabled(EnableInsertSingleUseVDST, CodeGenOptLevel::Less))
    addPass(&AMDGPUInsertSingleUseVDSTID);

  if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less))
    addPass(&AMDGPUInsertDelayAluID);

  addPass(&BranchRelaxationPassID);
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}

void GCNTargetMachine::registerMachineRegisterInfoCallback(
    MachineFunction &MF) const {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MF.getRegInfo().addDelegate(MFI);
}

MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo(
    BumpPtrAllocator &Allocator, const Function &F,
    const TargetSubtargetInfo *STI) const {
  return SIMachineFunctionInfo::create<SIMachineFunctionInfo>(
      Allocator, F, static_cast<const GCNSubtarget *>(STI));
}

yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
  return new yaml::SIMachineFunctionInfo();
}

yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return new yaml::SIMachineFunctionInfo(
      *MFI, *MF.getSubtarget<GCNSubtarget>().getRegisterInfo(), MF);
}

bool GCNTargetMachine::parseMachineFunctionInfo(
    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
    SMDiagnostic &Error, SMRange &SourceRange) const {
  const yaml::SIMachineFunctionInfo &YamlMFI =
      static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
  MachineFunction &MF = PFS.MF;
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
    return true;

  if (MFI->Occupancy == 0) {
    // Fixup the subtarget dependent default value.
    MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
  }

  auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
    Register TempReg;
    if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
      SourceRange = RegName.SourceRange;
      return true;
    }
    RegVal = TempReg;

    return false;
  };

  auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
                                   Register &RegVal) {
    return !RegName.Value.empty() && parseRegister(RegName, RegVal);
  };

  if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
    return true;

  if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy))
    return true;

  if (parseOptionalRegister(YamlMFI.LongBranchReservedReg,
                            MFI->LongBranchReservedReg))
    return true;

  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
    const MemoryBuffer &Buffer =
        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
                         RegName.Value.size(), SourceMgr::DK_Error,
                         "incorrect register class for field", RegName.Value,
                         std::nullopt, std::nullopt);
    SourceRange = RegName.SourceRange;
    return true;
  };

  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
    return true;

  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
      !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
  }

  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
  }

  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
  }

  for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
    Register ParsedReg;
    if (parseRegister(YamlReg, ParsedReg))
      return true;

    MFI->reserveWWMRegister(ParsedReg);
  }

  auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A,
                                   const TargetRegisterClass &RC,
                                   ArgDescriptor &Arg, unsigned UserSGPRs,
                                   unsigned SystemSGPRs) {
    // Skip parsing if it's not present.
    if (!A)
      return false;

    if (A->IsRegister) {
      Register Reg;
      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
        SourceRange = A->RegisterName.SourceRange;
        return true;
      }
      if (!RC.contains(Reg))
        return diagnoseRegisterClass(A->RegisterName);
      Arg = ArgDescriptor::createRegister(Reg);
    } else
      Arg = ArgDescriptor::createStack(A->StackOffset);
    // Check and apply the optional mask.
    if (A->Mask)
      Arg = ArgDescriptor::createArg(Arg, *A->Mask);

    MFI->NumUserSGPRs += UserSGPRs;
    MFI->NumSystemSGPRs += SystemSGPRs;
    return false;
  };

  if (YamlMFI.ArgInfo &&
      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
                             AMDGPU::SGPR_128RegClass,
                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.QueuePtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.LDSKernelId, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
    return true;

  if (ST.hasIEEEMode())
    MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
  if (ST.hasDX10ClampMode())
    MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;

  // FIXME: Move proper support for denormal-fp-math into base MachineFunction
  MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals
                                      ? DenormalMode::IEEE
                                      : DenormalMode::PreserveSign;
  MFI->Mode.FP32Denormals.Output = YamlMFI.Mode.FP32OutputDenormals
                                       ? DenormalMode::IEEE
                                       : DenormalMode::PreserveSign;

  MFI->Mode.FP64FP16Denormals.Input = YamlMFI.Mode.FP64FP16InputDenormals
                                          ? DenormalMode::IEEE
                                          : DenormalMode::PreserveSign;
  MFI->Mode.FP64FP16Denormals.Output = YamlMFI.Mode.FP64FP16OutputDenormals
                                           ? DenormalMode::IEEE
                                           : DenormalMode::PreserveSign;

  return false;
}
static cl::opt< bool > EnableEarlyIfConversion("aarch64-enable-early-ifcvt", cl::Hidden, cl::desc("Run early if-conversion"), cl::init(true))
This is the AMGPU address space based alias analysis pass.
static cl::opt< bool > EnableDCEInRA("amdgpu-dce-in-ra", cl::init(true), cl::Hidden, cl::desc("Enable machine DCE inside regalloc"))
static cl::opt< bool, true > EnableLowerModuleLDS("amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"), cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true), cl::Hidden)
static MachineSchedRegistry SISchedRegistry("si", "Run SI's custom scheduler", createSIMachineScheduler)
static ScheduleDAGInstrs * createIterativeILPMachineScheduler(MachineSchedContext *C)
static cl::opt< bool > EarlyInlineAll("amdgpu-early-inline-all", cl::desc("Inline all functions early"), cl::init(false), cl::Hidden)
static cl::opt< bool > EnableLowerKernelArguments("amdgpu-ir-lower-kernel-arguments", cl::desc("Lower kernel argument loads in IR pass"), cl::init(true), cl::Hidden)
static ScheduleDAGInstrs * createGCNMaxILPMachineScheduler(MachineSchedContext *C)
static cl::opt< bool > EnableSDWAPeephole("amdgpu-sdwa-peephole", cl::desc("Enable SDWA peepholer"), cl::init(true))
static MachineSchedRegistry GCNMinRegSchedRegistry("gcn-iterative-minreg", "Run GCN iterative scheduler for minimal register usage (experimental)", createMinRegScheduler)
static cl::opt< bool > EnableImageIntrinsicOptimizer("amdgpu-enable-image-intrinsic-optimizer", cl::desc("Enable image intrinsic optimizer pass"), cl::init(true), cl::Hidden)
static cl::opt< bool > EnableSIModeRegisterPass("amdgpu-mode-register", cl::desc("Enable mode register pass"), cl::init(true), cl::Hidden)
static cl::opt< bool > EnableDPPCombine("amdgpu-dpp-combine", cl::desc("Enable DPP combiner"), cl::init(true))
static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry("gcn-iterative-max-occupancy-experimental", "Run GCN scheduler to maximize occupancy (experimental)", createIterativeGCNMaxOccupancyMachineScheduler)
static cl::opt< bool > EnableSetWavePriority("amdgpu-set-wave-priority", cl::desc("Adjust wave priority"), cl::init(false), cl::Hidden)
static cl::opt< bool > LowerCtorDtor("amdgpu-lower-global-ctor-dtor", cl::desc("Lower GPU ctor / dtors to globals on the device."), cl::init(true), cl::Hidden)
static cl::opt< bool, true > DisableStructurizer("amdgpu-disable-structurizer", cl::desc("Disable structurizer for experiments; produces unusable code"), cl::location(AMDGPUTargetMachine::DisableStructurizer), cl::ReallyHidden)
static cl::opt< bool > OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden, cl::desc("Run pre-RA exec mask optimizations"), cl::init(true))
static cl::opt< bool > EnablePromoteKernelArguments("amdgpu-enable-promote-kernel-arguments", cl::desc("Enable promotion of flat kernel pointer arguments to global"), cl::Hidden, cl::init(true))
static cl::opt< bool > EnableRewritePartialRegUses("amdgpu-enable-rewrite-partial-reg-uses", cl::desc("Enable rewrite partial reg uses pass"), cl::init(true), cl::Hidden)
static cl::opt< bool > EnableLibCallSimplify("amdgpu-simplify-libcall", cl::desc("Enable amdgpu library simplifications"), cl::init(true), cl::Hidden)
static MachineSchedRegistry GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp", createGCNMaxILPMachineScheduler)
static cl::opt< bool > InternalizeSymbols("amdgpu-internalize-symbols", cl::desc("Enable elimination of non-kernel functions and unused globals"), cl::init(false), cl::Hidden)
static LLVM_READNONE StringRef getGPUOrDefault(const Triple &TT, StringRef GPU)
static Reloc::Model getEffectiveRelocModel(std::optional< Reloc::Model > RM)
static cl::opt< bool > EnableStructurizerWorkarounds("amdgpu-enable-structurizer-workarounds", cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true), cl::Hidden)
static cl::opt< bool > EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, cl::desc("Enable AMDGPU Alias Analysis"), cl::init(true))
static Expected< ScanOptions > parseAMDGPUAtomicOptimizerStrategy(StringRef Params)
static ScheduleDAGInstrs * createMinRegScheduler(MachineSchedContext *C)
static cl::opt< bool, true > LateCFGStructurize("amdgpu-late-structurize", cl::desc("Enable late CFG structurization"), cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG), cl::Hidden)
static cl::opt< bool > EnableHipStdPar("amdgpu-enable-hipstdpar", cl::desc("Enable HIP Standard Parallelism Offload support"), cl::init(false), cl::Hidden)
static cl::opt< bool > EnableInsertDelayAlu("amdgpu-enable-delay-alu", cl::desc("Enable s_delay_alu insertion"), cl::init(true), cl::Hidden)
static ScheduleDAGInstrs * createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C)
static cl::opt< bool > EnableLoadStoreVectorizer("amdgpu-load-store-vectorizer", cl::desc("Enable load store vectorizer"), cl::init(true), cl::Hidden)
static cl::opt< bool > EnableMaxIlpSchedStrategy("amdgpu-enable-max-ilp-scheduling-strategy", cl::desc("Enable scheduling strategy to maximize ILP for a single wave."), cl::Hidden, cl::init(false))
static bool mustPreserveGV(const GlobalValue &GV)
Predicate for Internalize pass.
static cl::opt< bool > EnableLoopPrefetch("amdgpu-loop-prefetch", cl::desc("Enable loop data prefetch on AMDGPU"), cl::Hidden, cl::init(false))
LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget()
static cl::opt< bool > EnableInsertSingleUseVDST("amdgpu-enable-single-use-vdst", cl::desc("Enable s_singleuse_vdst insertion"), cl::init(false), cl::Hidden)
static cl::opt< bool > RemoveIncompatibleFunctions("amdgpu-enable-remove-incompatible-functions", cl::Hidden, cl::desc("Enable removal of functions when they" "use features not supported by the target GPU"), cl::init(true))
static cl::opt< bool > EnableScalarIRPasses("amdgpu-scalar-ir-passes", cl::desc("Enable scalar IR passes"), cl::init(true), cl::Hidden)
static cl::opt< bool > EnableRegReassign("amdgpu-reassign-regs", cl::desc("Enable register reassign optimizations on gfx10+"), cl::init(true), cl::Hidden)
static cl::opt< bool > OptVGPRLiveRange("amdgpu-opt-vgpr-liverange", cl::desc("Enable VGPR liverange optimizations for if-else structure"), cl::init(true), cl::Hidden)
static ScheduleDAGInstrs * createSIMachineScheduler(MachineSchedContext *C)
static cl::opt< bool > EnablePreRAOptimizations("amdgpu-enable-pre-ra-optimizations", cl::desc("Enable Pre-RA optimizations pass"), cl::init(true), cl::Hidden)
static cl::opt< ScanOptions > AMDGPUAtomicOptimizerStrategy("amdgpu-atomic-optimizer-strategy", cl::desc("Select DPP or Iterative strategy for scan"), cl::init(ScanOptions::Iterative), cl::values(clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"), clEnumValN(ScanOptions::Iterative, "Iterative", "Use Iterative approach for scan"), clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")))
static cl::opt< bool > EnableVOPD("amdgpu-enable-vopd", cl::desc("Enable VOPD, dual issue of VALU in wave32"), cl::init(true), cl::Hidden)
static cl::opt< bool > EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden, cl::desc("Run early if-conversion"), cl::init(false))
static ScheduleDAGInstrs * createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C)
static MachineSchedRegistry GCNILPSchedRegistry("gcn-iterative-ilp", "Run GCN iterative scheduler for ILP scheduling (experimental)", createIterativeILPMachineScheduler)
static cl::opt< bool > ScalarizeGlobal("amdgpu-scalarize-global-loads", cl::desc("Enable global load scalarization"), cl::init(true), cl::Hidden)
static const char RegAllocOptNotSupportedMessage[]
static MachineSchedRegistry GCNMaxOccupancySchedRegistry("gcn-max-occupancy", "Run GCN scheduler to maximize occupancy", createGCNMaxOccupancyMachineScheduler)
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file declares the AMDGPU-specific subclass of TargetLoweringObjectFile.
This file a TargetTransformInfo::Concept conforming object specific to the AMDGPU target machine.
Provides passes to inlining "always_inline" functions.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
This header provides classes for managing passes over SCCs of the call graph.
Provides analysis for continuously CSEing during GISel passes.
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:693
#define LLVM_READNONE
Definition: Compiler.h:220
#define LLVM_EXTERNAL_VISIBILITY
Definition: Compiler.h:135
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
This file defines the class GCNIterativeScheduler, which uses an iterative approach to find a best sc...
This file provides the interface for LLVM's Global Value Numbering pass which eliminates fully redund...
AcceleratorCodeSelection - Identify all functions reachable from a kernel, removing those that are un...
This file declares the IRTranslator pass.
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
static std::string computeDataLayout()
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
const char LLVMTargetMachineRef TM
PassBuilder PB(Machine, PassOpts->PTO, std::nullopt, &PIC)
This header defines various interfaces for pass management in LLVM.
The AMDGPU TargetMachine interface definition for hw codegen targets.
Basic Register Allocator
This file describes the interface of the MachineFunctionPass responsible for assigning the generic vi...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
SI Machine Scheduler interface.
static FunctionPass * useDefaultRegisterAllocator()
-regalloc=... command line option.
Target-Independent Code Generator Pass Configuration Options pass.
static std::unique_ptr< TargetLoweringObjectFile > createTLOF()
A manager for alias analyses.
void registerFunctionAnalysis()
Register a specific AA result.
void addAAResult(AAResultT &AAResult)
Register a specific AA result.
Legacy wrapper pass to provide the AMDGPUAAResult object.
Analysis pass providing a never-invalidated alias analysis result.
AMDGPUTargetMachine & getAMDGPUTargetMachine() const
std::unique_ptr< CSEConfigBase > getCSEConfig() const override
Returns the CSEConfig object to use for the current optimization level.
ScheduleDAGInstrs * createMachineScheduler(MachineSchedContext *C) const override
Create an instance of ScheduleDAGInstrs to be run within the standard MachineScheduler pass for this ...
bool isPassEnabled(const cl::opt< bool > &Opt, CodeGenOptLevel Level=CodeGenOptLevel::Default) const
Check if a pass is enabled given Opt option.
bool addPreISel() override
Methods with trivial inline returns are convenient points in the common codegen pass pipeline where t...
bool addInstSelector() override
addInstSelector - This method should install an instruction selector pass, which converts from LLVM c...
bool addGCPasses() override
addGCPasses - Add late codegen passes that analyze code for garbage collection.
AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
void addIRPasses() override
Add common target configurable passes that perform LLVM IR to IR transforms following machine indepen...
void addCodeGenPrepare() override
Add pass to prepare the LLVM IR for code generation.
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
unsigned getAddressSpaceForPseudoSourceKind(unsigned Kind) const override
getAddressSpaceForPseudoSourceKind - Given the kind of memory (e.g.
const TargetSubtargetInfo * getSubtargetImpl() const
void registerDefaultAliasAnalyses(AAManager &) override
Allow the target to register alias analyses with the AAManager for use with the new pass manager.
std::pair< const Value *, unsigned > getPredicatedAddrSpace(const Value *V) const override
If the specified predicate checks whether a generic pointer falls within a specified address space,...
StringRef getFeatureString(const Function &F) const
AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM, CodeGenOptLevel OL)
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
void registerPassBuilderCallbacks(PassBuilder &PB, bool PopulateClassToPassNames) override
Allow the target to modify the pass pipeline.
StringRef getGPUName(const Function &F) const
unsigned getAssumedAddrSpace(const Value *V) const override
If the specified generic pointer could be assumed as a pointer to a specific address space,...
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:349
bool isValid() const
Return true if the attribute is any kind of attribute.
Definition: Attributes.h:193
Allocate memory in an ever growing pool, as if by bump-pointer.
Definition: Allocator.h:66
void removeDeadConstantUsers() const
If there are any dead constant users dangling off of this constant, remove them.
Definition: Constants.cpp:722
This pass is required by interprocedural register allocation.
Lightweight error class with error context and mandatory checking.
Definition: Error.h:160
Tagged union holding either a T or an Error.
Definition: Error.h:474
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:311
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:264
TargetTransformInfo getTargetTransformInfo(const Function &F) const override
Get a TargetTransformInfo implementation for the target.
void registerMachineRegisterInfoCallback(MachineFunction &MF) const override
bool parseMachineFunctionInfo(const yaml::MachineFunctionInfo &, PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange) const override
Parse out the target's MachineFunctionInfo from the YAML representation.
yaml::MachineFunctionInfo * convertFuncInfoToYAML(const MachineFunction &MF) const override
Allocate and initialize an instance of the YAML representation of the MachineFunctionInfo.
yaml::MachineFunctionInfo * createDefaultFuncInfoYAML() const override
Allocate and return a default initialized instance of the YAML representation for the MachineFunctionInfo.
TargetPassConfig * createPassConfig(PassManagerBase &PM) override
Create a pass configuration object to be used by addPassToEmitX methods for generating a pipeline of CodeGen passes.
GCNTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM, CodeGenOptLevel OL, bool JIT)
MachineFunctionInfo * createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F, const TargetSubtargetInfo *STI) const override
Create the target's instance of MachineFunctionInfo.
Pass to remove unused function declarations.
Definition: GlobalDCE.h:36
This pass is responsible for selecting generic machine instructions to target-specific instructions.
A pass that internalizes all functions and variables other than those that must be preserved according to the supplied preservation predicate.
Definition: Internalize.h:34
This class describes a target machine that is implemented with the LLVM target-independent code generator.
This pass implements the localization mechanism described at the top of this file.
Definition: Localizer.h:43
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do so.
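Its idiomatic use in this backend looks like the sketch below (assumes MF is a MachineFunction being compiled for an AMDGPU subtarget):
// Fetch the AMDGPU-specific per-function state attached to MF.
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();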
void addDelegate(Delegate *delegate)
MachineSchedRegistry provides a selection of available machine instruction schedulers.
This interface provides simple read-only access to a block of memory, and provides simple methods for reading files and standard input into a memory buffer.
Definition: MemoryBuffer.h:51
virtual StringRef getBufferIdentifier() const
Return an identifier for this buffer, typically the filename it was read from.
Definition: MemoryBuffer.h:76
static const OptimizationLevel O0
Disable as many optimizations as possible.
unsigned getSpeedupLevel() const
static const OptimizationLevel O1
Optimize quickly without destroying debuggability.
This class provides access to building LLVM's passes.
Definition: PassBuilder.h:104
void registerPipelineEarlySimplificationEPCallback(const std::function< void(ModulePassManager &, OptimizationLevel)> &C)
Register a callback for a default optimizer pipeline extension point.
Definition: PassBuilder.h:475
void registerPipelineStartEPCallback(const std::function< void(ModulePassManager &, OptimizationLevel)> &C)
Register a callback for a default optimizer pipeline extension point.
Definition: PassBuilder.h:466
void registerCGSCCOptimizerLateEPCallback(const std::function< void(CGSCCPassManager &, OptimizationLevel)> &C)
Register a callback for a default optimizer pipeline extension point.
Definition: PassBuilder.h:445
void registerFullLinkTimeOptimizationLastEPCallback(const std::function< void(ModulePassManager &, OptimizationLevel)> &C)
Register a callback for a default optimizer pipeline extension point.
Definition: PassBuilder.h:511
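A minimal sketch of registering one of these extension points (the helper name and the inserted pass are illustrative, not what this file actually registers):
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/IPO/GlobalDCE.h"
using namespace llvm;
void registerExampleCallbacks(PassBuilder &PB) { // hypothetical helper
  PB.registerPipelineStartEPCallback(
      [](ModulePassManager &MPM, OptimizationLevel Level) {
        if (Level != OptimizationLevel::O0)
          MPM.addPass(GlobalDCEPass()); // any module pass can be queued here
      });
}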
LLVM_ATTRIBUTE_MINSIZE void addPass(PassT &&Pass)
Definition: PassManager.h:249
PassRegistry - This class manages the registration and initialization of the pass subsystem as application startup begins.
Definition: PassRegistry.h:37
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at application startup.
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
MachineFunctionInfo * createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F, const TargetSubtargetInfo *STI) const override
Create the target's instance of MachineFunctionInfo.
RegisterPassParser class - Handle the addition of new machine passes.
RegisterRegAllocBase class - Track the registration of register allocators.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
bool initializeBaseYamlFields(const yaml::SIMachineFunctionInfo &YamlMFI, const MachineFunction &MF, PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange)
Instances of this class encapsulate one diagnostic report, allowing printing to a raw_ostream as a caret diagnostic.
Definition: SourceMgr.h:281
Represents a location in source code.
Definition: SMLoc.h:23
Represents a range in source code.
Definition: SMLoc.h:48
A ScheduleDAG for scheduling lists of MachineInstr.
ScheduleDAGMILive is an implementation of ScheduleDAGInstrs that schedules machine instructions while updating LiveIntervals and tracking register pressure.
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions according to the given MachineSchedStrategy.
void addMutation(std::unique_ptr< ScheduleDAGMutation > Mutation)
Add a postprocessing step to the DAG builder.
const TargetInstrInfo * TII
Target instruction information.
Definition: ScheduleDAG.h:557
const TargetRegisterInfo * TRI
Target processor register info.
Definition: ScheduleDAG.h:558
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better as a string (e.g. operator+ etc).
Definition: SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition: SmallString.h:68
unsigned getMainFileID() const
Definition: SourceMgr.h:132
const MemoryBuffer * getMemoryBuffer(unsigned i) const
Definition: SourceMgr.h:125
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:50
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:134
bool consume_front(StringRef Prefix)
Returns true if this StringRef has the given prefix and removes that prefix.
Definition: StringRef.h:631
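For example (the feature string is illustrative):
StringRef Feature("+wavefrontsize64");
if (Feature.consume_front("+")) {
  // Feature now refers to "wavefrontsize64"; Feature.empty() is false here.
}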
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
StringSwitch & Cases(StringLiteral S0, StringLiteral S1, T Value)
Definition: StringSwitch.h:90
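The three entries above compose like this (strings and mapping are illustrative only; Name is an assumed StringRef):
unsigned AS = StringSwitch<unsigned>(Name)
                  .Case("local", AMDGPUAS::LOCAL_ADDRESS)
                  .Cases("global", "flat-global", AMDGPUAS::GLOBAL_ADDRESS)
                  .Default(AMDGPUAS::FLAT_ADDRESS);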
Triple TargetTriple
Triple string, CPU name, and target feature strings the TargetMachine instance is created with.
Definition: TargetMachine.h:95
const Triple & getTargetTriple() const
const MCSubtargetInfo * getMCSubtargetInfo() const
StringRef getTargetFeatureString() const
StringRef getTargetCPU() const
std::unique_ptr< const MCSubtargetInfo > STI
void resetTargetOptions(const Function &F) const
Reset the target options based on the function's attributes.
std::unique_ptr< const MCRegisterInfo > MRI
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
Target-Independent Code Generator Pass Configuration Options.
LLVMTargetMachine * TM
virtual void addCodeGenPrepare()
Add pass to prepare the LLVM IR for code generation.
virtual bool addILPOpts()
Add passes that optimize instruction level parallelism for out-of-order targets.
virtual void addPostRegAlloc()
This method may be implemented by targets that want to run passes after the register allocation pass pipeline but before prolog-epilog insertion.
CodeGenOptLevel getOptLevel() const
virtual void addOptimizedRegAlloc()
addOptimizedRegAlloc - Add passes related to register allocation.
virtual void addIRPasses()
Add common target configurable passes that perform LLVM IR to IR transforms following machine independent optimization.
virtual void addFastRegAlloc()
addFastRegAlloc - Add the minimum set of target-independent passes that are required for fast register allocation.
virtual void addMachineSSAOptimization()
addMachineSSAOptimization - Add standard passes that optimize machine instructions in SSA form.
void disablePass(AnalysisID PassID)
Allow the target to disable a specific standard pass by default.
AnalysisID addPass(AnalysisID PassID)
Utilities for targets to add passes to the pass manager.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDesc objects that represent all of the machine registers that the target has.
TargetSubtargetInfo - Generic base class for all target subtargets.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition: Triple.h:361
LLVM Value Representation.
Definition: Value.h:74
bool use_empty() const
Definition: Value.h:344
PassManagerBase - An abstract interface to allow code to add passes to a pass manager without having to hard-code what kind of pass manager it is.
Interfaces for registering analysis passes, producing common pass manager configurations, and parsing of pass pipelines.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
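For reference, these enumerators carry the well-known AMDGPU numeric values used in IR addrspace(N) annotations; a compile-time check makes the mapping explicit:
static_assert(AMDGPUAS::FLAT_ADDRESS == 0 && AMDGPUAS::GLOBAL_ADDRESS == 1 &&
              AMDGPUAS::REGION_ADDRESS == 2 && AMDGPUAS::LOCAL_ADDRESS == 3 &&
              AMDGPUAS::CONSTANT_ADDRESS == 4 && AMDGPUAS::PRIVATE_ADDRESS == 5,
              "documented AMDGPU address-space numbering");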
bool isFlatGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:415
bool isEntryFunctionCC(CallingConv::ID CC)
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match() expression.
Definition: PatternMatch.h:839
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
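A sketch of how these matchers compose (V and X are assumed Value pointers; the pattern is chosen purely for illustration):
Value *X = nullptr;
// Matches 'and X, (xor X, -1)' with the operands in either order; m_Deferred
// reuses the X captured by m_Value earlier in the same match() expression.
bool AndOfNotSelf = match(V, m_c_And(m_Value(X), m_Not(m_Deferred(X))));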
@ ReallyHidden
Definition: CommandLine.h:139
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to the ValuesClass constructor.
Definition: CommandLine.h:718
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
LocationClass< Ty > location(Ty &L)
Definition: CommandLine.h:470
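A sketch combining these cl:: helpers (the option name, enum, and storage are hypothetical, not flags this file defines):
enum class ExampleMode { Off, Fast, Exact };
static ExampleMode Storage = ExampleMode::Off; // external storage, bound via cl::location
static cl::opt<ExampleMode, true> ExampleModeOpt(
    "amdgpu-example-mode", cl::desc("Illustrative option"),
    cl::location(Storage), cl::init(ExampleMode::Off), cl::ReallyHidden,
    cl::values(clEnumValN(ExampleMode::Off, "off", "Disabled"),
               clEnumValN(ExampleMode::Fast, "fast", "Fast variant"),
               clEnumValN(ExampleMode::Exact, "exact", "Exact variant")));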
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
FunctionPass * createFlattenCFGPass()
void initializeSIFormMemoryClausesPass(PassRegistry &)
char & SIPreAllocateWWMRegsID
FunctionPass * createFastRegisterAllocator()
FastRegisterAllocation Pass - This pass register allocates as fast as possible.
char & EarlyMachineLICMID
This pass performs loop invariant code motion on machine instructions.
ImmutablePass * createAMDGPUAAWrapperPass()
char & PostRAHazardRecognizerID
PostRAHazardRecognizer - This pass runs the post-ra hazard recognizer.
FunctionPass * createAMDGPUSetWavePriorityPass()
void initializeAMDGPUInsertSingleUseVDSTPass(PassRegistry &)
Pass * createLCSSAPass()
Definition: LCSSA.cpp:506
void initializeGCNCreateVOPDPass(PassRegistry &)
ModulePass * createAMDGPUOpenCLEnqueuedBlockLoweringPass()
char & GCNPreRAOptimizationsID
char & GCLoweringID
GCLowering Pass - Used by gc.root to perform its default lowering operations.
void initializeGCNPreRAOptimizationsPass(PassRegistry &)
Pass * createLoadStoreVectorizerPass()
Create a legacy pass manager instance of the LoadStoreVectorizer pass.
void initializeGCNRewritePartialRegUsesPass(llvm::PassRegistry &)
void initializeAMDGPUAttributorLegacyPass(PassRegistry &)
void initializeAMDGPUDAGToDAGISelPass(PassRegistry &)
char & SIPostRABundlerID
FunctionPass * createSIModeRegisterPass()
FunctionPass * createGreedyRegisterAllocator()
Greedy register allocation pass - This pass implements a global register allocator for optimized builds.
void initializeAMDGPUAAWrapperPassPass(PassRegistry &)
ModulePass * createAMDGPULowerBufferFatPointersPass()
void initializeR600ClauseMergePassPass(PassRegistry &)
void initializeSIModeRegisterPass(PassRegistry &)
ModulePass * createAMDGPUCtorDtorLoweringLegacyPass()
void initializeSIOptimizeVGPRLiveRangePass(PassRegistry &)
ModuleToFunctionPassAdaptor createModuleToFunctionPassAdaptor(FunctionPassT &&Pass, bool EagerlyInvalidate=false)
A function to deduce a function pass type and wrap it in the templated adaptor.
Definition: PassManager.h:916
void initializeAMDGPULateCodeGenPreparePass(PassRegistry &)
void initializeAMDGPURewriteUndefForPHILegacyPass(PassRegistry &)
FunctionPass * createAMDGPUPreLegalizeCombiner(bool IsOptNone)
char & GCNRewritePartialRegUsesID
FunctionPass * createAMDGPUPostLegalizeCombiner(bool IsOptNone)
void initializeAMDGPUAnnotateUniformValuesPass(PassRegistry &)
std::error_code inconvertibleErrorCode()
The value returned by this function can be returned from convertToErrorCode for Error values where no sensible translation to std::error_code exists.
Definition: Error.cpp:90
void initializeSIShrinkInstructionsPass(PassRegistry &)
char & SIFoldOperandsID
void initializeGCNPreRALongBranchRegPass(PassRegistry &)
char & SILoadStoreOptimizerID
std::unique_ptr< ScheduleDAGMutation > createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase)
Phase specifies whether or not this is a reentry into the IGroupLPDAGMutation.
FunctionPass * createNaryReassociatePass()
char & PatchableFunctionID
This pass implements the "patchable-function" attribute.
char & PostRASchedulerID
PostRAScheduler - This pass performs post register allocation scheduling.
void initializeR600ExpandSpecialInstrsPassPass(PassRegistry &)
void initializeR600PacketizerPass(PassRegistry &)
std::unique_ptr< ScheduleDAGMutation > createVOPDPairingMutation()
ModulePass * createAMDGPUAlwaysInlinePass(bool GlobalOpt=true)
void initializeSIPreEmitPeepholePass(PassRegistry &)
char & SILowerWWMCopiesID
void initializeSIFixVGPRCopiesPass(PassRegistry &)
void initializeAMDGPUGlobalISelDivergenceLoweringPass(PassRegistry &)
std::unique_ptr< CSEConfigBase > getStandardCSEConfigForOpt(CodeGenOptLevel Level)
Definition: CSEInfo.cpp:79
Target & getTheR600Target()
The target for R600 GPUs.
char & MachineSchedulerID
MachineScheduler - This pass schedules machine instructions.
Pass * createStructurizeCFGPass(bool SkipUniformRegions=false)
When SkipUniformRegions is true the structurizer will not structurize regions that only contain uniform branches.
void initializeAMDGPURemoveIncompatibleFunctionsPass(PassRegistry &)
void initializeSILowerWWMCopiesPass(PassRegistry &)
void initializeGCNNSAReassignPass(PassRegistry &)
char & PostMachineSchedulerID
PostMachineScheduler - This pass schedules machine instructions postRA.
void initializeSIInsertWaitcntsPass(PassRegistry &)
char & AMDGPUInsertSingleUseVDSTID
Pass * createLICMPass()
Definition: LICM.cpp:379
ScheduleDAGMILive * createGenericSchedLive(MachineSchedContext *C)
Create the standard converging machine scheduler.
char & SIFormMemoryClausesID
void initializeAMDGPULowerModuleLDSLegacyPass(PassRegistry &)
void initializeAMDGPUCtorDtorLoweringLegacyPass(PassRegistry &)
void initializeAMDGPURegBankCombinerPass(PassRegistry &)
void initializeSILoadStoreOptimizerPass(PassRegistry &)
void initializeSILateBranchLoweringPass(PassRegistry &)
void initializeSIPeepholeSDWAPass(PassRegistry &)
char & AMDGPUUnifyDivergentExitNodesID
FunctionPass * createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy)
char & ShadowStackGCLoweringID
ShadowStackGCLowering - Implements the custom lowering mechanism used by the shadow stack GC.
char & GCNNSAReassignID
void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &)
void initializeAMDGPUExternalAAWrapperPass(PassRegistry &)
void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &)
char & AMDGPUPerfHintAnalysisID
char & SILowerSGPRSpillsID
CodeModel::Model getEffectiveCodeModel(std::optional< CodeModel::Model > CM, CodeModel::Model Default)
Helper method for getting the code model, returning Default if CM does not have a value.
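For example (a sketch of the fallback behavior):
// No explicit code model was supplied, so the Default argument is returned.
CodeModel::Model CM = getEffectiveCodeModel(std::nullopt, CodeModel::Small);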
char & SILateBranchLoweringPassID
char & BranchRelaxationPassID
BranchRelaxation - This pass replaces branches that need to jump further than is supported by a branch instruction.
FunctionPass * createSinkingPass()
Definition: Sink.cpp:277
CGSCCToFunctionPassAdaptor createCGSCCToFunctionPassAdaptor(FunctionPassT &&Pass, bool EagerlyInvalidate=false, bool NoRerun=false)
A function to deduce a function pass type and wrap it in the templated adaptor.
FunctionPass * createSIShrinkInstructionsPass()
void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &)
void initializeSIPostRABundlerPass(PassRegistry &)
void initializeAMDGPUPromoteAllocaToVectorPass(PassRegistry &)
Pass * createAMDGPUAttributorLegacyPass()
void initializeSIWholeQuadModePass(PassRegistry &)
std::unique_ptr< ScheduleDAGMutation > createStoreClusterDAGMutation(const TargetInstrInfo *TII, const TargetRegisterInfo *TRI, bool ReorderWhileClustering=false)
If ReorderWhileClustering is set to true, no attempt will be made to reduce reordering due to store clustering.
FunctionPass * createLoopDataPrefetchPass()
FunctionPass * createAMDGPULowerKernelArgumentsPass()
char & AMDGPUInsertDelayAluID
Pass * createAMDGPUAnnotateKernelFeaturesPass()
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
std::unique_ptr< ScheduleDAGMutation > createAMDGPUMacroFusionDAGMutation()
Note that you have to add: DAG.addMutation(createAMDGPUMacroFusionDAGMutation()); to AMDGPUPassConfig::createMachineScheduler() to have an effect.
char & StackMapLivenessID
StackMapLiveness - This pass analyses the register live-out set of stackmap/patchpoint intrinsics and attaches the calculated information to the intrinsics for later emission to the StackMap.
char & SIOptimizeVGPRLiveRangeID
FunctionPass * createUnifyLoopExitsPass()
char & SIOptimizeExecMaskingPreRAID
FunctionPass * createFixIrreduciblePass()
char & FuncletLayoutID
This pass lays out funclets contiguously.
void initializeSIInsertHardClausesPass(PassRegistry &)
char & DetectDeadLanesID
This pass adds dead/undef flags after analyzing subregister lanes.
void initializeAMDGPUPostLegalizerCombinerPass(PassRegistry &)
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
void initializeSIAnnotateControlFlowPass(PassRegistry &)
ModulePass * createAMDGPUPrintfRuntimeBinding()
void initializeSIMemoryLegalizerPass(PassRegistry &)
Pass * createAlwaysInlinerLegacyPass(bool InsertLifetime=true)
Create a legacy pass manager instance of a pass to inline and remove functions marked as "always_inline".
void initializeR600ControlFlowFinalizerPass(PassRegistry &)
void initializeAMDGPUImageIntrinsicOptimizerPass(PassRegistry &)
FunctionPass * createAMDGPUAnnotateUniformValues()
ModulePass * createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM=nullptr)
void initializeAMDGPUPreLegalizerCombinerPass(PassRegistry &)
FunctionPass * createAMDGPUPromoteAlloca()
FunctionPass * createSeparateConstOffsetFromGEPPass(bool LowerGEP=false)
char & EarlyIfConverterID
EarlyIfConverter - This pass performs if-conversion on SSA form by inserting cmov instructions.
char & SIPreEmitPeepholeID
ModulePass * createAMDGPURemoveIncompatibleFunctionsPass(const TargetMachine *)
FunctionPass * createSILowerI1CopiesPass()
void initializeGCNRegPressurePrinterPass(PassRegistry &)
void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &)
FunctionPass * createBasicRegisterAllocator()
BasicRegisterAllocation Pass - This pass implements a degenerate global register allocator using the basic regalloc framework.
void initializeGlobalISel(PassRegistry &)
Initialize all passes linked into the GlobalISel library.
Definition: GlobalISel.cpp:17
void initializeSIPreAllocateWWMRegsPass(PassRegistry &)
ModulePass * createR600OpenCLImageTypeLoweringPass()
FunctionPass * createAMDGPUCodeGenPreparePass()
FunctionPass * createAMDGPUISelDag(TargetMachine &TM, CodeGenOptLevel OptLevel)
This pass converts a legalized DAG into an AMDGPU-specific DAG, ready for instruction scheduling.
Target & getTheGCNTarget()
The target for GCN GPUs.
void initializeAMDGPUAtomicOptimizerPass(PassRegistry &)
char & MachineCSEID
MachineCSE - This pass performs global CSE on machine instructions.
Definition: MachineCSE.cpp:165
char & GCNDPPCombineID
FunctionPass * createAMDGPURegBankCombiner(bool IsOptNone)
char & SIWholeQuadModeID
std::unique_ptr< ScheduleDAGMutation > createLoadClusterDAGMutation(const TargetInstrInfo *TII, const TargetRegisterInfo *TRI, bool ReorderWhileClustering=false)
If ReorderWhileClustering is set to true, no attempt will be made to reduce reordering due to load clustering.
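These mutation factories are typically wired into the scheduler as in the sketch below (the pattern they are designed for, not the verbatim body of this file's createMachineScheduler):
ScheduleDAGMILive *createExampleScheduler(MachineSchedContext *C) { // hypothetical
  ScheduleDAGMILive *DAG = createGenericSchedLive(C);
  // Cluster neighboring memory operations so they can issue back to back.
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}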
void initializeSIOptimizeExecMaskingPreRAPass(PassRegistry &)
void initializeAMDGPUMarkLastScratchLoadPass(PassRegistry &)
char & LiveVariablesID
LiveVariables pass - This pass computes the set of blocks in which each variable is live and sets machine operand kill flags.
void initializeAMDGPUCodeGenPreparePass(PassRegistry &)
FunctionPass * createGVNPass(bool NoMemDepAnalysis=false)
Create a legacy GVN pass.
Definition: GVN.cpp:3339
FunctionPass * createAMDGPURewriteUndefForPHILegacyPass()
void call_once(once_flag &flag, Function &&F, Args &&... ArgList)
Execute the function specified as a parameter once.
Definition: Threading.h:87
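The one-time initialization idiom these entries describe (a minimal sketch; the flag name is hypothetical):
static llvm::once_flag ExampleInitFlag;
llvm::call_once(ExampleInitFlag, [] {
  // This body runs exactly once, even if reached from multiple threads.
});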
void initializeSILowerSGPRSpillsPass(PassRegistry &)
void initializeAMDGPULowerKernelAttributesPass(PassRegistry &)
char & SIInsertHardClausesID
FunctionPass * createAMDGPUMachineCFGStructurizerPass()
void initializeAMDGPUResourceUsageAnalysisPass(PassRegistry &)
void initializeSIFixSGPRCopiesPass(PassRegistry &)
char & GCNCreateVOPDID
FunctionPass * createInferAddressSpacesPass(unsigned AddressSpace=~0u)
char & VirtRegRewriterID
VirtRegRewriter pass.
Definition: VirtRegMap.cpp:227
void initializeSILowerI1CopiesPass(PassRegistry &)
char & SILowerControlFlowID
FunctionPass * createLowerSwitchPass()
FunctionPass * createVirtRegRewriter(bool ClearVirtRegs=true)
Definition: VirtRegMap.cpp:645
void initializeR600VectorRegMergerPass(PassRegistry &)
ImmutablePass * createExternalAAWrapperPass(std::function< void(Pass &, Function &, AAResults &)> Callback)
A wrapper pass around a callback which can be used to populate the AAResults in the AAResultsWrapperPass from an external AA.
void initializeSIOptimizeExecMaskingPass(PassRegistry &)
FunctionPass * createAMDGPUGlobalISelDivergenceLoweringPass()
FunctionPass * createSIMemoryLegalizerPass()
void initializeSIFoldOperandsPass(PassRegistry &)
void initializeSILowerControlFlowPass(PassRegistry &)
char & SIPeepholeSDWAID
char & SIFixVGPRCopiesID
char & TwoAddressInstructionPassID
TwoAddressInstruction - This pass reduces two-address instructions to use two operands.
void initializeAMDGPURegBankSelectPass(PassRegistry &)
FunctionPass * createAtomicExpandLegacyPass()
AtomicExpandPass - At IR level this pass replaces atomic instructions with __atomic_* library calls, or target-specific instructions that implement the same semantics in a way that better fits the target backend.
MCRegisterInfo * createGCNMCRegisterInfo(AMDGPUDwarfFlavour DwarfFlavour)
FunctionPass * createStraightLineStrengthReducePass()
FunctionPass * createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *)
void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry &)
void initializeAMDGPULowerBufferFatPointersPass(PassRegistry &)
FunctionPass * createSIInsertWaitcntsPass()
FunctionPass * createEarlyCSEPass(bool UseMemorySSA=false)
Definition: EarlyCSE.cpp:1932
void initializeGCNDPPCombinePass(PassRegistry &)
char & PHIEliminationID
PHIElimination - This pass eliminates machine instruction PHI nodes by inserting copy instructions.
bool parseNamedRegisterReference(PerFunctionMIParsingState &PFS, Register &Reg, StringRef Src, SMDiagnostic &Error)
Definition: MIParser.cpp:3631
FunctionPass * createAMDGPULateCodeGenPreparePass()
char & AMDGPUMarkLastScratchLoadID
char & RenameIndependentSubregsID
This pass detects subregister lanes in a virtual register that are used independently of other lanes and splits them into separate virtual registers.
std::unique_ptr< ScheduleDAGMutation > createAMDGPUExportClusteringDAGMutation()
void initializeAMDGPUPrintfRuntimeBindingPass(PassRegistry &)
void initializeAMDGPUPromoteAllocaPass(PassRegistry &)
void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &)
void initializeAMDGPUInsertDelayAluPass(PassRegistry &)
char & SIOptimizeExecMaskingID
void initializeAMDGPUUnifyMetadataPass(PassRegistry &)
char & SIFixSGPRCopiesID
FunctionPass * createSIAnnotateControlFlowPass()
Create the annotation pass.
void initializeAMDGPUAlwaysInlinePass(PassRegistry &)
char & DeadMachineInstructionElimID
DeadMachineInstructionElim - This pass removes dead machine instructions.
char & GCNPreRALongBranchRegID
void initializeAMDGPUPromoteKernelArgumentsPass(PassRegistry &)
#define N
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environment.
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ IEEE
IEEE-754 denormal numbers preserved.
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environment.
MachineFunctionInfo - This class can be derived from and used by targets to hold private target-specific information for each MachineFunction.
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instantiate a scheduler.
RegisterTargetMachine - Helper template for registering a target machine implementation, for use in the target machine initialization function.
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise, pass the NaN through.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions.
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs per IEEE 754-2008.
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
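A sketch of populating these fields (assumes SIModeRegisterDefaults is default-constructible as in the in-tree header; the DenormalMode helpers come from llvm/ADT/FloatingPointMode.h):
SIModeRegisterDefaults Mode;
Mode.IEEE = true;                                         // quiet signaling NaNs, gather flags
Mode.DX10Clamp = true;                                    // clamp NaN to zero in clamps
Mode.FP32Denormals = DenormalMode::getIEEE();             // preserve f32 denormals
Mode.FP64FP16Denormals = DenormalMode::getPreserveSign(); // flush, keeping the sign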
The llvm::once_flag structure.
Definition: Threading.h:68
Targets should override this in a way that mirrors the implementation of llvm::MachineFunctionInfo.
SmallVector< StringValue > WWMReservedRegs
std::optional< SIArgumentInfo > ArgInfo
A wrapper around std::string which contains a source range that's being set during parsing.