//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for SI+ GPUs.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUIGroupLP.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPURegBankSelect.h"
#include "GCNSchedStrategy.h"
#include "GCNVOPDUtils.h"
#include "R600.h"
#include "R600TargetMachine.h"
#include "SIMachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/IPO.h"
#include <optional>

using namespace llvm;
using namespace llvm::PatternMatch;
namespace {
class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
public:
  SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};

class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
public:
  VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};

static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
                              const TargetRegisterClass &RC) {
  return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
}

static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
                              const TargetRegisterClass &RC) {
  return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
}

/// -{sgpr|vgpr}-regalloc=... command line option.
static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }

/// A dummy default pass factory indicates whether the register allocator is
/// overridden on the command line.
static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;

static SGPRRegisterRegAlloc
defaultSGPRRegAlloc("default",
                    "pick SGPR register allocator based on -O option",
                    useDefaultRegisterAllocator);

static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<SGPRRegisterRegAlloc>>
SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
             cl::desc("Register allocator to use for SGPRs"));

static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<VGPRRegisterRegAlloc>>
VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
             cl::desc("Register allocator to use for VGPRs"));

static void initializeDefaultSGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = SGPRRegAlloc;
    SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
  }
}

static void initializeDefaultVGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = VGPRRegAlloc;
    VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
  }
}

static FunctionPass *createBasicSGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createGreedySGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createFastSGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

static FunctionPass *createBasicVGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createGreedyVGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createFastVGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateVGPRs, true);
}

static SGPRRegisterRegAlloc basicRegAllocSGPR(
    "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
static SGPRRegisterRegAlloc greedyRegAllocSGPR(
    "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);

static SGPRRegisterRegAlloc fastRegAllocSGPR(
    "fast", "fast register allocator", createFastSGPRRegisterAllocator);

static VGPRRegisterRegAlloc basicRegAllocVGPR(
    "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
static VGPRRegisterRegAlloc greedyRegAllocVGPR(
    "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);

static VGPRRegisterRegAlloc fastRegAllocVGPR(
    "fast", "fast register allocator", createFastVGPRRegisterAllocator);
} // anonymous namespace

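// Illustration of how the two allocator options compose (hypothetical
// invocation, not part of the original file): the SGPR and VGPR allocators
// are chosen independently, e.g.
//   llc -mtriple=amdgcn-amd-amdhsa -sgpr-regalloc=greedy -vgpr-regalloc=fast
// "default" defers to the -O level; the generic -regalloc option is rejected
// for amdgcn (see RegAllocOptNotSupportedMessage below).
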
static cl::opt<bool>
EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
                        cl::desc("Run early if-conversion"),
                        cl::init(false));

static cl::opt<bool>
OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
                 cl::desc("Run pre-RA exec mask optimizations"),
                 cl::init(true));

static cl::opt<bool>
    LowerCtorDtor("amdgpu-lower-global-ctor-dtor",
                  cl::desc("Lower GPU ctor / dtors to globals on the device."),
                  cl::init(true), cl::Hidden);

// Option to disable vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
    "amdgpu-load-store-vectorizer",
    cl::desc("Enable load store vectorizer"),
    cl::init(true),
    cl::Hidden);

// Option to control global loads scalarization
static cl::opt<bool> ScalarizeGlobal(
    "amdgpu-scalarize-global-loads",
    cl::desc("Enable global load scalarization"),
    cl::init(true),
    cl::Hidden);

// Option to run internalize pass.
static cl::opt<bool> InternalizeSymbols(
    "amdgpu-internalize-symbols",
    cl::desc("Enable elimination of non-kernel functions and unused globals"),
    cl::init(false),
    cl::Hidden);

// Option to inline all early.
static cl::opt<bool> EarlyInlineAll(
    "amdgpu-early-inline-all",
    cl::desc("Inline all functions early"),
    cl::init(false),
    cl::Hidden);

static cl::opt<bool> RemoveIncompatibleFunctions(
    "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
    cl::desc("Enable removal of functions when they "
             "use features not supported by the target GPU"),
    cl::init(true));

static cl::opt<bool> EnableSDWAPeephole(
    "amdgpu-sdwa-peephole",
    cl::desc("Enable SDWA peepholer"),
    cl::init(true));

static cl::opt<bool> EnableDPPCombine(
    "amdgpu-dpp-combine",
    cl::desc("Enable DPP combiner"),
    cl::init(true));

// Enable address space based alias analysis.
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
                                               cl::desc("Enable AMDGPU Alias Analysis"),
                                               cl::init(true));

// Option to run late CFG structurizer.
static cl::opt<bool, true> LateCFGStructurize(
    "amdgpu-late-structurize",
    cl::desc("Enable late CFG structurization"),
    cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
    cl::Hidden);

// Disable structurizer-based control-flow lowering in order to test convergence
// control tokens. This should eventually be replaced by the wave-transform.
static cl::opt<bool, true> DisableStructurizer(
    "amdgpu-disable-structurizer",
    cl::desc("Disable structurizer for experiments; produces unusable code"),
    cl::location(AMDGPUTargetMachine::DisableStructurizer), cl::ReallyHidden);

// Enable lib calls simplifications.
static cl::opt<bool> EnableLibCallSimplify(
    "amdgpu-simplify-libcall",
    cl::desc("Enable amdgpu library simplifications"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableLowerKernelArguments(
    "amdgpu-ir-lower-kernel-arguments",
    cl::desc("Lower kernel argument loads in IR pass"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableRegReassign(
    "amdgpu-reassign-regs",
    cl::desc("Enable register reassign optimizations on gfx10+"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> OptVGPRLiveRange(
    "amdgpu-opt-vgpr-liverange",
    cl::desc("Enable VGPR liverange optimizations for if-else structure"),
    cl::init(true), cl::Hidden);

static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
    "amdgpu-atomic-optimizer-strategy",
    cl::desc("Select DPP or Iterative strategy for scan"),
    cl::init(ScanOptions::Iterative),
    cl::values(
        clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"),
        clEnumValN(ScanOptions::Iterative, "Iterative",
                   "Use Iterative approach for scan"),
        clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));

// Enable Mode register optimization.
static cl::opt<bool> EnableSIModeRegisterPass(
    "amdgpu-mode-register",
    cl::desc("Enable mode register pass"),
    cl::init(true),
    cl::Hidden);

// Enable GFX11.5+ s_singleuse_vdst insertion
static cl::opt<bool>
    EnableInsertSingleUseVDST("amdgpu-enable-single-use-vdst",
                              cl::desc("Enable s_singleuse_vdst insertion"),
                              cl::init(false), cl::Hidden);

// Enable GFX11+ s_delay_alu insertion
static cl::opt<bool>
    EnableInsertDelayAlu("amdgpu-enable-delay-alu",
                         cl::desc("Enable s_delay_alu insertion"),
                         cl::init(true), cl::Hidden);

// Enable GFX11+ VOPD
static cl::opt<bool>
    EnableVOPD("amdgpu-enable-vopd",
               cl::desc("Enable VOPD, dual issue of VALU in wave32"),
               cl::init(true), cl::Hidden);

// Option is used in lit tests to prevent deadcoding of patterns inspected.
static cl::opt<bool>
EnableDCEInRA("amdgpu-dce-in-ra",
              cl::init(true), cl::Hidden,
              cl::desc("Enable machine DCE inside regalloc"));

static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
                                           cl::desc("Adjust wave priority"),
                                           cl::init(false), cl::Hidden);

static cl::opt<bool> EnableScalarIRPasses(
    "amdgpu-scalar-ir-passes",
    cl::desc("Enable scalar IR passes"),
    cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableStructurizerWorkarounds(
    "amdgpu-enable-structurizer-workarounds",
    cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool, true> EnableLowerModuleLDS(
    "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
    cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePreRAOptimizations(
    "amdgpu-enable-pre-ra-optimizations",
    cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePromoteKernelArguments(
    "amdgpu-enable-promote-kernel-arguments",
    cl::desc("Enable promotion of flat kernel pointer arguments to global"),
    cl::Hidden, cl::init(true));

static cl::opt<bool> EnableImageIntrinsicOptimizer(
    "amdgpu-enable-image-intrinsic-optimizer",
    cl::desc("Enable image intrinsic optimizer pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool>
    EnableLoopPrefetch("amdgpu-loop-prefetch",
                       cl::desc("Enable loop data prefetch on AMDGPU"),
                       cl::Hidden, cl::init(false));

static cl::opt<bool> EnableMaxIlpSchedStrategy(
    "amdgpu-enable-max-ilp-scheduling-strategy",
    cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
    cl::Hidden, cl::init(false));

static cl::opt<bool> EnableRewritePartialRegUses(
    "amdgpu-enable-rewrite-partial-reg-uses",
    cl::desc("Enable rewrite partial reg uses pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableHipStdPar(
    "amdgpu-enable-hipstdpar",
    cl::desc("Enable HIP Standard Parallelism Offload support"), cl::init(false),
    cl::Hidden);

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
  // Register the target
  RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());

  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeGlobalISel(*PR);
  // The remaining per-pass initialize<Pass>(*PR) calls for the AMDGPU
  // backend are elided in this listing.
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return std::make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG =
    new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
      new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
  return DAG;
}

static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  auto DAG = new GCNIterativeScheduler(
      C, GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
  return new GCNIterativeScheduler(C,
                                   GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}

static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  auto DAG = new GCNIterativeScheduler(C, GCNIterativeScheduler::SCHEDULE_ILP);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);

static MachineSchedRegistry
GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                             "Run GCN scheduler to maximize occupancy",
                             createGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
    GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
                           createGCNMaxILPMachineScheduler);

static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
    "gcn-iterative-max-occupancy-experimental",
    "Run GCN scheduler to maximize occupancy (experimental)",
    createIterativeGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry GCNMinRegSchedRegistry(
    "gcn-iterative-minreg",
    "Run GCN iterative scheduler for minimal register usage (experimental)",
    createMinRegScheduler);

static MachineSchedRegistry GCNILPSchedRegistry(
    "gcn-iterative-ilp",
    "Run GCN iterative scheduler for ILP scheduling (experimental)",
    createIterativeILPMachineScheduler);

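// Sketch of selecting one of the schedulers registered above through the
// generic machine-scheduler registry (hypothetical invocation; the keys are
// the registry strings above):
//   llc -mtriple=amdgcn-amd-amdhsa -misched=gcn-max-ilp foo.ll
// Without -misched, GCNPassConfig::createMachineScheduler below picks the
// max-occupancy scheduler (or the SI scheduler if the subtarget enables it).
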
static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat. 160-bit non-integral fat buffer pointers that include a 128-bit
  // buffer descriptor and a 32-bit offset, which are indexed by 32-bit values
  // (address space 7), and 128-bit non-integral buffer resources (address
  // space 8) which cannot be non-trivially accessed by LLVM memory operations
  // like getelementptr.
  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
         "-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-"
         "v32:32-v48:64-v96:"
         "128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-"
         "G1-ni:7:8:9";
}

static LLVM_READNONE StringRef getGPUOrDefault(const Triple &TT,
                                               StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  // Need to default to a target with flat support for HSA.
  if (TT.getArch() == Triple::amdgcn)
    return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";

  return "r600";
}

static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         const TargetOptions &Options,
                                         std::optional<Reloc::Model> RM,
                                         std::optional<CodeModel::Model> CM,
                                         CodeGenOptLevel OptLevel)
    : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                        FS, Options, getEffectiveRelocModel(RM),
                        getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
      TLOF(createTLOF(getTargetTriple())) {
  initAsmInfo();
  if (TT.getArch() == Triple::amdgcn) {
    if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
    else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
  }
}

bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
bool AMDGPUTargetMachine::EnableFunctionCalls = false;
bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;
bool AMDGPUTargetMachine::DisableStructurizer = false;

AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;

StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.isValid() ? FSAttr.getValueAsString()
                          : getTargetFeatureString();
}

/// Predicate for Internalize pass.
static bool mustPreserveGV(const GlobalValue &GV) {
  if (const Function *F = dyn_cast<Function>(&GV))
    return F->isDeclaration() || F->getName().starts_with("__asan_") ||
           F->getName().starts_with("__sanitizer_") ||
           AMDGPU::isEntryFunctionCC(F->getCallingConv());

  GV.removeDeadConstantUsers();
  return !GV.use_empty();
}

void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
  AAM.registerFunctionAnalysis<AMDGPUAA>();
}

static Expected<ScanOptions>
parseAMDGPUAtomicOptimizerStrategy(StringRef Params) {
  if (Params.empty())
    return ScanOptions::Iterative;
  Params.consume_front("strategy=");
  auto Result = StringSwitch<std::optional<ScanOptions>>(Params)
                    .Case("dpp", ScanOptions::DPP)
                    .Cases("iterative", "", ScanOptions::Iterative)
                    .Case("none", ScanOptions::None)
                    .Default(std::nullopt);
  if (Result)
    return *Result;
  return make_error<StringError>("invalid parameter", inconvertibleErrorCode());
}

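// Sketch of driving the parser above from the new pass manager, assuming the
// pass is registered as "amdgpu-atomic-optimizer" in AMDGPUPassRegistry.def
// (hypothetical invocation):
//   opt -passes='amdgpu-atomic-optimizer<strategy=dpp>' -S foo.ll
// An empty parameter list yields ScanOptions::Iterative, matching the
// cl::init of -amdgpu-atomic-optimizer-strategy above.
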
void AMDGPUTargetMachine::registerPassBuilderCallbacks(
    PassBuilder &PB, bool PopulateClassToPassNames) {

#define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
#include "llvm/Passes/TargetPassRegistry.inc"

  PB.registerPipelineStartEPCallback(
      [](ModulePassManager &PM, OptimizationLevel Level) {
        FunctionPassManager FPM;
        PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
        if (EnableHipStdPar)
          PM.addPass(HipStdParAcceleratorCodeSelectionPass());
      });

  PB.registerPipelineEarlySimplificationEPCallback(
      [](ModulePassManager &PM, OptimizationLevel Level) {
        PM.addPass(AMDGPUPrintfRuntimeBindingPass());

        if (Level == OptimizationLevel::O0)
          return;

        PM.addPass(AMDGPUUnifyMetadataPass());

        if (InternalizeSymbols) {
          PM.addPass(InternalizePass(mustPreserveGV));
          PM.addPass(GlobalDCEPass());
        }

        if (EarlyInlineAll && !EnableFunctionCalls)
          PM.addPass(AMDGPUAlwaysInlinePass());
      });

  PB.registerPeepholeEPCallback(
      [](FunctionPassManager &FPM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FPM.addPass(AMDGPUUseNativeCallsPass());
        if (EnableLibCallSimplify)
          FPM.addPass(AMDGPUSimplifyLibCallsPass());
      });

  PB.registerCGSCCOptimizerLateEPCallback(
      [this](CGSCCPassManager &PM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FunctionPassManager FPM;

        // Add promote kernel arguments pass to the opt pipeline right before
        // infer address spaces which is needed to do actual address space
        // rewriting.
        if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
            EnablePromoteKernelArguments)
          FPM.addPass(AMDGPUPromoteKernelArgumentsPass());

        // Add infer address spaces pass to the opt pipeline after inlining
        // but before SROA to increase SROA opportunities.
        FPM.addPass(InferAddressSpacesPass());

        // This should run after inlining to have any chance of doing
        // anything, and before other cleanup optimizations.
        FPM.addPass(AMDGPULowerKernelAttributesPass());

        if (Level != OptimizationLevel::O0) {
          // Promote alloca to vector before SROA and loop unroll. If we
          // manage to eliminate allocas before unroll we may choose to unroll
          // less.
          FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
        }

        PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
      });

  PB.registerFullLinkTimeOptimizationLastEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        // We want to support the -lto-partitions=N option as "best effort".
        // For that, we need to lower LDS earlier in the pipeline before the
        // module is partitioned for codegen.
        if (EnableLowerModuleLDS)
          PM.addPass(AMDGPULowerModuleLDSPass(*this));
      });
}

int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
  return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
          AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
          AddrSpace == AMDGPUAS::REGION_ADDRESS)
             ? -1
             : 0;
}

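// Worked example of the convention above (illustrative IR, not from this
// file): in the 32-bit private address space,
//   %cmp = icmp eq ptr addrspace(5) %p, null
// compares %p against the all-ones value 0xFFFFFFFF rather than 0, because
// offset 0 is a valid private (scratch) address.
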
bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
                                              unsigned DestAS) const {
  return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
         AMDGPU::isFlatGlobalAddrSpace(DestAS);
}

unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
  const auto *LD = dyn_cast<LoadInst>(V);
  if (!LD)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;

  // It must be a generic pointer loaded.
  assert(V->getType()->isPointerTy() &&
         V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);

  const auto *Ptr = LD->getPointerOperand();
  if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
  // For a generic pointer loaded from the constant memory, it could be assumed
  // as a global pointer since the constant memory is only populated on the
  // host side. As implied by the offload programming model, only global
  // pointers could be referenced on the host side.
  return AMDGPUAS::GLOBAL_ADDRESS;
}

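// Worked example for the rule above (illustrative IR): a generic pointer
// loaded from constant memory,
//   %p = load ptr, ptr addrspace(4) %kernarg
// may be assumed to point to global memory, letting passes such as
// InferAddressSpaces treat %p as if it were ptr addrspace(1).
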
std::pair<const Value *, unsigned>
AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
  if (auto *II = dyn_cast<IntrinsicInst>(V)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_is_shared:
      return std::pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
    case Intrinsic::amdgcn_is_private:
      return std::pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
    default:
      break;
    }
    return std::pair(nullptr, -1);
  }
  // Check the global pointer predication based on
  // (!is_shared(p) && !is_private(p)). Note that logic 'and' is commutative and
  // the order of 'is_shared' and 'is_private' is not significant.
  Value *Ptr;
  if (match(
          const_cast<Value *>(V),
          m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
                  m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
                      m_Deferred(Ptr))))))
    return std::pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);

  return std::pair(nullptr, -1);
}

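// The pattern matched above corresponds to IR of the following shape
// (illustrative; the value names are made up):
//   %s  = call i1 @llvm.amdgcn.is.shared(ptr %p)
//   %ns = xor i1 %s, true
//   %pr = call i1 @llvm.amdgcn.is.private(ptr %p)
//   %np = xor i1 %pr, true
//   %g  = and i1 %ns, %np   ; %p may be assumed global where %g holds
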
unsigned
AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
  switch (Kind) {
  case PseudoSourceValue::Stack:
  case PseudoSourceValue::FixedStack:
    return AMDGPUAS::PRIVATE_ADDRESS;
  case PseudoSourceValue::ConstantPool:
  case PseudoSourceValue::GOT:
  case PseudoSourceValue::JumpTable:
  case PseudoSourceValue::GlobalValueCallEntry:
  case PseudoSourceValue::ExternalSymbolCallEntry:
    return AMDGPUAS::CONSTANT_ADDRESS;
  }
  return AMDGPUAS::FLAT_ADDRESS;
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   const TargetOptions &Options,
                                   std::optional<Reloc::Model> RM,
                                   std::optional<CodeModel::Model> CM,
                                   CodeGenOptLevel OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const TargetSubtargetInfo *
GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
  }

  I->setScalarizeGlobalBehavior(ScalarizeGlobal);

  return I.get();
}

TargetTransformInfo
GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
  return TargetTransformInfo(GCNTTIImpl(this, F));
}

//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//

std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
  return getStandardCSEConfigForOpt(TM->getOptLevel());
}

namespace {

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
      : AMDGPUPassConfig(TM, PM) {
    // It is necessary to know the register usage of the entire call graph. We
    // allow calls without EnableAMDGPUFunctionCalls if they are marked
    // noinline, so this is always required.
    setRequiresCodeGenSCCOrder(true);
    substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
  }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  ScheduleDAGInstrs *
  createPostMachineScheduler(MachineSchedContext *C) const override {
    ScheduleDAGMI *DAG = new GCNPostScheduleDAGMILive(
        C, std::make_unique<PostGenericScheduler>(C),
        /*RemoveKillFlags=*/true);
    const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
    if (ST.shouldClusterStores())
      DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
    DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
    DAG->addMutation(
        createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
    if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
      DAG->addMutation(createVOPDPairingMutation());
    return DAG;
  }

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  bool addIRTranslator() override;
  void addPreLegalizeMachineIR() override;
  bool addLegalizeMachineIR() override;
  void addPreRegBankSelect() override;
  bool addRegBankSelect() override;
  void addPreGlobalInstructionSelect() override;
  bool addGlobalInstructionSelect() override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  FunctionPass *createSGPRAllocPass(bool Optimized);
  FunctionPass *createVGPRAllocPass(bool Optimized);
  FunctionPass *createRegAllocPass(bool Optimized) override;

  bool addRegAssignAndRewriteFast() override;
  bool addRegAssignAndRewriteOptimized() override;

  void addPreRegAlloc() override;
  bool addPreRewrite() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // end anonymous namespace

AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
  // Exceptions and StackMaps are not supported, so these passes will never do
  // anything.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  // Garbage collection is not supported.
  disablePass(&PatchableFunctionID);
  disablePass(&ShadowStackGCLoweringID);
}

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOptLevel::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
    addPass(createLoopDataPrefetchPass());
  addPass(createSeparateConstOffsetFromGEPPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  Triple::ArchType Arch = TM.getTargetTriple().getArch();
  if (RemoveIncompatibleFunctions && Arch == Triple::amdgcn)
    addPass(createAMDGPURemoveIncompatibleFunctionsPass(&TM));

  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  addPass(createAMDGPUPrintfRuntimeBinding());
  if (LowerCtorDtor)
    addPass(createAMDGPUCtorDtorLoweringLegacyPass());

  if (isPassEnabled(EnableImageIntrinsicOptimizer))
    addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  if (Arch == Triple::r600)
    addPass(createR600OpenCLImageTypeLoweringPass());

  // Replace OpenCL enqueued block function pointers with global variables.
  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());

  // Runs before PromoteAlloca so the latter can account for function uses
  if (EnableLowerModuleLDS) {
    addPass(createAMDGPULowerModuleLDSLegacyPass(&TM));
  }

  // AMDGPUAttributor infers lack of llvm.amdgcn.lds.kernel.id calls, so run
  // after their introduction
  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addPass(createAMDGPUAttributorLegacyPass());

  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addPass(createInferAddressSpacesPass());

  // Run atomic optimizer before Atomic Expand
  if ((TM.getTargetTriple().getArch() == Triple::amdgcn) &&
      (TM.getOptLevel() >= CodeGenOptLevel::Less) &&
      (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
    addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy));
  }

  addPass(createAtomicExpandLegacyPass());

  if (TM.getOptLevel() > CodeGenOptLevel::None) {
    addPass(createAMDGPUPromoteAlloca());

    if (isPassEnabled(EnableScalarIRPasses))
      addStraightLineScalarOptimizationPasses();

    if (EnableAMDGPUAliasAnalysis) {
      addPass(createAMDGPUAAWrapperPass());
      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
      }));
    }

    if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
      // TODO: May want to move later or split into an early and late one.
      addPass(createAMDGPUCodeGenPreparePass());
    }

    // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
    // have expanded.
    if (TM.getOptLevel() > CodeGenOptLevel::Less)
      addPass(createLICMPass());
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (isPassEnabled(EnableScalarIRPasses))
    addEarlyCSEOrGVNPass();
}

void AMDGPUPassConfig::addCodeGenPrepare() {
  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
    // FIXME: This pass adds 2 hacky attributes that can be replaced with an
    // analysis, and should be removed.
    addPass(createAMDGPUAnnotateKernelFeaturesPass());
  }

  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
      EnableLowerKernelArguments)
    addPass(createAMDGPULowerKernelArgumentsPass());

  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
    // This lowering has been placed after codegenprepare to take advantage of
    // address mode matching (which is why it isn't put with the LDS lowerings).
    // It could be placed anywhere before uniformity annotations (an analysis
    // that it changes by splitting up fat pointers into their components)
    // but has been put before switch lowering and CFG flattening so that those
    // passes can run on the more optimized control flow this pass creates in
    // many cases.
    //
    // FIXME: This should ideally be put after the LoadStoreVectorizer.
    // However, due to some annoying facts about ResourceUsageAnalysis,
    // (especially as exercised in the resource-usage-dead-function test),
    // we need all the function passes codegenprepare all the way through
    // said resource usage analysis to run on the call graph produced
    // before codegenprepare runs (because codegenprepare will knock some
    // nodes out of the graph, which leads to function-level passes not
    // being run on them, which causes crashes in the resource usage analysis).
    addPass(createAMDGPULowerBufferFatPointersPass());
    // In accordance with the above FIXME, manually force all the
    // function-level passes into a CGSCCPassManager.
    addPass(new DummyCGSCCPass());
  }

  TargetPassConfig::addCodeGenPrepare();

  if (isPassEnabled(EnableLoadStoreVectorizer))
    addPass(createLoadStoreVectorizerPass());

  // LowerSwitch may introduce unreachable blocks that can cause unexpected
  // behavior for subsequent passes. Placing it here seems better, as those
  // blocks get cleaned up by UnreachableBlockElim, inserted next in the
  // pass flow.
  addPass(createLowerSwitchPass());
}

bool AMDGPUPassConfig::addPreISel() {
  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

ScheduleDAGInstrs *
AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG = createGenericSchedLive(C);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

MachineFunctionInfo *R600TargetMachine::createMachineFunctionInfo(
    BumpPtrAllocator &Allocator, const Function &F,
    const TargetSubtargetInfo *STI) const {
  return R600MachineFunctionInfo::create<R600MachineFunctionInfo>(
      Allocator, F, static_cast<const R600Subtarget *>(STI));
}

//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
    MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);

  if (EnableMaxIlpSchedStrategy)
    return createGCNMaxILPMachineScheduler(C);

  return createGCNMaxOccupancyMachineScheduler(C);
}

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createAMDGPULateCodeGenPrepareLegacyPass());

  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createSinkingPass());

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  if (!LateCFGStructurize && !DisableStructurizer) {
    if (EnableStructurizerWorkarounds) {
      addPass(createFixIrreduciblePass());
      addPass(createUnifyLoopExitsPass());
    }
    addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
  }
  addPass(createAMDGPUUnifyDivergentExitNodesPass());
  if (!LateCFGStructurize && !DisableStructurizer) {
    addPass(createSIAnnotateControlFlowLegacyPass());
    // TODO: Move this right after structurizeCFG to avoid extra divergence
    // analysis. This depends on stopping SIAnnotateControlFlow from making
    // control flow modifications.
    addPass(createAMDGPURewriteUndefForPHILegacyPass());
  }
  addPass(createLCSSAPass());

  if (TM->getOptLevel() > CodeGenOptLevel::Less)
    addPass(&AMDGPUPerfHintAnalysisID);

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  if (EnableDPPCombine)
    addPass(&GCNDPPCombineID);
  addPass(&SILoadStoreOptimizerID);
  if (isPassEnabled(EnableSDWAPeephole)) {
    addPass(&SIPeepholeSDWAID);
    addPass(&EarlyMachineLICMID);
    addPass(&MachineCSEID);
    addPass(&SIFoldOperandsID);
  }
  addPass(&DeadMachineInstructionElimID);
  addPass(createSIShrinkInstructionsPass());
}

bool GCNPassConfig::addILPOpts() {
  if (EnableEarlyIfConversion)
    addPass(&EarlyIfConverterID);

  TargetPassConfig::addILPOpts();
  return false;
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(&SIFixSGPRCopiesID);
  addPass(createSILowerI1CopiesPass());
  return false;
}

bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator(getOptLevel()));
  return false;
}

void GCNPassConfig::addPreLegalizeMachineIR() {
  bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
  addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
  addPass(new Localizer());
}

bool GCNPassConfig::addLegalizeMachineIR() {
  addPass(new Legalizer());
  return false;
}

void GCNPassConfig::addPreRegBankSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
  addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
  addPass(createAMDGPUGlobalISelDivergenceLoweringPass());
}

bool GCNPassConfig::addRegBankSelect() {
  addPass(new AMDGPURegBankSelect());
  return false;
}

void GCNPassConfig::addPreGlobalInstructionSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
  addPass(createAMDGPURegBankCombiner(IsOptNone));
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  addPass(new InstructionSelect(getOptLevel()));
  return false;
}

void GCNPassConfig::addPreRegAlloc() {
  if (LateCFGStructurize) {
    addPass(createAMDGPUMachineCFGStructurizerPass());
  }
}

void GCNPassConfig::addFastRegAlloc() {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID);

  insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);

  TargetPassConfig::addFastRegAlloc();
}

void GCNPassConfig::addOptimizedRegAlloc() {
  // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
  // instructions that cause scheduling barriers.
  insertPass(&MachineSchedulerID, &SIWholeQuadModeID);

  if (OptExecMaskPreRA)
    insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);

  if (EnableRewritePartialRegUses)
    insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);

  if (isPassEnabled(EnablePreRAOptimizations))
    insertPass(&MachineSchedulerID, &GCNPreRAOptimizationsID);

  // This is not an essential optimization and it has a noticeable impact on
  // compilation time, so we only enable it from O2.
  if (TM->getOptLevel() > CodeGenOptLevel::Less)
    insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);

  // FIXME: when an instruction has a Killed operand, and the instruction is
  // inside a bundle, seems only the BUNDLE instruction appears as the Kills of
  // the register in LiveVariables, this would trigger a failure in verifier,
  // we should fix it and enable the verifier.
  if (OptVGPRLiveRange)
    insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID);
  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID);

  if (EnableDCEInRA)
    insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);

  TargetPassConfig::addOptimizedRegAlloc();
}

bool GCNPassConfig::addPreRewrite() {
  addPass(&SILowerWWMCopiesID);
  if (EnableRegReassign)
    addPass(&GCNNSAReassignID);
  return true;
}

FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
                  initializeDefaultSGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyRegisterAllocator(onlyAllocateSGPRs);

  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
                  initializeDefaultVGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyVGPRRegisterAllocator();

  return createFastVGPRRegisterAllocator();
}

FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
  llvm_unreachable("should not be used");
}

static const char RegAllocOptNotSupportedMessage[] =
    "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";

bool GCNPassConfig::addRegAssignAndRewriteFast() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(&GCNPreRALongBranchRegID);

  addPass(createSGPRAllocPass(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);
  addPass(&SIPreAllocateWWMRegsID);

  addPass(createVGPRAllocPass(false));

  addPass(&SILowerWWMCopiesID);
  return true;
}

bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(&GCNPreRALongBranchRegID);

  addPass(createSGPRAllocPass(true));

  // Commit allocated register changes. This is mostly necessary because too
  // many things rely on the use lists of the physical registers, such as the
  // verifier. This is only necessary with allocators which use LiveIntervals,
  // since FastRegAlloc does the replacements itself.
  addPass(createVirtRegRewriter(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);
  addPass(&SIPreAllocateWWMRegsID);

  addPass(createVGPRAllocPass(true));

  addPreRewrite();
  addPass(&VirtRegRewriterID);

  addPass(&AMDGPUMarkLastScratchLoadID);

  return true;
}

void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIOptimizeExecMaskingID);
  TargetPassConfig::addPostRegAlloc();
}

void GCNPassConfig::addPreSched2() {
  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createSIShrinkInstructionsPass());
  addPass(&SIPostRABundlerID);
}

void GCNPassConfig::addPreEmitPass() {
  if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
    addPass(&GCNCreateVOPDID);
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());

  addPass(createSIModeRegisterPass());

  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIInsertHardClausesID);

  addPass(&SILateBranchLoweringPassID);
  if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less))
    addPass(createAMDGPUSetWavePriorityPass());
  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIPreEmitPeepholeID);
  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee to be able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);

  if (isPassEnabled(EnableInsertSingleUseVDST, CodeGenOptLevel::Less))
    addPass(&AMDGPUInsertSingleUseVDSTID);

  if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less))
    addPass(&AMDGPUInsertDelayAluID);

  addPass(&BranchRelaxationPassID);
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}

void GCNTargetMachine::registerMachineRegisterInfoCallback(
    MachineFunction &MF) const {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MF.getRegInfo().addDelegate(MFI);
}

MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo(
    BumpPtrAllocator &Allocator, const Function &F,
    const TargetSubtargetInfo *STI) const {
  return SIMachineFunctionInfo::create<SIMachineFunctionInfo>(
      Allocator, F, static_cast<const GCNSubtarget *>(STI));
}

yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
  return new yaml::SIMachineFunctionInfo();
}

yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return new yaml::SIMachineFunctionInfo(
      *MFI, *MF.getSubtarget<GCNSubtarget>().getRegisterInfo(), MF);
}

bool GCNTargetMachine::parseMachineFunctionInfo(
    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
    SMDiagnostic &Error, SMRange &SourceRange) const {
  const yaml::SIMachineFunctionInfo &YamlMFI =
      static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
  MachineFunction &MF = PFS.MF;
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
    return true;

  if (MFI->Occupancy == 0) {
    // Fixup the subtarget dependent default value.
    MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
  }

  auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
    Register TempReg;
    if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
      SourceRange = RegName.SourceRange;
      return true;
    }
    RegVal = TempReg;

    return false;
  };

  auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
                                   Register &RegVal) {
    return !RegName.Value.empty() && parseRegister(RegName, RegVal);
  };

  if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
    return true;

  if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy))
    return true;

  if (parseOptionalRegister(YamlMFI.LongBranchReservedReg,
                            MFI->LongBranchReservedReg))
    return true;

  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
    const MemoryBuffer &Buffer =
        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
                         RegName.Value.size(), SourceMgr::DK_Error,
                         "incorrect register class for field", RegName.Value,
                         std::nullopt, std::nullopt);
    SourceRange = RegName.SourceRange;
    return true;
  };

  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
    return true;

  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
      !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
  }

  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
  }

  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
  }

  for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
    Register ParsedReg;
    if (parseRegister(YamlReg, ParsedReg))
      return true;

    MFI->reserveWWMRegister(ParsedReg);
  }

  auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A,
                                   const TargetRegisterClass &RC,
                                   ArgDescriptor &Arg, unsigned UserSGPRs,
                                   unsigned SystemSGPRs) {
    // Skip parsing if it's not present.
    if (!A)
      return false;

    if (A->IsRegister) {
      Register Reg;
      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
        SourceRange = A->RegisterName.SourceRange;
        return true;
      }
      if (!RC.contains(Reg))
        return diagnoseRegisterClass(A->RegisterName);
      Arg = ArgDescriptor::createRegister(Reg);
    } else
      Arg = ArgDescriptor::createStack(A->StackOffset);
    // Check and apply the optional mask.
    if (A->Mask)
      Arg = ArgDescriptor::createArg(Arg, *A->Mask);

    MFI->NumUserSGPRs += UserSGPRs;
    MFI->NumSystemSGPRs += SystemSGPRs;
    return false;
  };

  if (YamlMFI.ArgInfo &&
      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
                             AMDGPU::SGPR_128RegClass,
                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.QueuePtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.LDSKernelId, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
    return true;

  if (ST.hasIEEEMode())
    MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
  if (ST.hasDX10ClampMode())
    MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;

  // FIXME: Move proper support for denormal-fp-math into base MachineFunction
  MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals
                                      ? DenormalMode::IEEE
                                      : DenormalMode::PreserveSign;
  MFI->Mode.FP32Denormals.Output = YamlMFI.Mode.FP32OutputDenormals
                                       ? DenormalMode::IEEE
                                       : DenormalMode::PreserveSign;
  MFI->Mode.FP64FP16Denormals.Input = YamlMFI.Mode.FP64FP16InputDenormals
                                          ? DenormalMode::IEEE
                                          : DenormalMode::PreserveSign;
  MFI->Mode.FP64FP16Denormals.Output = YamlMFI.Mode.FP64FP16OutputDenormals
                                           ? DenormalMode::IEEE
                                           : DenormalMode::PreserveSign;

  return false;
}
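
// For reference, a hand-written sketch of the MIR YAML this parser consumes
// (register choices are arbitrary; keys follow yaml::SIMachineFunctionInfo):
//
//   machineFunctionInfo:
//     scratchRSrcReg:    '$sgpr0_sgpr1_sgpr2_sgpr3'
//     frameOffsetReg:    '$sgpr33'
//     stackPtrOffsetReg: '$sgpr32'
//     argumentInfo:
//       privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
//       workItemIDX:          { reg: '$vgpr0' }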
static cl::opt< bool > EnableEarlyIfConversion("aarch64-enable-early-ifcvt", cl::Hidden, cl::desc("Run early if-conversion"), cl::init(true))
This is the AMGPU address space based alias analysis pass.
static cl::opt< bool > EnableDCEInRA("amdgpu-dce-in-ra", cl::init(true), cl::Hidden, cl::desc("Enable machine DCE inside regalloc"))
static cl::opt< bool, true > EnableLowerModuleLDS("amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"), cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true), cl::Hidden)
static MachineSchedRegistry SISchedRegistry("si", "Run SI's custom scheduler", createSIMachineScheduler)
static ScheduleDAGInstrs * createIterativeILPMachineScheduler(MachineSchedContext *C)
static cl::opt< bool > EarlyInlineAll("amdgpu-early-inline-all", cl::desc("Inline all functions early"), cl::init(false), cl::Hidden)
static cl::opt< bool > EnableLowerKernelArguments("amdgpu-ir-lower-kernel-arguments", cl::desc("Lower kernel argument loads in IR pass"), cl::init(true), cl::Hidden)
static ScheduleDAGInstrs * createGCNMaxILPMachineScheduler(MachineSchedContext *C)
static cl::opt< bool > EnableSDWAPeephole("amdgpu-sdwa-peephole", cl::desc("Enable SDWA peepholer"), cl::init(true))
static MachineSchedRegistry GCNMinRegSchedRegistry("gcn-iterative-minreg", "Run GCN iterative scheduler for minimal register usage (experimental)", createMinRegScheduler)
static cl::opt< bool > EnableImageIntrinsicOptimizer("amdgpu-enable-image-intrinsic-optimizer", cl::desc("Enable image intrinsic optimizer pass"), cl::init(true), cl::Hidden)
static cl::opt< bool > EnableSIModeRegisterPass("amdgpu-mode-register", cl::desc("Enable mode register pass"), cl::init(true), cl::Hidden)
static cl::opt< bool > EnableDPPCombine("amdgpu-dpp-combine", cl::desc("Enable DPP combiner"), cl::init(true))
static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry("gcn-iterative-max-occupancy-experimental", "Run GCN scheduler to maximize occupancy (experimental)", createIterativeGCNMaxOccupancyMachineScheduler)
static cl::opt< bool > EnableSetWavePriority("amdgpu-set-wave-priority", cl::desc("Adjust wave priority"), cl::init(false), cl::Hidden)
static cl::opt< bool > LowerCtorDtor("amdgpu-lower-global-ctor-dtor", cl::desc("Lower GPU ctor / dtors to globals on the device."), cl::init(true), cl::Hidden)
static cl::opt< bool, true > DisableStructurizer("amdgpu-disable-structurizer", cl::desc("Disable structurizer for experiments; produces unusable code"), cl::location(AMDGPUTargetMachine::DisableStructurizer), cl::ReallyHidden)
static cl::opt< bool > OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden, cl::desc("Run pre-RA exec mask optimizations"), cl::init(true))
static cl::opt< bool > EnablePromoteKernelArguments("amdgpu-enable-promote-kernel-arguments", cl::desc("Enable promotion of flat kernel pointer arguments to global"), cl::Hidden, cl::init(true))
static cl::opt< bool > EnableRewritePartialRegUses("amdgpu-enable-rewrite-partial-reg-uses", cl::desc("Enable rewrite partial reg uses pass"), cl::init(true), cl::Hidden)
static cl::opt< bool > EnableLibCallSimplify("amdgpu-simplify-libcall", cl::desc("Enable amdgpu library simplifications"), cl::init(true), cl::Hidden)
static MachineSchedRegistry GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp", createGCNMaxILPMachineScheduler)
static cl::opt< bool > InternalizeSymbols("amdgpu-internalize-symbols", cl::desc("Enable elimination of non-kernel functions and unused globals"), cl::init(false), cl::Hidden)
static LLVM_READNONE StringRef getGPUOrDefault(const Triple &TT, StringRef GPU)
static Reloc::Model getEffectiveRelocModel(std::optional< Reloc::Model > RM)
static cl::opt< bool > EnableStructurizerWorkarounds("amdgpu-enable-structurizer-workarounds", cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true), cl::Hidden)
static cl::opt< bool > EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, cl::desc("Enable AMDGPU Alias Analysis"), cl::init(true))
static Expected< ScanOptions > parseAMDGPUAtomicOptimizerStrategy(StringRef Params)
static ScheduleDAGInstrs * createMinRegScheduler(MachineSchedContext *C)
static cl::opt< bool, true > LateCFGStructurize("amdgpu-late-structurize", cl::desc("Enable late CFG structurization"), cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG), cl::Hidden)
static cl::opt< bool > EnableHipStdPar("amdgpu-enable-hipstdpar", cl::desc("Enable HIP Standard Parallelism Offload support"), cl::init(false), cl::Hidden)
static cl::opt< bool > EnableInsertDelayAlu("amdgpu-enable-delay-alu", cl::desc("Enable s_delay_alu insertion"), cl::init(true), cl::Hidden)
static ScheduleDAGInstrs * createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C)
static cl::opt< bool > EnableLoadStoreVectorizer("amdgpu-load-store-vectorizer", cl::desc("Enable load store vectorizer"), cl::init(true), cl::Hidden)
static cl::opt< bool > EnableMaxIlpSchedStrategy("amdgpu-enable-max-ilp-scheduling-strategy", cl::desc("Enable scheduling strategy to maximize ILP for a single wave."), cl::Hidden, cl::init(false))
static bool mustPreserveGV(const GlobalValue &GV)
Predicate for Internalize pass.
static cl::opt< bool > EnableLoopPrefetch("amdgpu-loop-prefetch", cl::desc("Enable loop data prefetch on AMDGPU"), cl::Hidden, cl::init(false))
LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget()
static cl::opt< bool > EnableInsertSingleUseVDST("amdgpu-enable-single-use-vdst", cl::desc("Enable s_singleuse_vdst insertion"), cl::init(false), cl::Hidden)
static cl::opt< bool > RemoveIncompatibleFunctions("amdgpu-enable-remove-incompatible-functions", cl::Hidden, cl::desc("Enable removal of functions when they" "use features not supported by the target GPU"), cl::init(true))
static cl::opt< bool > EnableScalarIRPasses("amdgpu-scalar-ir-passes", cl::desc("Enable scalar IR passes"), cl::init(true), cl::Hidden)
static cl::opt< bool > EnableRegReassign("amdgpu-reassign-regs", cl::desc("Enable register reassign optimizations on gfx10+"), cl::init(true), cl::Hidden)
static cl::opt< bool > OptVGPRLiveRange("amdgpu-opt-vgpr-liverange", cl::desc("Enable VGPR liverange optimizations for if-else structure"), cl::init(true), cl::Hidden)
static ScheduleDAGInstrs * createSIMachineScheduler(MachineSchedContext *C)
static cl::opt< bool > EnablePreRAOptimizations("amdgpu-enable-pre-ra-optimizations", cl::desc("Enable Pre-RA optimizations pass"), cl::init(true), cl::Hidden)
static cl::opt< ScanOptions > AMDGPUAtomicOptimizerStrategy("amdgpu-atomic-optimizer-strategy", cl::desc("Select DPP or Iterative strategy for scan"), cl::init(ScanOptions::Iterative), cl::values(clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"), clEnumValN(ScanOptions::Iterative, "Iterative", "Use Iterative approach for scan"), clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")))
static cl::opt< bool > EnableVOPD("amdgpu-enable-vopd", cl::desc("Enable VOPD, dual issue of VALU in wave32"), cl::init(true), cl::Hidden)
static cl::opt< bool > EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden, cl::desc("Run early if-conversion"), cl::init(false))
static ScheduleDAGInstrs * createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C)
static MachineSchedRegistry GCNILPSchedRegistry("gcn-iterative-ilp", "Run GCN iterative scheduler for ILP scheduling (experimental)", createIterativeILPMachineScheduler)
static cl::opt< bool > ScalarizeGlobal("amdgpu-scalarize-global-loads", cl::desc("Enable global load scalarization"), cl::init(true), cl::Hidden)
static const char RegAllocOptNotSupportedMessage[]
static MachineSchedRegistry GCNMaxOccupancySchedRegistry("gcn-max-occupancy", "Run GCN scheduler to maximize occupancy", createGCNMaxOccupancyMachineScheduler)
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file declares the AMDGPU-specific subclass of TargetLoweringObjectFile.
This file a TargetTransformInfo::Concept conforming object specific to the AMDGPU target machine.
Provides passes to inlining "always_inline" functions.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
This header provides classes for managing passes over SCCs of the call graph.
Provides analysis for continuously CSEing during GISel passes.
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:693
#define LLVM_READNONE
Definition: Compiler.h:220
#define LLVM_EXTERNAL_VISIBILITY
Definition: Compiler.h:135
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
This file defines the class GCNIterativeScheduler, which uses an iterative approach to find a best sc...
This file provides the interface for LLVM's Global Value Numbering pass which eliminates fully redund...
AcceleratorCodeSelection - Identify all functions reachable from a kernel, removing those that are un...
This file declares the IRTranslator pass.
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
static std::string computeDataLayout()
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
const char LLVMTargetMachineRef TM
PassBuilder PB(Machine, PassOpts->PTO, std::nullopt, &PIC)
This header defines various interfaces for pass management in LLVM.
The AMDGPU TargetMachine interface definition for hw codegen targets.
Basic Register Allocator
This file describes the interface of the MachineFunctionPass responsible for assigning the generic vi...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
SI Machine Scheduler interface.
static FunctionPass * useDefaultRegisterAllocator()
-regalloc=... command line option.
Target-Independent Code Generator Pass Configuration Options pass.
static std::unique_ptr< TargetLoweringObjectFile > createTLOF()
A manager for alias analyses.
void registerFunctionAnalysis()
Register a specific AA result.
void addAAResult(AAResultT &AAResult)
Register a specific AA result.
Legacy wrapper pass to provide the AMDGPUAAResult object.
Analysis pass providing a never-invalidated alias analysis result.
AMDGPUTargetMachine & getAMDGPUTargetMachine() const
std::unique_ptr< CSEConfigBase > getCSEConfig() const override
Returns the CSEConfig object to use for the current optimization level.
ScheduleDAGInstrs * createMachineScheduler(MachineSchedContext *C) const override
Create an instance of ScheduleDAGInstrs to be run within the standard MachineScheduler pass for this ...
bool isPassEnabled(const cl::opt< bool > &Opt, CodeGenOptLevel Level=CodeGenOptLevel::Default) const
Check if a pass is enabled given Opt option.
bool addPreISel() override
Methods with trivial inline returns are convenient points in the common codegen pass pipeline where t...
bool addInstSelector() override
addInstSelector - This method should install an instruction selector pass, which converts from LLVM c...
bool addGCPasses() override
addGCPasses - Add late codegen passes that analyze code for garbage collection.
AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
void addIRPasses() override
Add common target configurable passes that perform LLVM IR to IR transforms following machine indepen...
void addCodeGenPrepare() override
Add pass to prepare the LLVM IR for code generation.
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
unsigned getAddressSpaceForPseudoSourceKind(unsigned Kind) const override
getAddressSpaceForPseudoSourceKind - Given the kind of memory (e.g.
const TargetSubtargetInfo * getSubtargetImpl() const
void registerDefaultAliasAnalyses(AAManager &) override
Allow the target to register alias analyses with the AAManager for use with the new pass manager.
std::pair< const Value *, unsigned > getPredicatedAddrSpace(const Value *V) const override
If the specified predicate checks whether a generic pointer falls within a specified address space,...
StringRef getFeatureString(const Function &F) const
AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM, CodeGenOptLevel OL)
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
void registerPassBuilderCallbacks(PassBuilder &PB, bool PopulateClassToPassNames) override
Allow the target to modify the pass pipeline.
StringRef getGPUName(const Function &F) const
unsigned getAssumedAddrSpace(const Value *V) const override
If the specified generic pointer could be assumed as a pointer to a specific address space,...
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:349
bool isValid() const
Return true if the attribute is any kind of attribute.
Definition: Attributes.h:193
Allocate memory in an ever growing pool, as if by bump-pointer.
Definition: Allocator.h:66
void removeDeadConstantUsers() const
If there are any dead constant users dangling off of this constant, remove them.
Definition: Constants.cpp:722
This pass is required by interprocedural register allocation.
Lightweight error class with error context and mandatory checking.
Definition: Error.h:160
Tagged union holding either a T or a Error.
Definition: Error.h:474
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:311
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:264
TargetTransformInfo getTargetTransformInfo(const Function &F) const override
Get a TargetTransformInfo implementation for the target.
void registerMachineRegisterInfoCallback(MachineFunction &MF) const override
bool parseMachineFunctionInfo(const yaml::MachineFunctionInfo &, PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange) const override
Parse out the target's MachineFunctionInfo from the YAML representation.
yaml::MachineFunctionInfo * convertFuncInfoToYAML(const MachineFunction &MF) const override
Allocate and initialize an instance of the YAML representation of the MachineFunctionInfo.
yaml::MachineFunctionInfo * createDefaultFuncInfoYAML() const override
Allocate and return a default initialized instance of the YAML representation for the MachineFunction...
TargetPassConfig * createPassConfig(PassManagerBase &PM) override
Create a pass configuration object to be used by addPassToEmitX methods for generating a pipeline of ...
GCNTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM, CodeGenOptLevel OL, bool JIT)
MachineFunctionInfo * createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F, const TargetSubtargetInfo *STI) const override
Create the target's instance of MachineFunctionInfo.
Pass to remove unused function declarations.
Definition: GlobalDCE.h:36
This pass is responsible for selecting generic machine instructions to target-specific instructions.
A pass that internalizes all functions and variables other than those that must be preserved accordin...
Definition: Internalize.h:34
This class describes a target machine that is implemented with the LLVM target-independent code gener...
This pass implements the localization mechanism described at the top of this file.
Definition: Localizer.h:43
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
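For example, AMDGPU passes conventionally pull their private per-function state out of a MachineFunction like this (a sketch; SIMachineFunctionInfo is the AMDGPU subclass of MachineFunctionInfo, and the function name is made up):
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFunction.h"

static void useFuncInfo(llvm::MachineFunction &MF) {
  // getInfo<T>() returns the target's MachineFunctionInfo subclass.
  llvm::SIMachineFunctionInfo *MFI = MF.getInfo<llvm::SIMachineFunctionInfo>();
  (void)MFI; // query or update target-private state through MFI here
}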
void addDelegate(Delegate *delegate)
MachineSchedRegistry provides a selection of available machine instruction schedulers.
This interface provides simple read-only access to a block of memory, and provides simple methods for...
Definition: MemoryBuffer.h:51
virtual StringRef getBufferIdentifier() const
Return an identifier for this buffer, typically the filename it was read from.
Definition: MemoryBuffer.h:76
static const OptimizationLevel O0
Disable as many optimizations as possible.
unsigned getSpeedupLevel() const
static const OptimizationLevel O1
Optimize quickly without destroying debuggability.
This class provides access to building LLVM's passes.
Definition: PassBuilder.h:104
void registerPipelineEarlySimplificationEPCallback(const std::function< void(ModulePassManager &, OptimizationLevel)> &C)
Register a callback for a default optimizer pipeline extension point.
Definition: PassBuilder.h:475
void registerPipelineStartEPCallback(const std::function< void(ModulePassManager &, OptimizationLevel)> &C)
Register a callback for a default optimizer pipeline extension point.
Definition: PassBuilder.h:466
void registerPeepholeEPCallback(const std::function< void(FunctionPassManager &, OptimizationLevel)> &C)
Register a callback for a default optimizer pipeline extension point.
Definition: PassBuilder.h:400
void registerCGSCCOptimizerLateEPCallback(const std::function< void(CGSCCPassManager &, OptimizationLevel)> &C)
Register a callback for a default optimizer pipeline extension point.
Definition: PassBuilder.h:445
void registerFullLinkTimeOptimizationLastEPCallback(const std::function< void(ModulePassManager &, OptimizationLevel)> &C)
Register a callback for a default optimizer pipeline extension point.
Definition: PassBuilder.h:511
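These extension points all follow the same shape: the target hands the PassBuilder a lambda that appends passes to the pass manager for that point. A minimal sketch, using GlobalDCEPass purely as a stand-in for whatever pass a target wants scheduled there:
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/IPO/GlobalDCE.h"
using namespace llvm;

void registerExampleCallbacks(PassBuilder &PB) {
  PB.registerPipelineStartEPCallback(
      [](ModulePassManager &PM, OptimizationLevel Level) {
        if (Level != OptimizationLevel::O0) // skip extra work at -O0
          PM.addPass(GlobalDCEPass());
      });
}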
LLVM_ATTRIBUTE_MINSIZE void addPass(PassT &&Pass)
Definition: PassManager.h:249
PassRegistry - This class manages the registration and initialization of the pass subsystem as appli...
Definition: PassRegistry.h:37
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
MachineFunctionInfo * createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F, const TargetSubtargetInfo *STI) const override
Create the target's instance of MachineFunctionInfo.
RegisterPassParser class - Handle the addition of new machine passes.
RegisterRegAllocBase class - Track the registration of register allocators.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool initializeBaseYamlFields(const yaml::SIMachineFunctionInfo &YamlMFI, const MachineFunction &MF, PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange)
Instances of this class encapsulate one diagnostic report, allowing printing to a raw_ostream as a ca...
Definition: SourceMgr.h:281
Represents a location in source code.
Definition: SMLoc.h:23
Represents a range in source code.
Definition: SMLoc.h:48
A ScheduleDAG for scheduling lists of MachineInstr.
ScheduleDAGMILive is an implementation of ScheduleDAGInstrs that schedules machine instructions while...
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
void addMutation(std::unique_ptr< ScheduleDAGMutation > Mutation)
Add a postprocessing step to the DAG builder.
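A typical caller is a target's createMachineScheduler override, which builds the standard scheduler and then layers target mutations on top. A sketch using the generic clustering mutations (the function name is illustrative; TII and TRI are public members of ScheduleDAG):
#include "llvm/CodeGen/MachineScheduler.h"
using namespace llvm;

static ScheduleDAGInstrs *createExampleScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG = createGenericSchedLive(C);
  // Cluster adjacent loads/stores as DAG post-processing steps.
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}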
const TargetInstrInfo * TII
Target instruction information.
Definition: ScheduleDAG.h:557
const TargetRegisterInfo * TRI
Target processor register info.
Definition: ScheduleDAG.h:558
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition: SmallString.h:68
unsigned getMainFileID() const
Definition: SourceMgr.h:132
const MemoryBuffer * getMemoryBuffer(unsigned i) const
Definition: SourceMgr.h:125
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:134
bool consume_front(StringRef Prefix)
Returns true if this StringRef has the given prefix and removes that prefix.
Definition: StringRef.h:631
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
StringSwitch & Cases(StringLiteral S0, StringLiteral S1, T Value)
Definition: StringSwitch.h:90
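Taken together, these form a compact string-to-value map. A sketch with made-up keys and values:
#include "llvm/ADT/StringSwitch.h"

static unsigned addrSpaceFromName(llvm::StringRef Name) {
  return llvm::StringSwitch<unsigned>(Name)
      .Case("global", 1)        // one literal
      .Cases("local", "lds", 3) // several literals mapping to one value
      .Default(0);              // fallback when nothing matched
}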
Triple TargetTriple
Triple string, CPU name, and target feature strings the TargetMachine instance is created with.
Definition: TargetMachine.h:95
const Triple & getTargetTriple() const
const MCSubtargetInfo * getMCSubtargetInfo() const
StringRef getTargetFeatureString() const
StringRef getTargetCPU() const
std::unique_ptr< const MCSubtargetInfo > STI
void resetTargetOptions(const Function &F) const
Reset the target options based on the function's attributes.
std::unique_ptr< const MCRegisterInfo > MRI
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
Target-Independent Code Generator Pass Configuration Options.
LLVMTargetMachine * TM
virtual void addCodeGenPrepare()
Add pass to prepare the LLVM IR for code generation.
virtual bool addILPOpts()
Add passes that optimize instruction level parallelism for out-of-order targets.
virtual void addPostRegAlloc()
This method may be implemented by targets that want to run passes after register allocation pass pipe...
CodeGenOptLevel getOptLevel() const
virtual void addOptimizedRegAlloc()
addOptimizedRegAlloc - Add passes related to register allocation.
virtual void addIRPasses()
Add common target configurable passes that perform LLVM IR to IR transforms following machine indepen...
virtual void addFastRegAlloc()
addFastRegAlloc - Add the minimum set of target-independent passes that are required for fast registe...
virtual void addMachineSSAOptimization()
addMachineSSAOptimization - Add standard passes that optimize machine instructions in SSA form.
void disablePass(AnalysisID PassID)
Allow the target to disable a specific standard pass by default.
AnalysisID addPass(AnalysisID PassID)
Utilities for targets to add passes to the pass manager.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
TargetSubtargetInfo - Generic base class for all target subtargets.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition: Triple.h:361
LLVM Value Representation.
Definition: Value.h:74
bool use_empty() const
Definition: Value.h:344
PassManagerBase - An abstract interface to allow code to add passes to a pass manager without having ...
Interfaces for registering analysis passes, producing common pass manager configurations,...
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
bool isFlatGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:415
bool isEntryFunctionCC(CallingConv::ID CC)
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
Definition: PatternMatch.h:839
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
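These matchers compose. For instance, recognizing `A & ~B` with the operands in either order is a one-liner; a sketch (substituting m_Deferred(A) for m_Value(B) would additionally require B to be the same value as A):
#include "llvm/IR/PatternMatch.h"

static bool isAndNot(llvm::Value *V, llvm::Value *&A, llvm::Value *&B) {
  using namespace llvm::PatternMatch;
  // m_c_And tries both operand orders; m_Not matches `xor B, -1`.
  return match(V, m_c_And(m_Value(A), m_Not(m_Value(B))));
}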
@ ReallyHidden
Definition: CommandLine.h:139
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:718
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
LocationClass< Ty > location(Ty &L)
Definition: CommandLine.h:470
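These helpers are the building blocks of the backend flags declared throughout this file. A sketch of a hidden option with external storage (the flag name is made up for illustration):
#include "llvm/Support/CommandLine.h"

static bool ExampleStorage;
static llvm::cl::opt<bool, true> ExampleOpt(
    "amdgpu-example-flag", llvm::cl::ReallyHidden,
    llvm::cl::desc("Illustrative flag writing through to a global bool"),
    llvm::cl::location(ExampleStorage), llvm::cl::init(false));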
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
FunctionPass * createFlattenCFGPass()
void initializeSIFormMemoryClausesPass(PassRegistry &)
char & SIPreAllocateWWMRegsID
FunctionPass * createFastRegisterAllocator()
FastRegisterAllocation Pass - This pass register allocates as fast as possible.
char & EarlyMachineLICMID
This pass performs loop invariant code motion on machine instructions.
ImmutablePass * createAMDGPUAAWrapperPass()
char & PostRAHazardRecognizerID
PostRAHazardRecognizer - This pass runs the post-ra hazard recognizer.
FunctionPass * createAMDGPUSetWavePriorityPass()
void initializeAMDGPUInsertSingleUseVDSTPass(PassRegistry &)
Pass * createLCSSAPass()
Definition: LCSSA.cpp:506
void initializeGCNCreateVOPDPass(PassRegistry &)
ModulePass * createAMDGPUOpenCLEnqueuedBlockLoweringPass()
char & GCNPreRAOptimizationsID
char & GCLoweringID
GCLowering Pass - Used by gc.root to perform its default lowering operations.
void initializeGCNPreRAOptimizationsPass(PassRegistry &)
Pass * createLoadStoreVectorizerPass()
Create a legacy pass manager instance of the LoadStoreVectorizer pass.
void initializeGCNRewritePartialRegUsesPass(llvm::PassRegistry &)
void initializeAMDGPUAttributorLegacyPass(PassRegistry &)
void initializeAMDGPUDAGToDAGISelPass(PassRegistry &)
char & SIPostRABundlerID
FunctionPass * createSIModeRegisterPass()
FunctionPass * createGreedyRegisterAllocator()
Greedy register allocation pass - This pass implements a global register allocator for optimized buil...
void initializeAMDGPUAAWrapperPassPass(PassRegistry &)
ModulePass * createAMDGPULowerBufferFatPointersPass()
void initializeR600ClauseMergePassPass(PassRegistry &)
void initializeSIModeRegisterPass(PassRegistry &)
ModulePass * createAMDGPUCtorDtorLoweringLegacyPass()
void initializeSIOptimizeVGPRLiveRangePass(PassRegistry &)
ModuleToFunctionPassAdaptor createModuleToFunctionPassAdaptor(FunctionPassT &&Pass, bool EagerlyInvalidate=false)
A function to deduce a function pass type and wrap it in the templated adaptor.
Definition: PassManager.h:916
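The adaptor is what lets a per-function pass sit in a module-level pipeline. A sketch, using EarlyCSEPass as an arbitrary function pass:
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Scalar/EarlyCSE.h"
using namespace llvm;

void addExamplePasses(ModulePassManager &MPM) {
  // Wrap the function pass so the module PM can run it over each function.
  MPM.addPass(createModuleToFunctionPassAdaptor(EarlyCSEPass()));
}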
void initializeAMDGPULateCodeGenPreparePass(PassRegistry &)
void initializeAMDGPURewriteUndefForPHILegacyPass(PassRegistry &)
FunctionPass * createAMDGPUPreLegalizeCombiner(bool IsOptNone)
char & GCNRewritePartialRegUsesID
FunctionPass * createAMDGPUPostLegalizeCombiner(bool IsOptNone)
void initializeAMDGPUAnnotateUniformValuesPass(PassRegistry &)
std::error_code inconvertibleErrorCode()
The value returned by this function can be returned from convertToErrorCode for Error values where no...
Definition: Error.cpp:90
void initializeSIShrinkInstructionsPass(PassRegistry &)
char & SIFoldOperandsID
void initializeGCNPreRALongBranchRegPass(PassRegistry &)
char & SILoadStoreOptimizerID
std::unique_ptr< ScheduleDAGMutation > createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase)
Phase specifies whether or not this is a reentry into the IGroupLPDAGMutation.
FunctionPass * createNaryReassociatePass()
char & PatchableFunctionID
This pass implements the "patchable-function" attribute.
char & PostRASchedulerID
PostRAScheduler - This pass performs post register allocation scheduling.
void initializeR600ExpandSpecialInstrsPassPass(PassRegistry &)
void initializeR600PacketizerPass(PassRegistry &)
std::unique_ptr< ScheduleDAGMutation > createVOPDPairingMutation()
ModulePass * createAMDGPUAlwaysInlinePass(bool GlobalOpt=true)
void initializeSIPreEmitPeepholePass(PassRegistry &)
char & SILowerWWMCopiesID
void initializeSIFixVGPRCopiesPass(PassRegistry &)
void initializeAMDGPUGlobalISelDivergenceLoweringPass(PassRegistry &)
std::unique_ptr< CSEConfigBase > getStandardCSEConfigForOpt(CodeGenOptLevel Level)
Definition: CSEInfo.cpp:79
Target & getTheR600Target()
The target for R600 GPUs.
char & MachineSchedulerID
MachineScheduler - This pass schedules machine instructions.
Pass * createStructurizeCFGPass(bool SkipUniformRegions=false)
When SkipUniformRegions is true, the structurizer will not structurize regions that only contain uniform...
void initializeAMDGPURemoveIncompatibleFunctionsPass(PassRegistry &)
void initializeSILowerWWMCopiesPass(PassRegistry &)
void initializeGCNNSAReassignPass(PassRegistry &)
char & PostMachineSchedulerID
PostMachineScheduler - This pass schedules machine instructions postRA.
void initializeSIInsertWaitcntsPass(PassRegistry &)
char & AMDGPUInsertSingleUseVDSTID
Pass * createLICMPass()
Definition: LICM.cpp:379
ScheduleDAGMILive * createGenericSchedLive(MachineSchedContext *C)
Create the standard converging machine scheduler.
char & SIFormMemoryClausesID
void initializeAMDGPULowerModuleLDSLegacyPass(PassRegistry &)
void initializeAMDGPUCtorDtorLoweringLegacyPass(PassRegistry &)
void initializeAMDGPURegBankCombinerPass(PassRegistry &)
void initializeSILoadStoreOptimizerPass(PassRegistry &)
void initializeSILateBranchLoweringPass(PassRegistry &)
void initializeSIPeepholeSDWAPass(PassRegistry &)
char & AMDGPUUnifyDivergentExitNodesID
FunctionPass * createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy)
char & ShadowStackGCLoweringID
ShadowStackGCLowering - Implements the custom lowering mechanism used by the shadow stack GC.
char & GCNNSAReassignID
void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &)
void initializeAMDGPUExternalAAWrapperPass(PassRegistry &)
void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &)
char & AMDGPUPerfHintAnalysisID
char & SILowerSGPRSpillsID
CodeModel::Model getEffectiveCodeModel(std::optional< CodeModel::Model > CM, CodeModel::Model Default)
Helper method for getting the code model, returning Default if CM does not have a value.
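A sketch of the usual call site in a TargetMachine constructor (choosing Small as the default here is an assumption for illustration):
#include "llvm/Target/TargetMachine.h"
#include <optional>

llvm::CodeModel::Model
resolveCodeModel(std::optional<llvm::CodeModel::Model> CM) {
  // Returns *CM when the frontend specified one, otherwise the default.
  return llvm::getEffectiveCodeModel(CM, llvm::CodeModel::Small);
}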
char & SILateBranchLoweringPassID
char & BranchRelaxationPassID
BranchRelaxation - This pass replaces branches that need to jump further than is supported by a branc...
FunctionPass * createSinkingPass()
Definition: Sink.cpp:277
CGSCCToFunctionPassAdaptor createCGSCCToFunctionPassAdaptor(FunctionPassT &&Pass, bool EagerlyInvalidate=false, bool NoRerun=false)
A function to deduce a function pass type and wrap it in the templated adaptor.
FunctionPass * createSIShrinkInstructionsPass()
void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &)
void initializeSIPostRABundlerPass(PassRegistry &)
void initializeAMDGPUPromoteAllocaToVectorPass(PassRegistry &)
Pass * createAMDGPUAttributorLegacyPass()
void initializeSIWholeQuadModePass(PassRegistry &)
std::unique_ptr< ScheduleDAGMutation > createStoreClusterDAGMutation(const TargetInstrInfo *TII, const TargetRegisterInfo *TRI, bool ReorderWhileClustering=false)
If ReorderWhileClustering is set to true, no attempt will be made to reduce reordering due to store c...
FunctionPass * createLoopDataPrefetchPass()
FunctionPass * createAMDGPULowerKernelArgumentsPass()
char & AMDGPUInsertDelayAluID
Pass * createAMDGPUAnnotateKernelFeaturesPass()
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
std::unique_ptr< ScheduleDAGMutation > createAMDGPUMacroFusionDAGMutation()
Note that you have to add: DAG.addMutation(createAMDGPUMacroFusionDAGMutation()); to AMDGPUPassConfig...
char & StackMapLivenessID
StackMapLiveness - This pass analyses the register live-out set of stackmap/patchpoint intrinsics and...
char & SIOptimizeVGPRLiveRangeID
FunctionPass * createUnifyLoopExitsPass()
char & SIOptimizeExecMaskingPreRAID
FunctionPass * createFixIrreduciblePass()
char & FuncletLayoutID
This pass lays out funclets contiguously.
void initializeSIInsertHardClausesPass(PassRegistry &)
char & DetectDeadLanesID
This pass adds dead/undef flags after analyzing subregister lanes.
void initializeAMDGPUPostLegalizerCombinerPass(PassRegistry &)
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
void initializeSIAnnotateControlFlowPass(PassRegistry &)
ModulePass * createAMDGPUPrintfRuntimeBinding()
void initializeSIMemoryLegalizerPass(PassRegistry &)
Pass * createAlwaysInlinerLegacyPass(bool InsertLifetime=true)
Create a legacy pass manager instance of a pass to inline and remove functions marked as "always_inli...
void initializeR600ControlFlowFinalizerPass(PassRegistry &)
void initializeAMDGPUImageIntrinsicOptimizerPass(PassRegistry &)
FunctionPass * createAMDGPUAnnotateUniformValues()
ModulePass * createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM=nullptr)
void initializeAMDGPUPreLegalizerCombinerPass(PassRegistry &)
FunctionPass * createAMDGPUPromoteAlloca()
FunctionPass * createSeparateConstOffsetFromGEPPass(bool LowerGEP=false)
char & EarlyIfConverterID
EarlyIfConverter - This pass performs if-conversion on SSA form by inserting cmov instructions.
char & SIPreEmitPeepholeID
ModulePass * createAMDGPURemoveIncompatibleFunctionsPass(const TargetMachine *)
FunctionPass * createSILowerI1CopiesPass()
void initializeGCNRegPressurePrinterPass(PassRegistry &)
void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &)
FunctionPass * createBasicRegisterAllocator()
BasicRegisterAllocation Pass - This pass implements a degenerate global register allocator using the ...
void initializeGlobalISel(PassRegistry &)
Initialize all passes linked into the GlobalISel library.
Definition: GlobalISel.cpp:17
void initializeSIPreAllocateWWMRegsPass(PassRegistry &)
ModulePass * createR600OpenCLImageTypeLoweringPass()
FunctionPass * createAMDGPUCodeGenPreparePass()
FunctionPass * createAMDGPUISelDag(TargetMachine &TM, CodeGenOptLevel OptLevel)
This pass converts a legalized DAG into an AMDGPU-specific DAG, ready for instruction scheduling.
Target & getTheGCNTarget()
The target for GCN GPUs.
void initializeAMDGPUAtomicOptimizerPass(PassRegistry &)
char & MachineCSEID
MachineCSE - This pass performs global CSE on machine instructions.
Definition: MachineCSE.cpp:165
char & GCNDPPCombineID
FunctionPass * createAMDGPURegBankCombiner(bool IsOptNone)
char & SIWholeQuadModeID
std::unique_ptr< ScheduleDAGMutation > createLoadClusterDAGMutation(const TargetInstrInfo *TII, const TargetRegisterInfo *TRI, bool ReorderWhileClustering=false)
If ReorderWhileClustering is set to true, no attempt will be made to reduce reordering due to store c...
void initializeSIOptimizeExecMaskingPreRAPass(PassRegistry &)
void initializeAMDGPUMarkLastScratchLoadPass(PassRegistry &)
char & LiveVariablesID
LiveVariables pass - This pass computes the set of blocks in which each variable is live and sets mac...
void initializeAMDGPUCodeGenPreparePass(PassRegistry &)
FunctionPass * createGVNPass(bool NoMemDepAnalysis=false)
Create a legacy GVN pass.
Definition: GVN.cpp:3339
FunctionPass * createAMDGPURewriteUndefForPHILegacyPass()
void call_once(once_flag &flag, Function &&F, Args &&... ArgList)
Execute the function specified as a parameter once.
Definition: Threading.h:87
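A sketch of the usual guard pattern for thread-safe one-time initialization (names are illustrative):
#include "llvm/Support/Threading.h"

static llvm::once_flag ExampleInitFlag;

void ensureExampleInitialized() {
  // The lambda body runs exactly once, even under concurrent callers.
  llvm::call_once(ExampleInitFlag, [] {
    // one-time setup goes here
  });
}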
void initializeSILowerSGPRSpillsPass(PassRegistry &)
void initializeAMDGPULowerKernelAttributesPass(PassRegistry &)
char & SIInsertHardClausesID
FunctionPass * createAMDGPUMachineCFGStructurizerPass()
void initializeAMDGPUResourceUsageAnalysisPass(PassRegistry &)
void initializeSIFixSGPRCopiesPass(PassRegistry &)
char & GCNCreateVOPDID
FunctionPass * createInferAddressSpacesPass(unsigned AddressSpace=~0u)
char & VirtRegRewriterID
VirtRegRewriter pass.
Definition: VirtRegMap.cpp:227
void initializeSILowerI1CopiesPass(PassRegistry &)
char & SILowerControlFlowID
FunctionPass * createLowerSwitchPass()
FunctionPass * createVirtRegRewriter(bool ClearVirtRegs=true)
Definition: VirtRegMap.cpp:645
void initializeR600VectorRegMergerPass(PassRegistry &)
ImmutablePass * createExternalAAWrapperPass(std::function< void(Pass &, Function &, AAResults &)> Callback)
A wrapper pass around a callback which can be used to populate the AAResults in the AAResultsWrapperP...
void initializeSIOptimizeExecMaskingPass(PassRegistry &)
FunctionPass * createAMDGPUGlobalISelDivergenceLoweringPass()
FunctionPass * createSIMemoryLegalizerPass()
void initializeSIFoldOperandsPass(PassRegistry &)
void initializeSILowerControlFlowPass(PassRegistry &)
char & SIPeepholeSDWAID
char & SIFixVGPRCopiesID
char & TwoAddressInstructionPassID
TwoAddressInstruction - This pass reduces two-address instructions to use two operands.
void initializeAMDGPURegBankSelectPass(PassRegistry &)
FunctionPass * createAtomicExpandLegacyPass()
AtomicExpandPass - At IR level this pass replaces atomic instructions with __atomic_* library calls,...
MCRegisterInfo * createGCNMCRegisterInfo(AMDGPUDwarfFlavour DwarfFlavour)
FunctionPass * createStraightLineStrengthReducePass()
FunctionPass * createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *)
void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry &)
void initializeAMDGPULowerBufferFatPointersPass(PassRegistry &)
FunctionPass * createSIInsertWaitcntsPass()
FunctionPass * createEarlyCSEPass(bool UseMemorySSA=false)
Definition: EarlyCSE.cpp:1932
void initializeGCNDPPCombinePass(PassRegistry &)
char & PHIEliminationID
PHIElimination - This pass eliminates machine instruction PHI nodes by inserting copy instructions.
bool parseNamedRegisterReference(PerFunctionMIParsingState &PFS, Register &Reg, StringRef Src, SMDiagnostic &Error)
Definition: MIParser.cpp:3631
FunctionPass * createAMDGPULateCodeGenPreparePass()
char & AMDGPUMarkLastScratchLoadID
char & RenameIndependentSubregsID
This pass detects subregister lanes in a virtual register that are used independently of other lanes ...
std::unique_ptr< ScheduleDAGMutation > createAMDGPUExportClusteringDAGMutation()
void initializeAMDGPUPrintfRuntimeBindingPass(PassRegistry &)
void initializeAMDGPUPromoteAllocaPass(PassRegistry &)
void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &)
void initializeAMDGPUInsertDelayAluPass(PassRegistry &)
char & SIOptimizeExecMaskingID
void initializeAMDGPUUnifyMetadataPass(PassRegistry &)
char & SIFixSGPRCopiesID
FunctionPass * createSIAnnotateControlFlowPass()
Create the annotation pass.
void initializeAMDGPUAlwaysInlinePass(PassRegistry &)
char & DeadMachineInstructionElimID
DeadMachineInstructionElim - This pass removes dead machine instructions.
char & GCNPreRALongBranchRegID
void initializeAMDGPUPromoteKernelArgumentsPass(PassRegistry &)
#define N
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ IEEE
IEEE-754 denormal numbers preserved.
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environme...
MachineFunctionInfo - This class can be derived from and used by targets to hold private target-speci...
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instanti...
RegisterTargetMachine - Helper template for registering a target machine implementation,...
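A sketch of the registration hook a backend provides; the function name here is illustrative (real targets follow the LLVMInitialize<Target>Target convention):
#include "AMDGPUTargetMachine.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "llvm/MC/TargetRegistry.h"

extern "C" void LLVMInitializeExampleTarget() {
  // Associate the GCN TargetMachine with its Target entry.
  llvm::RegisterTargetMachine<llvm::GCNTargetMachine> X(llvm::getTheGCNTarget());
}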
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
The llvm::once_flag structure.
Definition: Threading.h:68
Targets should override this in a way that mirrors the implementation of llvm::MachineFunctionInfo.
SmallVector< StringValue > WWMReservedRegs
std::optional< SIArgumentInfo > ArgInfo
A wrapper around std::string which contains a source range that's being set during parsing.