LLVM 17.0.0git
AMDGPUTargetMachine.cpp
//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for SI+ GPUs.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUExportClustering.h"
#include "AMDGPUIGroupLP.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPURegBankSelect.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "GCNVOPDUtils.h"
#include "R600.h"
#include "R600MachineFunctionInfo.h"
#include "R600TargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/Internalize.h"
#include <optional>

using namespace llvm;
using namespace llvm::PatternMatch;

namespace {
class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
public:
  SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};

class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
public:
  VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};

static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
                              const TargetRegisterClass &RC) {
  return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
}

static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
                              const TargetRegisterClass &RC) {
  return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
}

/// -{sgpr|vgpr}-regalloc=... command line option.
static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }

/// A dummy default pass factory indicates whether the register allocator is
/// overridden on the command line.
static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;

static SGPRRegisterRegAlloc
defaultSGPRRegAlloc("default",
                    "pick SGPR register allocator based on -O option",
                    useDefaultRegisterAllocator);

static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<SGPRRegisterRegAlloc>>
SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
             cl::desc("Register allocator to use for SGPRs"));

static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<VGPRRegisterRegAlloc>>
VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
             cl::desc("Register allocator to use for VGPRs"));

static void initializeDefaultSGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = SGPRRegAlloc;
    SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
  }
}

static void initializeDefaultVGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = VGPRRegAlloc;
    VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
  }
}

static FunctionPass *createBasicSGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createGreedySGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createFastSGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

static FunctionPass *createBasicVGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createGreedyVGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createFastVGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateVGPRs, true);
}

static SGPRRegisterRegAlloc basicRegAllocSGPR(
  "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
static SGPRRegisterRegAlloc greedyRegAllocSGPR(
  "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);

static SGPRRegisterRegAlloc fastRegAllocSGPR(
  "fast", "fast register allocator", createFastSGPRRegisterAllocator);

static VGPRRegisterRegAlloc basicRegAllocVGPR(
  "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
static VGPRRegisterRegAlloc greedyRegAllocVGPR(
  "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);

static VGPRRegisterRegAlloc fastRegAllocVGPR(
  "fast", "fast register allocator", createFastVGPRRegisterAllocator);
} // anonymous namespace
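
// Usage note (illustrative, not part of the original source): the
// registrations above make the split allocators selectable from the llc
// command line, e.g.
//   llc -mtriple=amdgcn-amd-amdhsa -sgpr-regalloc=greedy -vgpr-regalloc=fast foo.ll
// "default" defers to the -O level via the call_once initializers above,
// while the target-independent -regalloc option is rejected for amdgcn (see
// RegAllocOptNotSupportedMessage further down).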

static cl::opt<bool> EnableSROA(
  "amdgpu-sroa",
  cl::desc("Run SROA after promote alloca pass"),
  cl::ReallyHidden,
  cl::init(true));

static cl::opt<bool>
EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
                        cl::desc("Run early if-conversion"),
                        cl::init(false));

static cl::opt<bool>
OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
                 cl::desc("Run pre-RA exec mask optimizations"),
                 cl::init(true));

// Option to disable vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
  "amdgpu-load-store-vectorizer",
  cl::desc("Enable load store vectorizer"),
  cl::init(true),
  cl::Hidden);

// Option to control global loads scalarization
static cl::opt<bool> ScalarizeGlobal(
  "amdgpu-scalarize-global-loads",
  cl::desc("Enable global load scalarization"),
  cl::init(true),
  cl::Hidden);

// Option to run internalize pass.
static cl::opt<bool> InternalizeSymbols(
  "amdgpu-internalize-symbols",
  cl::desc("Enable elimination of non-kernel functions and unused globals"),
  cl::init(false),
  cl::Hidden);

// Option to inline all early.
static cl::opt<bool> EarlyInlineAll(
  "amdgpu-early-inline-all",
  cl::desc("Inline all functions early"),
  cl::init(false),
  cl::Hidden);

static cl::opt<bool> RemoveIncompatibleFunctions(
  "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
  cl::desc("Enable removal of functions when they "
           "use features not supported by the target GPU"),
  cl::init(true));

static cl::opt<bool> EnableSDWAPeephole(
  "amdgpu-sdwa-peephole",
  cl::desc("Enable SDWA peepholer"),
  cl::init(true));

static cl::opt<bool> EnableDPPCombine(
  "amdgpu-dpp-combine",
  cl::desc("Enable DPP combiner"),
  cl::init(true));

// Enable address space based alias analysis
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
  cl::desc("Enable AMDGPU Alias Analysis"),
  cl::init(true));

// Option to run late CFG structurizer
static cl::opt<bool, true> LateCFGStructurize(
  "amdgpu-late-structurize",
  cl::desc("Enable late CFG structurization"),
  cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
  cl::Hidden);

// Enable lib calls simplifications
static cl::opt<bool> EnableLibCallSimplify(
  "amdgpu-simplify-libcall",
  cl::desc("Enable amdgpu library simplifications"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableLowerKernelArguments(
  "amdgpu-ir-lower-kernel-arguments",
  cl::desc("Lower kernel argument loads in IR pass"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableRegReassign(
  "amdgpu-reassign-regs",
  cl::desc("Enable register reassign optimizations on gfx10+"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> OptVGPRLiveRange(
  "amdgpu-opt-vgpr-liverange",
  cl::desc("Enable VGPR liverange optimizations for if-else structure"),
  cl::init(true), cl::Hidden);

// Enable atomic optimization
static cl::opt<bool> EnableAtomicOptimizations(
  "amdgpu-atomic-optimizations",
  cl::desc("Enable atomic optimizations"),
  cl::init(false),
  cl::Hidden);

// Enable Mode register optimization
static cl::opt<bool> EnableSIModeRegisterPass(
  "amdgpu-mode-register",
  cl::desc("Enable mode register pass"),
  cl::init(true),
  cl::Hidden);

// Enable GFX11+ s_delay_alu insertion
static cl::opt<bool>
    EnableInsertDelayAlu("amdgpu-enable-delay-alu",
                         cl::desc("Enable s_delay_alu insertion"),
                         cl::init(true), cl::Hidden);

// Enable GFX11+ VOPD
static cl::opt<bool>
    EnableVOPD("amdgpu-enable-vopd",
               cl::desc("Enable VOPD, dual issue of VALU in wave32"),
               cl::init(true), cl::Hidden);

// Option is used in lit tests to prevent deadcoding of patterns inspected.
static cl::opt<bool>
EnableDCEInRA("amdgpu-dce-in-ra",
    cl::init(true), cl::Hidden,
    cl::desc("Enable machine DCE inside regalloc"));

static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
                                           cl::desc("Adjust wave priority"),
                                           cl::init(false), cl::Hidden);

static cl::opt<bool> EnableScalarIRPasses(
  "amdgpu-scalar-ir-passes",
  cl::desc("Enable scalar IR passes"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableStructurizerWorkarounds(
    "amdgpu-enable-structurizer-workarounds",
    cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableLDSReplaceWithPointer(
    "amdgpu-enable-lds-replace-with-pointer",
    cl::desc("Enable LDS replace with pointer pass"), cl::init(false),
    cl::Hidden);

static cl::opt<bool, true> EnableLowerModuleLDS(
    "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
    cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePreRAOptimizations(
    "amdgpu-enable-pre-ra-optimizations",
    cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePromoteKernelArguments(
    "amdgpu-enable-promote-kernel-arguments",
    cl::desc("Enable promotion of flat kernel pointer arguments to global"),
    cl::Hidden, cl::init(true));

static cl::opt<bool> EnableMaxIlpSchedStrategy(
    "amdgpu-enable-max-ilp-scheduling-strategy",
    cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
    cl::Hidden, cl::init(false));

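// Usage note (illustrative, not part of the original source): each cl::opt
// above is an llc/opt flag; for example the SDWA peephole can be disabled for
// debugging with
//   llc -mtriple=amdgcn-amd-amdhsa -amdgpu-sdwa-peephole=0 foo.ll
// Most of these are cl::Hidden, so they are listed only by --help-hidden.
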
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
  // Register the target
  RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());

  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeGlobalISel(*PR);
  // (The long list of initialize*Pass(*PR) calls for the individual AMDGPU
  // backend passes was not preserved in this listing.)
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return std::make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG =
    new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createIGroupLPDAGMutation());
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
    new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
  DAG->addMutation(createIGroupLPDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
  return new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}

static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_ILP);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);

static MachineSchedRegistry
GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                             "Run GCN scheduler to maximize occupancy",
                             createGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
    GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
                           createGCNMaxILPMachineScheduler);

static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
    "gcn-iterative-max-occupancy-experimental",
    "Run GCN scheduler to maximize occupancy (experimental)",
    createIterativeGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry GCNMinRegSchedRegistry(
    "gcn-iterative-minreg",
    "Run GCN iterative scheduler for minimal register usage (experimental)",
    createMinRegScheduler);

static MachineSchedRegistry GCNILPSchedRegistry(
    "gcn-iterative-ilp",
    "Run GCN iterative scheduler for ILP scheduling (experimental)",
    createIterativeILPMachineScheduler);

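// Usage note (illustrative, not part of the original source): these
// MachineSchedRegistry entries hook into the generic MachineScheduler, so a
// strategy can be chosen with the target-independent -misched flag, e.g.
//   llc -mtriple=amdgcn-amd-amdhsa -misched=gcn-iterative-ilp foo.ll
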
static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat, non-integral buffer fat pointers.
  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"
         "-ni:7";
}
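
// Decoding the layout string (explanatory note, not in the original source):
// "pN:S:A" gives pointer size:alignment in bits for address space N (1 =
// global, 2 = region, 3 = local/LDS, 4 = constant, 5 = private, 6 = 32-bit
// constant), "A5" places allocas in the private address space, "G1" puts
// default globals in the global address space, and "ni:7" marks address
// space 7 (buffer fat pointers) as non-integral.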

static LLVM_READNONE StringRef getGPUOrDefault(const Triple &TT,
                                               StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  // Need to default to a target with flat support for HSA.
  if (TT.getArch() == Triple::amdgcn)
    return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";

  return "r600";
}

static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         TargetOptions Options,
                                         std::optional<Reloc::Model> RM,
                                         std::optional<CodeModel::Model> CM,
                                         CodeGenOpt::Level OptLevel)
    : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                        FS, Options, getEffectiveRelocModel(RM),
                        getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
      TLOF(createTLOF(getTargetTriple())) {
  initAsmInfo();
  if (TT.getArch() == Triple::amdgcn) {
    if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
    else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
  }
}

bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
bool AMDGPUTargetMachine::EnableFunctionCalls = false;
bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;

AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;

StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.isValid() ? FSAttr.getValueAsString()
                          : getTargetFeatureString();
}

/// Predicate for Internalize pass.
static bool mustPreserveGV(const GlobalValue &GV) {
  if (const Function *F = dyn_cast<Function>(&GV))
    return F->isDeclaration() || F->getName().startswith("__asan_") ||
           F->getName().startswith("__sanitizer_") ||
           AMDGPU::isEntryFunctionCC(F->getCallingConv());

  GV.removeDeadConstantUsers();
  return !GV.use_empty();
}
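
// Illustration (not in the original source): with -amdgpu-internalize-symbols,
// an entry point such as
//   define amdgpu_kernel void @k() { ... }
// is preserved (isEntryFunctionCC holds for amdgpu_kernel), while an unused
// externally visible helper function is internalized and later removed by
// GlobalDCE.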

void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
  AAM.registerFunctionAnalysis<AMDGPUAA>();
}

void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
  PB.registerPipelineParsingCallback(
      [this](StringRef PassName, ModulePassManager &PM,
             ArrayRef<PassBuilder::PipelineElement>) {
        if (PassName == "amdgpu-propagate-attributes-late") {
          PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
          return true;
        }
        if (PassName == "amdgpu-unify-metadata") {
          PM.addPass(AMDGPUUnifyMetadataPass());
          return true;
        }
        if (PassName == "amdgpu-printf-runtime-binding") {
          PM.addPass(AMDGPUPrintfRuntimeBindingPass());
          return true;
        }
        if (PassName == "amdgpu-always-inline") {
          PM.addPass(AMDGPUAlwaysInlinePass());
          return true;
        }
        if (PassName == "amdgpu-replace-lds-use-with-pointer") {
          PM.addPass(AMDGPUReplaceLDSUseWithPointerPass());
          return true;
        }
        if (PassName == "amdgpu-lower-module-lds") {
          PM.addPass(AMDGPULowerModuleLDSPass());
          return true;
        }
        if (PassName == "amdgpu-lower-ctor-dtor") {
          PM.addPass(AMDGPUCtorDtorLoweringPass());
          return true;
        }
        return false;
      });
  PB.registerPipelineParsingCallback(
      [this](StringRef PassName, FunctionPassManager &PM,
             ArrayRef<PassBuilder::PipelineElement>) {
        if (PassName == "amdgpu-simplifylib") {
          PM.addPass(AMDGPUSimplifyLibCallsPass(*this));
          return true;
        }
        if (PassName == "amdgpu-usenative") {
          PM.addPass(AMDGPUUseNativeCallsPass());
          return true;
        }
        if (PassName == "amdgpu-promote-alloca") {
          PM.addPass(AMDGPUPromoteAllocaPass(*this));
          return true;
        }
        if (PassName == "amdgpu-promote-alloca-to-vector") {
          PM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
          return true;
        }
        if (PassName == "amdgpu-lower-kernel-attributes") {
          PM.addPass(AMDGPULowerKernelAttributesPass());
          return true;
        }
        if (PassName == "amdgpu-propagate-attributes-early") {
          PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
          return true;
        }
        if (PassName == "amdgpu-promote-kernel-arguments") {
          PM.addPass(AMDGPUPromoteKernelArgumentsPass());
          return true;
        }
        return false;
      });

  PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) {
    FAM.registerPass([&] { return AMDGPUAA(); });
  });

  PB.registerParseAACallback([](StringRef AAName, AAManager &AAM) {
    if (AAName == "amdgpu-aa") {
      AAM.registerFunctionAnalysis<AMDGPUAA>();
      return true;
    }
    return false;
  });

  PB.registerPipelineStartEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        FunctionPassManager FPM;
        FPM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
        FPM.addPass(AMDGPUUseNativeCallsPass());
        if (EnableLibCallSimplify && Level != OptimizationLevel::O0)
          FPM.addPass(AMDGPUSimplifyLibCallsPass(*this));
        PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
      });

  PB.registerPipelineEarlySimplificationEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        PM.addPass(AMDGPUUnifyMetadataPass());
        PM.addPass(AMDGPUPrintfRuntimeBindingPass());

        if (InternalizeSymbols) {
          PM.addPass(InternalizePass(mustPreserveGV));
        }
        PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
        if (InternalizeSymbols) {
          PM.addPass(GlobalDCEPass());
        }
        if (EarlyInlineAll && !EnableFunctionCalls)
          PM.addPass(AMDGPUAlwaysInlinePass());
      });

  PB.registerCGSCCOptimizerLateEPCallback(
      [this](CGSCCPassManager &PM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FunctionPassManager FPM;

        // Add promote kernel arguments pass to the opt pipeline right before
        // infer address spaces which is needed to do actual address space
        // rewriting.
        if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
            EnablePromoteKernelArguments)
          FPM.addPass(AMDGPUPromoteKernelArgumentsPass());

        // Add infer address spaces pass to the opt pipeline after inlining
        // but before SROA to increase SROA opportunities.
        FPM.addPass(InferAddressSpacesPass());

        // This should run after inlining to have any chance of doing
        // anything, and before other cleanup optimizations.
        FPM.addPass(AMDGPULowerKernelAttributesPass());

        if (Level != OptimizationLevel::O0) {
          // Promote alloca to vector before SROA and loop unroll. If we
          // manage to eliminate allocas before unroll we may choose to unroll
          // less.
          FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
        }

        PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
      });
}

int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
  return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
          AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
          AddrSpace == AMDGPUAS::REGION_ADDRESS)
             ? -1
             : 0;
}
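
// Worked example (not in the original source): null in LDS, private, or
// region memory is the all-ones pattern, so a p3 (local) null lowers to
// 0xFFFFFFFF while flat/global/constant null stays 0; an addrspacecast of a
// null pointer between these spaces therefore cannot simply reuse the bits.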

bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
                                              unsigned DestAS) const {
  return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
         AMDGPU::isFlatGlobalAddrSpace(DestAS);
}

unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
  const auto *LD = dyn_cast<LoadInst>(V);
  if (!LD)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;

  // It must be a generic pointer loaded.
  assert(V->getType()->isPointerTy() &&
         V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);

  const auto *Ptr = LD->getPointerOperand();
  if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
  // For a generic pointer loaded from the constant memory, it could be assumed
  // as a global pointer since the constant memory is only populated on the
  // host side. As implied by the offload programming model, only global
  // pointers could be referenced on the host side.
  return AMDGPUAS::GLOBAL_ADDRESS;
}
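
// Illustration (not in the original source): for IR such as
//   %p = load ptr, ptr addrspace(4) %kernarg  ; generic ptr from constant mem
// this hook lets InferAddressSpaces assume %p is addrspace(1) (global), so
// flat accesses through %p can be rewritten as cheaper global accesses.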

std::pair<const Value *, unsigned>
AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
  if (auto *II = dyn_cast<IntrinsicInst>(V)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_is_shared:
      return std::pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
    case Intrinsic::amdgcn_is_private:
      return std::pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
    default:
      break;
    }
    return std::pair(nullptr, -1);
  }
  // Check the global pointer predication based on
  // (!is_shared(p) && !is_private(p)). Note that logical 'and' is commutative
  // and the order of 'is_shared' and 'is_private' is not significant.
  Value *Ptr;
  if (match(
          const_cast<Value *>(V),
          m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
                  m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
                      m_Deferred(Ptr))))))
    return std::pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);

  return std::pair(nullptr, -1);
}
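
// Illustration (not in the original source): IR of the shape
//   %s  = call i1 @llvm.amdgcn.is.shared(ptr %p)
//   %ns = xor i1 %s, true
//   %pr = call i1 @llvm.amdgcn.is.private(ptr %p)
//   %np = xor i1 %pr, true
//   %g  = and i1 %ns, %np
//   br i1 %g, label %global.path, label %other
// matches the m_c_And(m_Not(...), m_Not(...)) pattern above, so %p may be
// treated as a global (addrspace(1)) pointer inside %global.path.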

unsigned
AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
  switch (Kind) {
  case PseudoSourceValue::Stack:
  case PseudoSourceValue::FixedStack:
    return AMDGPUAS::PRIVATE_ADDRESS;
  case PseudoSourceValue::ConstantPool:
  case PseudoSourceValue::GOT:
  case PseudoSourceValue::JumpTable:
  case PseudoSourceValue::GlobalValueCallEntry:
  case PseudoSourceValue::ExternalSymbolCallEntry:
    return AMDGPUAS::CONSTANT_ADDRESS;
  }
  return AMDGPUAS::FLAT_ADDRESS;
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   TargetOptions Options,
                                   std::optional<Reloc::Model> RM,
                                   std::optional<CodeModel::Model> CM,
                                   CodeGenOpt::Level OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const TargetSubtargetInfo *
GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
  }

  I->setScalarizeGlobalBehavior(ScalarizeGlobal);

  return I.get();
}
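
// Illustration (not in the original source): the subtarget cache is keyed on
// the function's "target-cpu" and "target-features" attributes, so
//   attributes #0 = { "target-cpu"="gfx1030" }
//   attributes #1 = { "target-cpu"="gfx1030" "target-features"="+wavefrontsize64" }
// yield two distinct GCNSubtarget instances from SubtargetMap.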

TargetTransformInfo
GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
  return TargetTransformInfo(GCNTTIImpl(this, F));
}

//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//

std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
  return getStandardCSEConfigForOpt(TM->getOptLevel());
}

namespace {

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) {
    // It is necessary to know the register usage of the entire call graph. We
    // allow calls without EnableAMDGPUFunctionCalls if they are marked
    // noinline, so this is always required.
    setRequiresCodeGenSCCOrder(true);
    substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
  }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  ScheduleDAGInstrs *
  createPostMachineScheduler(MachineSchedContext *C) const override {
    ScheduleDAGMI *DAG = new GCNPostScheduleDAGMILive(
        C, std::make_unique<PostGenericScheduler>(C),
        /*RemoveKillFlags=*/true);
    const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
    if (ST.shouldClusterStores())
      DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
    DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
    DAG->addMutation(createIGroupLPDAGMutation());
    if (isPassEnabled(EnableVOPD, CodeGenOpt::Less))
      DAG->addMutation(createVOPDPairingMutation());
    return DAG;
  }

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  bool addIRTranslator() override;
  void addPreLegalizeMachineIR() override;
  bool addLegalizeMachineIR() override;
  void addPreRegBankSelect() override;
  bool addRegBankSelect() override;
  void addPreGlobalInstructionSelect() override;
  bool addGlobalInstructionSelect() override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  FunctionPass *createSGPRAllocPass(bool Optimized);
  FunctionPass *createVGPRAllocPass(bool Optimized);
  FunctionPass *createRegAllocPass(bool Optimized) override;

  bool addRegAssignAndRewriteFast() override;
  bool addRegAssignAndRewriteOptimized() override;

  void addPreRegAlloc() override;
  bool addPreRewrite() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // end anonymous namespace

AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
  // Exceptions and StackMaps are not supported, so these passes will never do
  // anything.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  // Garbage collection is not supported.
  disablePass(&GCLoweringID);
  disablePass(&ShadowStackGCLoweringID);
}

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createLICMPass());
  addPass(createSeparateConstOffsetFromGEPPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  addPass(createAMDGPUPrintfRuntimeBinding());
  addPass(createAMDGPUCtorDtorLoweringLegacyPass());

  // Run the propagate-attributes pass in the backend in case opt was not run.
  addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));

  addPass(createAMDGPULowerIntrinsicsPass());

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());
  // We need to add the barrier noop pass, otherwise adding the function
  // inlining pass will cause all of the PassConfigs passes to be run
  // one function at a time, which means if we have a module with two
  // functions, then we will generate code for the first function
  // without ever running any passes on the second.
  addPass(createBarrierNoopPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  if (TM.getTargetTriple().getArch() == Triple::r600)
    addPass(createR600OpenCLImageTypeLoweringPass());

  // Replace OpenCL enqueued block function pointers with global variables.
  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());

  // Can increase LDS used by kernel so runs before PromoteAlloca
  if (EnableLowerModuleLDS) {
    // The "amdgpu-replace-lds-use-with-pointer" pass needs to run before
    // "amdgpu-lower-module-lds", and is only meaningful when the
    // "amdgpu-lower-module-lds" pass is enabled.
    if (EnableLDSReplaceWithPointer)
      addPass(createAMDGPUReplaceLDSUseWithPointerPass());
    addPass(createAMDGPULowerModuleLDSPass());
  }

  if (TM.getOptLevel() > CodeGenOpt::None)
    addPass(createInferAddressSpacesPass());

  addPass(createAtomicExpandPass());

  if (TM.getOptLevel() > CodeGenOpt::None) {
    addPass(createAMDGPUPromoteAlloca());

    if (EnableSROA)
      addPass(createSROAPass());
    if (isPassEnabled(EnableScalarIRPasses))
      addStraightLineScalarOptimizationPasses();

    if (EnableAMDGPUAliasAnalysis) {
      addPass(createAMDGPUAAWrapperPass());
      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
        }));
    }

    if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
      // TODO: May want to move later or split into an early and late one.
      addPass(createAMDGPUCodeGenPreparePass());
    }
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  // %0 = add %a, %b
  // %1 = add %b, %a
  //
  // and
  //
  // %0 = shl nsw %a, 2
  // %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (isPassEnabled(EnableScalarIRPasses))
    addEarlyCSEOrGVNPass();
}

1052
1054
1055 // FIXME: This pass adds 2 hacky attributes that can be replaced with an
1056 // analysis, and should be removed.
1058 }
1059
1063
1065
1068
1069 // LowerSwitch pass may introduce unreachable blocks that can
1070 // cause unexpected behavior for subsequent passes. Placing it
1071 // here seems better that these blocks would get cleaned up by
1072 // UnreachableBlockElim inserted next in the pass flow.
1074}
1075
1079 return false;
1080}
1081
1084 return false;
1085}
1086
1088 // Do nothing. GC is not supported.
1089 return false;
1090}
1091
1094 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1096 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
1097 if (ST.shouldClusterStores())
1098 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
1099 return DAG;
1100}
1101
1103 BumpPtrAllocator &Allocator, const Function &F,
1104 const TargetSubtargetInfo *STI) const {
1105 return R600MachineFunctionInfo::create<R600MachineFunctionInfo>(
1106 Allocator, F, static_cast<const R600Subtarget *>(STI));
1107}
1108
1109//===----------------------------------------------------------------------===//
1110// GCN Pass Setup
1111//===----------------------------------------------------------------------===//
1112
1113ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
1114 MachineSchedContext *C) const {
1115 const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
1116 if (ST.enableSIScheduler())
1118
1121
1123}

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createAMDGPULateCodeGenPreparePass());

  if (isPassEnabled(EnableAtomicOptimizations, CodeGenOpt::Less)) {
    addPass(createAMDGPUAtomicOptimizerPass());
  }

  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createSinkingPass());

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(&AMDGPUUnifyDivergentExitNodesID);
  if (!LateCFGStructurize) {
    if (EnableStructurizerWorkarounds) {
      addPass(createFixIrreduciblePass());
      addPass(createUnifyLoopExitsPass());
    }
    addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
  }
  addPass(createAMDGPUAnnotateUniformValues());
  if (!LateCFGStructurize) {
    addPass(createSIAnnotateControlFlowPass());
    // TODO: Move this right after structurizeCFG to avoid extra divergence
    // analysis. This depends on stopping SIAnnotateControlFlow from making
    // control flow modifications.
    addPass(createAMDGPURewriteUndefForPHIPass());
  }
  addPass(createLCSSAPass());

  if (TM->getOptLevel() > CodeGenOpt::Less)
    addPass(&AMDGPUPerfHintAnalysisID);

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  if (EnableDPPCombine)
    addPass(&GCNDPPCombineID);
  addPass(&SILoadStoreOptimizerID);
  if (isPassEnabled(EnableSDWAPeephole)) {
    addPass(&SIPeepholeSDWAID);
    addPass(&EarlyMachineLICMID);
    addPass(&MachineCSEID);
    addPass(&SIFoldOperandsID);
  }
  addPass(&DeadMachineInstructionElimID);
  addPass(createSIShrinkInstructionsPass());
}

bool GCNPassConfig::addILPOpts() {
  if (EnableEarlyIfConversion)
    addPass(&EarlyIfConverterID);

  TargetPassConfig::addILPOpts();
  return false;
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(&SIFixSGPRCopiesID);
  addPass(createSILowerI1CopiesPass());
  return false;
}

bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator(getOptLevel()));
  return false;
}

void GCNPassConfig::addPreLegalizeMachineIR() {
  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
  addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
  addPass(new Localizer());
}

bool GCNPassConfig::addLegalizeMachineIR() {
  addPass(new Legalizer());
  return false;
}

void GCNPassConfig::addPreRegBankSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
  addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
}

bool GCNPassConfig::addRegBankSelect() {
  addPass(new AMDGPURegBankSelect());
  return false;
}

void GCNPassConfig::addPreGlobalInstructionSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
  addPass(createAMDGPURegBankCombiner(IsOptNone));
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  addPass(new InstructionSelect(getOptLevel()));
  return false;
}

void GCNPassConfig::addPreRegAlloc() {
  if (LateCFGStructurize) {
    addPass(createAMDGPUMachineCFGStructurizerPass());
  }
}

void GCNPassConfig::addFastRegAlloc() {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID);

  insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
  insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID);

  TargetPassConfig::addFastRegAlloc();
}

void GCNPassConfig::addOptimizedRegAlloc() {
  // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
  // instructions that cause scheduling barriers.
  insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
  insertPass(&MachineSchedulerID, &SIPreAllocateWWMRegsID);

  if (OptExecMaskPreRA)
    insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);

  if (isPassEnabled(EnablePreRAOptimizations))
    insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID);

  // This is not an essential optimization and it has a noticeable impact on
  // compilation time, so we only enable it from O2.
  if (TM->getOptLevel() > CodeGenOpt::Less)
    insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);

  // FIXME: when an instruction has a Killed operand, and the instruction is
  // inside a bundle, seems only the BUNDLE instruction appears as the Kills of
  // the register in LiveVariables, this would trigger a failure in verifier,
  // we should fix it and enable the verifier.
  if (OptVGPRLiveRange)
    insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID);
  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID);

  if (EnableDCEInRA)
    insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);

  TargetPassConfig::addOptimizedRegAlloc();
}

bool GCNPassConfig::addPreRewrite() {
  if (EnableRegReassign)
    addPass(&GCNNSAReassignID);
  return true;
}

FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
                  initializeDefaultSGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyRegisterAllocator(onlyAllocateSGPRs);

  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
                  initializeDefaultVGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyVGPRRegisterAllocator();

  return createFastVGPRRegisterAllocator();
}

FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
  llvm_unreachable("should not be used");
}

static const char RegAllocOptNotSupportedMessage[] =
    "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";

bool GCNPassConfig::addRegAssignAndRewriteFast() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(createSGPRAllocPass(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);

  addPass(createVGPRAllocPass(false));
  return true;
}

bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(createSGPRAllocPass(true));

  // Commit allocated register changes. This is mostly necessary because too
  // many things rely on the use lists of the physical registers, such as the
  // verifier. This is only necessary with allocators which use LiveIntervals,
  // since FastRegAlloc does the replacements itself.
  addPass(createVirtRegRewriter(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);

  addPass(createVGPRAllocPass(true));

  addPreRewrite();
  addPass(&VirtRegRewriterID);

  return true;
}
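
// Note (explanatory, not in the original source): register allocation is
// split into two phases. SGPRs are assigned and rewritten first, then
// SILowerSGPRSpills lowers SGPR spills (the SGPR analogue of PEI) and may
// create new VGPR virtual registers for spill lanes; VGPRs are allocated
// last so they see the full register demand, with addPreRewrite() adding
// GCNNSAReassign between VGPR assignment and the final rewrite.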

void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIOptimizeExecMaskingID);
  TargetPassConfig::addPostRegAlloc();
}

void GCNPassConfig::addPreSched2() {
  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createSIShrinkInstructionsPass());
  addPass(&SIPostRABundlerID);
}

void GCNPassConfig::addPreEmitPass() {
  if (isPassEnabled(EnableVOPD, CodeGenOpt::Less))
    addPass(&GCNCreateVOPDID);
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());

  addPass(createSIModeRegisterPass());

  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIInsertHardClausesID);

  addPass(&SILateBranchLoweringPassID);
  if (isPassEnabled(EnableSetWavePriority, CodeGenOpt::Less))
    addPass(createAMDGPUSetWavePriorityPass());
  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIPreEmitPeepholeID);
  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee to be able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);

  if (getOptLevel() > CodeGenOpt::Less)
    addPass(&AMDGPUReleaseVGPRsID);

  if (isPassEnabled(EnableInsertDelayAlu, CodeGenOpt::Less))
    addPass(&AMDGPUInsertDelayAluID);

  addPass(&BranchRelaxationPassID);
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}

MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo(
    BumpPtrAllocator &Allocator, const Function &F,
    const TargetSubtargetInfo *STI) const {
  return SIMachineFunctionInfo::create<SIMachineFunctionInfo>(
      Allocator, F, static_cast<const GCNSubtarget *>(STI));
}

yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
  return new yaml::SIMachineFunctionInfo();
}

yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return new yaml::SIMachineFunctionInfo(
      *MFI, *MF.getSubtarget<GCNSubtarget>().getRegisterInfo(), MF);
}

bool GCNTargetMachine::parseMachineFunctionInfo(
    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
    SMDiagnostic &Error, SMRange &SourceRange) const {
  const yaml::SIMachineFunctionInfo &YamlMFI =
      static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
  MachineFunction &MF = PFS.MF;
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
    return true;

  if (MFI->Occupancy == 0) {
    // Fixup the subtarget dependent default value.
    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
    MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
  }

  auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
    Register TempReg;
    if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
      SourceRange = RegName.SourceRange;
      return true;
    }
    RegVal = TempReg;

    return false;
  };

  auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
                                   Register &RegVal) {
    return !RegName.Value.empty() && parseRegister(RegName, RegVal);
  };

  if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
    return true;

  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
    const MemoryBuffer &Buffer =
        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
                         RegName.Value.size(), SourceMgr::DK_Error,
                         "incorrect register class for field", RegName.Value,
                         std::nullopt, std::nullopt);
    SourceRange = RegName.SourceRange;
    return true;
  };

  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
    return true;

  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
      !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
  }

  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
  }

  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
  }

  for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
    Register ParsedReg;
    if (parseRegister(YamlReg, ParsedReg))
      return true;

    MFI->reserveWWMRegister(ParsedReg);
  }

  auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A,
                                   const TargetRegisterClass &RC,
                                   ArgDescriptor &Arg, unsigned UserSGPRs,
                                   unsigned SystemSGPRs) {
    // Skip parsing if it's not present.
    if (!A)
      return false;

    if (A->IsRegister) {
      Register Reg;
      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
        SourceRange = A->RegisterName.SourceRange;
        return true;
      }
      if (!RC.contains(Reg))
        return diagnoseRegisterClass(A->RegisterName);
      Arg = ArgDescriptor::createRegister(Reg);
    } else
      Arg = ArgDescriptor::createStack(A->StackOffset);
    // Check and apply the optional mask.
    if (A->Mask)
      Arg = ArgDescriptor::createArg(Arg, *A->Mask);

    MFI->NumUserSGPRs += UserSGPRs;
    MFI->NumSystemSGPRs += SystemSGPRs;
    return false;
  };

  if (YamlMFI.ArgInfo &&
      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
                             AMDGPU::SGPR_128RegClass,
                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.QueuePtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.LDSKernelId, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
    return true;

  MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
  MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;

  // FIXME: Move proper support for denormal-fp-math into base MachineFunction
  MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals
                                      ? DenormalMode::IEEE
                                      : DenormalMode::PreserveSign;
  MFI->Mode.FP32Denormals.Output = YamlMFI.Mode.FP32OutputDenormals
                                       ? DenormalMode::IEEE
                                       : DenormalMode::PreserveSign;

  MFI->Mode.FP64FP16Denormals.Input = YamlMFI.Mode.FP64FP16InputDenormals
                                          ? DenormalMode::IEEE
                                          : DenormalMode::PreserveSign;
  MFI->Mode.FP64FP16Denormals.Output = YamlMFI.Mode.FP64FP16OutputDenormals
                                           ? DenormalMode::IEEE
                                           : DenormalMode::PreserveSign;

  return false;
}
static cl::opt< bool > EnableEarlyIfConversion("aarch64-enable-early-ifcvt", cl::Hidden, cl::desc("Run early if-conversion"), cl::init(true))
This is the AMGPU address space based alias analysis pass.
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
static cl::opt< bool > EnableDCEInRA("amdgpu-dce-in-ra", cl::init(true), cl::Hidden, cl::desc("Enable machine DCE inside regalloc"))
static cl::opt< bool, true > EnableLowerModuleLDS("amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"), cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true), cl::Hidden)
static MachineSchedRegistry SISchedRegistry("si", "Run SI's custom scheduler", createSIMachineScheduler)
static ScheduleDAGInstrs * createIterativeILPMachineScheduler(MachineSchedContext *C)
static cl::opt< bool > EarlyInlineAll("amdgpu-early-inline-all", cl::desc("Inline all functions early"), cl::init(false), cl::Hidden)
static cl::opt< bool > EnableLowerKernelArguments("amdgpu-ir-lower-kernel-arguments", cl::desc("Lower kernel argument loads in IR pass"), cl::init(true), cl::Hidden)
static ScheduleDAGInstrs * createGCNMaxILPMachineScheduler(MachineSchedContext *C)
static cl::opt< bool > EnableSDWAPeephole("amdgpu-sdwa-peephole", cl::desc("Enable SDWA peepholer"), cl::init(true))
static MachineSchedRegistry GCNMinRegSchedRegistry("gcn-iterative-minreg", "Run GCN iterative scheduler for minimal register usage (experimental)", createMinRegScheduler)
static cl::opt< bool > EnableSIModeRegisterPass("amdgpu-mode-register", cl::desc("Enable mode register pass"), cl::init(true), cl::Hidden)
static cl::opt< bool > EnableDPPCombine("amdgpu-dpp-combine", cl::desc("Enable DPP combiner"), cl::init(true))
static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry("gcn-iterative-max-occupancy-experimental", "Run GCN scheduler to maximize occupancy (experimental)", createIterativeGCNMaxOccupancyMachineScheduler)
static cl::opt< bool > EnableSetWavePriority("amdgpu-set-wave-priority", cl::desc("Adjust wave priority"), cl::init(false), cl::Hidden)
static cl::opt< bool > EnableLDSReplaceWithPointer("amdgpu-enable-lds-replace-with-pointer", cl::desc("Enable LDS replace with pointer pass"), cl::init(false), cl::Hidden)
static cl::opt< bool > OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden, cl::desc("Run pre-RA exec mask optimizations"), cl::init(true))
static cl::opt< bool > EnablePromoteKernelArguments("amdgpu-enable-promote-kernel-arguments", cl::desc("Enable promotion of flat kernel pointer arguments to global"), cl::Hidden, cl::init(true))
static cl::opt< bool > EnableLibCallSimplify("amdgpu-simplify-libcall", cl::desc("Enable amdgpu library simplifications"), cl::init(true), cl::Hidden)
static MachineSchedRegistry GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp", createGCNMaxILPMachineScheduler)
static cl::opt< bool > InternalizeSymbols("amdgpu-internalize-symbols", cl::desc("Enable elimination of non-kernel functions and unused globals"), cl::init(false), cl::Hidden)
static cl::opt< bool > EnableAtomicOptimizations("amdgpu-atomic-optimizations", cl::desc("Enable atomic optimizations"), cl::init(false), cl::Hidden)
static LLVM_READNONE StringRef getGPUOrDefault(const Triple &TT, StringRef GPU)
static Reloc::Model getEffectiveRelocModel(std::optional< Reloc::Model > RM)
static cl::opt< bool > EnableStructurizerWorkarounds("amdgpu-enable-structurizer-workarounds", cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true), cl::Hidden)
static cl::opt< bool > EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, cl::desc("Enable AMDGPU Alias Analysis"), cl::init(true))
static ScheduleDAGInstrs * createMinRegScheduler(MachineSchedContext *C)
static cl::opt< bool, true > LateCFGStructurize("amdgpu-late-structurize", cl::desc("Enable late CFG structurization"), cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG), cl::Hidden)
static cl::opt< bool > EnableInsertDelayAlu("amdgpu-enable-delay-alu", cl::desc("Enable s_delay_alu insertion"), cl::init(true), cl::Hidden)
static ScheduleDAGInstrs * createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C)
static cl::opt< bool > EnableLoadStoreVectorizer("amdgpu-load-store-vectorizer", cl::desc("Enable load store vectorizer"), cl::init(true), cl::Hidden)
static cl::opt< bool > EnableMaxIlpSchedStrategy("amdgpu-enable-max-ilp-scheduling-strategy", cl::desc("Enable scheduling strategy to maximize ILP for a single wave."), cl::Hidden, cl::init(false))
static bool mustPreserveGV(const GlobalValue &GV)
Predicate for Internalize pass.
LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget()
static cl::opt< bool > EnableSROA("amdgpu-sroa", cl::desc("Run SROA after promote alloca pass"), cl::ReallyHidden, cl::init(true))
static cl::opt< bool > RemoveIncompatibleFunctions("amdgpu-enable-remove-incompatible-functions", cl::Hidden, cl::desc("Enable removal of functions when they" "use features not supported by the target GPU"), cl::init(true))
static cl::opt< bool > EnableScalarIRPasses("amdgpu-scalar-ir-passes", cl::desc("Enable scalar IR passes"), cl::init(true), cl::Hidden)
static cl::opt< bool > EnableRegReassign("amdgpu-reassign-regs", cl::desc("Enable register reassign optimizations on gfx10+"), cl::init(true), cl::Hidden)
static cl::opt< bool > OptVGPRLiveRange("amdgpu-opt-vgpr-liverange", cl::desc("Enable VGPR liverange optimizations for if-else structure"), cl::init(true), cl::Hidden)
static ScheduleDAGInstrs * createSIMachineScheduler(MachineSchedContext *C)
static cl::opt< bool > EnablePreRAOptimizations("amdgpu-enable-pre-ra-optimizations", cl::desc("Enable Pre-RA optimizations pass"), cl::init(true), cl::Hidden)
static cl::opt< bool > EnableVOPD("amdgpu-enable-vopd", cl::desc("Enable VOPD, dual issue of VALU in wave32"), cl::init(true), cl::Hidden)
static cl::opt< bool > EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden, cl::desc("Run early if-conversion"), cl::init(false))
static ScheduleDAGInstrs * createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C)
static MachineSchedRegistry GCNILPSchedRegistry("gcn-iterative-ilp", "Run GCN iterative scheduler for ILP scheduling (experimental)", createIterativeILPMachineScheduler)
static cl::opt< bool > ScalarizeGlobal("amdgpu-scalarize-global-loads", cl::desc("Enable global load scalarization"), cl::init(true), cl::Hidden)
static const char RegAllocOptNotSupportedMessage[]
static MachineSchedRegistry GCNMaxOccupancySchedRegistry("gcn-max-occupancy", "Run GCN scheduler to maximize occupancy", createGCNMaxOccupancyMachineScheduler)
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file declares the AMDGPU-specific subclass of TargetLoweringObjectFile.
This file a TargetTransformInfo::Concept conforming object specific to the AMDGPU target machine.
Provides passes to inlining "always_inline" functions.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
This header provides classes for managing passes over SCCs of the call graph.
Provides analysis for continuously CSEing during GISel passes.
#define LLVM_READNONE
Definition: Compiler.h:189
#define LLVM_EXTERNAL_VISIBILITY
Definition: Compiler.h:127
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
This file defines the class GCNIterativeScheduler, which uses an iterative approach to find a best sc...
This file provides the interface for LLVM's Global Value Numbering pass which eliminates fully redund...
This file declares the IRTranslator pass.
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
static std::string computeDataLayout()
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
FunctionAnalysisManager FAM
const char LLVMTargetMachineRef TM
PassBuilder PB(Machine, PassOpts->PTO, std::nullopt, &PIC)
This header defines various interfaces for pass management in LLVM.
The AMDGPU TargetMachine interface definition for hw codegen targets.
Basic Register Allocator
This file describes the interface of the MachineFunctionPass responsible for assigning the generic vi...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
SI Machine Scheduler interface.
static FunctionPass * useDefaultRegisterAllocator()
-regalloc=... command line option.
Target-Independent Code Generator Pass Configuration Options pass.
static std::unique_ptr< TargetLoweringObjectFile > createTLOF()
static const char PassName[]
A manager for alias analyses.
void registerFunctionAnalysis()
Register a specific AA result.
void addAAResult(AAResultT &AAResult)
Register a specific AA result.
Legacy wrapper pass to provide the AMDGPUAAResult object.
Analysis pass providing a never-invalidated alias analysis result.
Lower llvm.global_ctors and llvm.global_dtors to special kernels.
AMDGPUTargetMachine & getAMDGPUTargetMachine() const
std::unique_ptr< CSEConfigBase > getCSEConfig() const override
Returns the CSEConfig object to use for the current optimization level.
ScheduleDAGInstrs * createMachineScheduler(MachineSchedContext *C) const override
Create an instance of ScheduleDAGInstrs to be run within the standard MachineScheduler pass for this ...
bool addPreISel() override
Methods with trivial inline returns are convenient points in the common codegen pass pipeline where t...
bool isPassEnabled(const cl::opt< bool > &Opt, CodeGenOpt::Level Level=CodeGenOpt::Default) const
Check if a pass is enabled given Opt option.
bool addInstSelector() override
addInstSelector - This method should install an instruction selector pass, which converts from LLVM c...
bool addGCPasses() override
addGCPasses - Add late codegen passes that analyze code for garbage collection.
AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
void addIRPasses() override
Add common target configurable passes that perform LLVM IR to IR transforms following machine indepen...
void addCodeGenPrepare() override
Add pass to prepare the LLVM IR for code generation.
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
unsigned getAddressSpaceForPseudoSourceKind(unsigned Kind) const override
getAddressSpaceForPseudoSourceKind - Given the kind of memory (e.g.
const TargetSubtargetInfo * getSubtargetImpl() const
void registerDefaultAliasAnalyses(AAManager &) override
Allow the target to register alias analyses with the AAManager for use with the new pass manager.
std::pair< const Value *, unsigned > getPredicatedAddrSpace(const Value *V) const override
If the specified predicate checks whether a generic pointer falls within a specified address space,...
StringRef getFeatureString(const Function &F) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
void registerPassBuilderCallbacks(PassBuilder &PB) override
Allow the target to modify the pass pipeline.
AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM, CodeGenOpt::Level OL)
StringRef getGPUName(const Function &F) const
unsigned getAssumedAddrSpace(const Value *V) const override
If the specified generic pointer could be assumed as a pointer to a specific address space,...
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:620
bool registerPass(PassBuilderT &&PassBuilder)
Register an analysis pass with the manager.
Definition: PassManager.h:836
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:317
bool isValid() const
Return true if the attribute is any kind of attribute.
Definition: Attributes.h:187
Allocate memory in an ever growing pool, as if by bump-pointer.
Definition: Allocator.h:66
void removeDeadConstantUsers() const
If there are any dead constant users dangling off of this constant, remove them.
Definition: Constants.cpp:708
Lightweight error class with error context and mandatory checking.
Definition: Error.h:156
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:308
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:235
TargetTransformInfo getTargetTransformInfo(const Function &F) const override
Get a TargetTransformInfo implementation for the target.
bool parseMachineFunctionInfo(const yaml::MachineFunctionInfo &, PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange) const override
Parse out the target's MachineFunctionInfo from the YAML reprsentation.
GCNTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM, CodeGenOpt::Level OL, bool JIT)
yaml::MachineFunctionInfo * convertFuncInfoToYAML(const MachineFunction &MF) const override
Allocate and initialize an instance of the YAML representation of the MachineFunctionInfo.
yaml::MachineFunctionInfo * createDefaultFuncInfoYAML() const override
Allocate and return a default initialized instance of the YAML representation for the MachineFunction...
TargetPassConfig * createPassConfig(PassManagerBase &PM) override
Create a pass configuration object to be used by addPassToEmitX methods for generating a pipeline of ...
MachineFunctionInfo * createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F, const TargetSubtargetInfo *STI) const override
Create the target's instance of MachineFunctionInfo.
Pass to remove unused function declarations.
Definition: GlobalDCE.h:36
This pass is responsible for selecting generic machine instructions to target-specific instructions.
A pass that internalizes all functions and variables other than those that must be preserved accordin...
Definition: Internalize.h:35
This class describes a target machine that is implemented with the LLVM target-independent code gener...
This pass implements the localization mechanism described at the top of this file.
Definition: Localizer.h:43
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineSchedRegistry provides a selection of available machine instruction schedulers.
This interface provides simple read-only access to a block of memory, and provides simple methods for...
Definition: MemoryBuffer.h:51
virtual StringRef getBufferIdentifier() const
Return an identifier for this buffer, typically the filename it was read from.
Definition: MemoryBuffer.h:76
static const OptimizationLevel O0
Disable as many optimizations as possible.
unsigned getSpeedupLevel() const
static const OptimizationLevel O1
Optimize quickly without destroying debuggability.
This class provides access to building LLVM's passes.
Definition: PassBuilder.h:100
void registerPipelineEarlySimplificationEPCallback(const std::function< void(ModulePassManager &, OptimizationLevel)> &C)
Register a callback for a default optimizer pipeline extension point.
Definition: PassBuilder.h:452
void registerPipelineStartEPCallback(const std::function< void(ModulePassManager &, OptimizationLevel)> &C)
Register a callback for a default optimizer pipeline extension point.
Definition: PassBuilder.h:443
void registerParseAACallback(const std::function< bool(StringRef Name, AAManager &AA)> &C)
Register a callback for parsing an AliasAnalysis Name to populate the given AAManager AA.
Definition: PassBuilder.h:495
void registerAnalysisRegistrationCallback(const std::function< void(CGSCCAnalysisManager &)> &C)
Register callbacks for analysis registration with this PassBuilder instance.
Definition: PassBuilder.h:503
void registerCGSCCOptimizerLateEPCallback(const std::function< void(CGSCCPassManager &, OptimizationLevel)> &C)
Register a callback for a default optimizer pipeline extension point.
Definition: PassBuilder.h:422
void registerPipelineParsingCallback(const std::function< bool(StringRef Name, CGSCCPassManager &, ArrayRef< PipelineElement >)> &C)
Register pipeline parsing callbacks with this pass builder instance.
Definition: PassBuilder.h:525
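A minimal sketch of how these extension points are used (the helper name and the choice of GlobalDCEPass are illustrative only, not this file's actual registration):
  #include "llvm/Passes/PassBuilder.h"
  #include "llvm/Transforms/IPO/GlobalDCE.h"
  using namespace llvm;
  // Hypothetical helper: queue a module pass ahead of the default
  // pipeline at -O1 and above via the pipeline-start extension point.
  void registerMyCallbacks(PassBuilder &PB) {
    PB.registerPipelineStartEPCallback(
        [](ModulePassManager &MPM, OptimizationLevel Level) {
          if (Level != OptimizationLevel::O0)
            MPM.addPass(GlobalDCEPass());
        });
  }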
LLVM_ATTRIBUTE_MINSIZE std::enable_if_t<!std::is_same< PassT, PassManager >::value > addPass(PassT &&Pass)
Definition: PassManager.h:544
PassRegistry - This class manages the registration and initialization of the pass subsystem as appli...
Definition: PassRegistry.h:38
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
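Legacy-PM passes are registered once against this global registry, typically from a target's initializer. A sketch, using two of the initialize* functions declared elsewhere in this index (the wrapper name is hypothetical):
  #include "llvm/PassRegistry.h"
  using namespace llvm;
  // Hypothetical wrapper around the one-time legacy pass registration
  // a target's LLVMInitialize*Target entry point performs.
  void initMyPasses() {
    PassRegistry &PR = *PassRegistry::getPassRegistry();
    initializeSIFoldOperandsPass(PR);
    initializeSILoadStoreOptimizerPass(PR);
  }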
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:91
MachineFunctionInfo * createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F, const TargetSubtargetInfo *STI) const override
Create the target's instance of MachineFunctionInfo.
RegisterPassParser class - Handle the addition of new machine passes.
RegisterRegAllocBase class - Track the registration of register allocators.
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool initializeBaseYamlFields(const yaml::SIMachineFunctionInfo &YamlMFI, const MachineFunction &MF, PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange)
Instances of this class encapsulate one diagnostic report, allowing printing to a raw_ostream as a ca...
Definition: SourceMgr.h:281
Represents a location in source code.
Definition: SMLoc.h:23
Represents a range in source code.
Definition: SMLoc.h:48
A ScheduleDAG for scheduling lists of MachineInstr.
ScheduleDAGMILive is an implementation of ScheduleDAGInstrs that schedules machine instructions while...
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
void addMutation(std::unique_ptr< ScheduleDAGMutation > Mutation)
Add a postprocessing step to the DAG builder.
const TargetInstrInfo * TII
Target instruction information.
Definition: ScheduleDAG.h:557
const TargetRegisterInfo * TRI
Target processor register info.
Definition: ScheduleDAG.h:558
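addMutation() is the hook through which the DAG mutations listed later in this index (load/store clustering, macro fusion, IGroupLP, VOPD pairing) are attached to a scheduler instance. A sketch of a scheduler factory in that shape (factory name hypothetical):
  #include "llvm/CodeGen/MachineScheduler.h"
  using namespace llvm;
  // Build a generic live-interval scheduler and attach clustering
  // mutations; TII/TRI are the public members shown above.
  static ScheduleDAGInstrs *createMyScheduler(MachineSchedContext *C) {
    ScheduleDAGMILive *DAG = createGenericSchedLive(C);
    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
    return DAG;
  }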
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition: SmallString.h:68
unsigned getMainFileID() const
Definition: SourceMgr.h:132
const MemoryBuffer * getMemoryBuffer(unsigned i) const
Definition: SourceMgr.h:125
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:134
Triple TargetTriple
Triple string, CPU name, and target feature strings the TargetMachine instance is created with.
Definition: TargetMachine.h:97
const Triple & getTargetTriple() const
CodeGenOpt::Level getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const MCSubtargetInfo * getMCSubtargetInfo() const
StringRef getTargetFeatureString() const
StringRef getTargetCPU() const
std::unique_ptr< const MCSubtargetInfo > STI
void resetTargetOptions(const Function &F) const
Reset the target options based on the function's attributes.
std::unique_ptr< const MCRegisterInfo > MRI
Target-Independent Code Generator Pass Configuration Options.
LLVMTargetMachine * TM
virtual void addCodeGenPrepare()
Add pass to prepare the LLVM IR for code generation.
virtual bool addILPOpts()
Add passes that optimize instruction level parallelism for out-of-order targets.
virtual void addPostRegAlloc()
This method may be implemented by targets that want to run passes after register allocation pass pipe...
virtual void addOptimizedRegAlloc()
addOptimizedRegAlloc - Add passes related to register allocation.
CodeGenOpt::Level getOptLevel() const
virtual void addIRPasses()
Add common target configurable passes that perform LLVM IR to IR transforms following machine indepen...
virtual void addFastRegAlloc()
addFastRegAlloc - Add the minimum set of target-independent passes that are required for fast registe...
virtual void addMachineSSAOptimization()
addMachineSSAOptimization - Add standard passes that optimize machine instructions in SSA form.
void disablePass(AnalysisID PassID)
Allow the target to disable a specific standard pass by default.
AnalysisID addPass(AnalysisID PassID)
Utilities for targets to add passes to the pass manager.
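These TargetPassConfig hooks are what a target overrides to customize the codegen pipeline; addPass() and disablePass() adjust the standard pass list. A minimal sketch (class name hypothetical; the two passes referenced appear elsewhere in this index):
  #include "llvm/CodeGen/TargetPassConfig.h"
  using namespace llvm;
  class MyGPUPassConfig : public TargetPassConfig {
  public:
    MyGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
        : TargetPassConfig(TM, PM) {
      disablePass(&PatchableFunctionID); // drop a standard pass by ID
    }
    void addIRPasses() override {
      addPass(createAtomicExpandPass()); // lower atomics before ISel
      TargetPassConfig::addIRPasses();
    }
  };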
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
TargetSubtargetInfo - Generic base class for all target subtargets.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition: Triple.h:356
LLVM Value Representation.
Definition: Value.h:74
bool use_empty() const
Definition: Value.h:344
PassManagerBase - An abstract interface to allow code to add passes to a pass manager without having ...
Interfaces for registering analysis passes, producing common pass manager configurations,...
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
Definition: AMDGPU.h:378
@ LOCAL_ADDRESS
Address space for local memory.
Definition: AMDGPU.h:381
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
Definition: AMDGPU.h:380
@ UNKNOWN_ADDRESS_SPACE
Definition: AMDGPU.h:417
@ FLAT_ADDRESS
Address space for flat memory.
Definition: AMDGPU.h:376
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
Definition: AMDGPU.h:377
@ PRIVATE_ADDRESS
Address space for private memory.
Definition: AMDGPU.h:382
bool isFlatGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:424
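These enumerators are the AMDGPU address-space numbering used throughout the backend; isFlatGlobalAddrSpace() groups the spaces that alias flat memory. A sketch of the predicate's behavior (namespaces as in AMDGPU.h):
  // FLAT, GLOBAL and CONSTANT form one aliasing group; LDS
  // (LOCAL_ADDRESS) and scratch (PRIVATE_ADDRESS) are outside it.
  bool A = AMDGPU::isFlatGlobalAddrSpace(AMDGPUAS::GLOBAL_ADDRESS); // true
  bool B = AMDGPU::isFlatGlobalAddrSpace(AMDGPUAS::LOCAL_ADDRESS);  // false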
bool isEntryFunctionCC(CallingConv::ID CC)
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Level
Code generation optimization level.
Definition: CodeGen.h:57
@ Aggressive
-O3
Definition: CodeGen.h:61
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
Definition: PatternMatch.h:790
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
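Together these matchers let backend code recognize IR shapes with commutative operands; m_Deferred() reuses a value bound earlier in the same match() call. A small sketch (function name hypothetical):
  #include "llvm/IR/PatternMatch.h"
  // Recognize "X & ~X" with the operands in either order; m_Value binds
  // X on first use, m_Deferred requires the second use to be that same X,
  // and m_Not matches ~X in its "xor X, -1" form.
  static bool isAndWithOwnNot(llvm::Value *V) {
    using namespace llvm::PatternMatch;
    llvm::Value *X;
    return match(V, m_c_And(m_Value(X), m_Not(m_Deferred(X))));
  }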
@ ReallyHidden
Definition: CommandLine.h:139
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:445
LocationClass< Ty > location(Ty &L)
Definition: CommandLine.h:465
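cl::init sets an option's default, cl::location stores the parsed value into external storage, and cl::ReallyHidden keeps the option out of both -help and -help-hidden. A sketch (option name and backing variable are hypothetical):
  #include "llvm/Support/CommandLine.h"
  using namespace llvm;
  static bool EnableFoo;                 // hypothetical backing store
  static cl::opt<bool, true>             // 'true' selects external storage
      FooFlag("amdgpu-enable-foo", cl::location(EnableFoo),
              cl::init(false), cl::ReallyHidden);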
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
FunctionPass * createFlattenCFGPass()
void initializeSIFormMemoryClausesPass(PassRegistry &)
char & SIPreAllocateWWMRegsID
FunctionPass * createFastRegisterAllocator()
FastRegisterAllocation Pass - This pass register allocates as fast as possible.
char & EarlyMachineLICMID
This pass performs loop invariant code motion on machine instructions.
ImmutablePass * createAMDGPUAAWrapperPass()
char & PostRAHazardRecognizerID
PostRAHazardRecognizer - This pass runs the post-ra hazard recognizer.
FunctionPass * createAMDGPUSetWavePriorityPass()
Pass * createLCSSAPass()
Definition: LCSSA.cpp:491
void initializeAMDGPUUseNativeCallsPass(PassRegistry &)
void initializeGCNCreateVOPDPass(PassRegistry &)
ModulePass * createAMDGPUOpenCLEnqueuedBlockLoweringPass()
char & GCNPreRAOptimizationsID
char & GCLoweringID
GCLowering Pass - Used by gc.root to perform its default lowering operations.
void initializeGCNPreRAOptimizationsPass(PassRegistry &)
Pass * createLoadStoreVectorizerPass()
Create a legacy pass manager instance of the LoadStoreVectorizer pass.
void initializeAMDGPUDAGToDAGISelPass(PassRegistry &)
char & SIPostRABundlerID
FunctionPass * createSIModeRegisterPass()
FunctionPass * createGreedyRegisterAllocator()
Greedy register allocation pass - This pass implements a global register allocator for optimized buil...
void initializeAMDGPUAAWrapperPassPass(PassRegistry &)
void initializeR600ClauseMergePassPass(PassRegistry &)
void initializeSIModeRegisterPass(PassRegistry &)
ModulePass * createAMDGPUCtorDtorLoweringLegacyPass()
void initializeSIOptimizeVGPRLiveRangePass(PassRegistry &)
ModuleToFunctionPassAdaptor createModuleToFunctionPassAdaptor(FunctionPassT &&Pass, bool EagerlyInvalidate=false)
A function to deduce a function pass type and wrap it in the templated adaptor.
Definition: PassManager.h:1218
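The adaptor is how a function pass is queued into a ModulePassManager; createCGSCCToFunctionPassAdaptor below plays the same role for CGSCC pipelines. A sketch with a stock function pass (mem2reg, chosen purely for illustration):
  #include "llvm/IR/PassManager.h"
  #include "llvm/Transforms/Utils/Mem2Reg.h"
  using namespace llvm;
  // Hypothetical helper: run PromotePass over every function in the module.
  void buildPipeline(ModulePassManager &MPM) {
    MPM.addPass(createModuleToFunctionPassAdaptor(PromotePass()));
  }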
void initializeAMDGPULateCodeGenPreparePass(PassRegistry &)
void initializeAMDGPUAttributorPass(PassRegistry &)
FunctionPass * createAMDGPUPreLegalizeCombiner(bool IsOptNone)
FunctionPass * createAMDGPUPostLegalizeCombiner(bool IsOptNone)
void initializeAMDGPUAnnotateUniformValuesPass(PassRegistry &)
FunctionPass * createAMDGPUISelDag(TargetMachine &TM, CodeGenOpt::Level OptLevel)
This pass converts a legalized DAG into an AMDGPU-specific DAG.
void initializeSIShrinkInstructionsPass(PassRegistry &)
char & SIFoldOperandsID
FunctionPass * createAtomicExpandPass()
AtomicExpandPass - At IR level this pass replace atomic instructions with __atomic_* library calls,...
void initializeAMDGPUPropagateAttributesLatePass(PassRegistry &)
char & SILoadStoreOptimizerID
FunctionPass * createNaryReassociatePass()
char & PatchableFunctionID
This pass implements the "patchable-function" attribute.
char & PostRASchedulerID
PostRAScheduler - This pass performs post register allocation scheduling.
void initializeR600ExpandSpecialInstrsPassPass(PassRegistry &)
void initializeR600PacketizerPass(PassRegistry &)
char & AMDGPUReleaseVGPRsID
std::unique_ptr< ScheduleDAGMutation > createVOPDPairingMutation()
ModulePass * createAMDGPULowerIntrinsicsPass()
ModulePass * createAMDGPUAlwaysInlinePass(bool GlobalOpt=true)
void initializeSIPreEmitPeepholePass(PassRegistry &)
std::unique_ptr< ScheduleDAGMutation > createStoreClusterDAGMutation(const TargetInstrInfo *TII, const TargetRegisterInfo *TRI)
void initializeSIFixVGPRCopiesPass(PassRegistry &)
char & MachineSchedulerID
MachineScheduler - This pass schedules machine instructions.
Pass * createStructurizeCFGPass(bool SkipUniformRegions=false)
When SkipUniformRegions is true the structurizer will not structurize regions that only contain uniform...
void initializeAMDGPURemoveIncompatibleFunctionsPass(PassRegistry &)
void initializeGCNNSAReassignPass(PassRegistry &)
std::unique_ptr< CSEConfigBase > getStandardCSEConfigForOpt(CodeGenOpt::Level Level)
Definition: CSEInfo.cpp:79
char & PostMachineSchedulerID
PostMachineScheduler - This pass schedules machine instructions postRA.
void initializeSIInsertWaitcntsPass(PassRegistry &)
Pass * createLICMPass()
Definition: LICM.cpp:359
ScheduleDAGMILive * createGenericSchedLive(MachineSchedContext *C)
Create the standard converging machine scheduler.
char & SIFormMemoryClausesID
void initializeAMDGPUCtorDtorLoweringLegacyPass(PassRegistry &)
void initializeAMDGPURegBankCombinerPass(PassRegistry &)
void initializeSILoadStoreOptimizerPass(PassRegistry &)
void initializeSILateBranchLoweringPass(PassRegistry &)
void initializeSIPeepholeSDWAPass(PassRegistry &)
char & AMDGPUUnifyDivergentExitNodesID
ModulePass * createAMDGPULowerModuleLDSPass()
char & ShadowStackGCLoweringID
ShadowStackGCLowering - Implements the custom lowering mechanism used by the shadow stack GC.
char & GCNNSAReassignID
void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &)
void initializeAMDGPUExternalAAWrapperPass(PassRegistry &)
void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &)
char & AMDGPUPerfHintAnalysisID
char & SILowerSGPRSpillsID
CodeModel::Model getEffectiveCodeModel(std::optional< CodeModel::Model > CM, CodeModel::Model Default)
Helper method for getting the code model, returning Default if CM does not have a value.
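A one-line sketch of how a TargetMachine constructor resolves its optional code model (the wrapper function is hypothetical; the default chosen here is illustrative):
  // Fall back to CodeModel::Small when the caller requested none.
  llvm::CodeModel::Model resolveCM(std::optional<llvm::CodeModel::Model> CM) {
    return llvm::getEffectiveCodeModel(CM, llvm::CodeModel::Small);
  }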
char & SILateBranchLoweringPassID
void initializeAMDGPURewriteUndefForPHIPass(PassRegistry &)
char & BranchRelaxationPassID
BranchRelaxation - This pass replaces branches that need to jump further than is supported by a branc...
FunctionPass * createSinkingPass()
Definition: Sink.cpp:277
CGSCCToFunctionPassAdaptor createCGSCCToFunctionPassAdaptor(FunctionPassT &&Pass, bool EagerlyInvalidate=false, bool NoRerun=false)
A function to deduce a function pass type and wrap it in the templated adaptor.
std::unique_ptr< ScheduleDAGMutation > createIGroupLPDAGMutation()
FunctionPass * createSIShrinkInstructionsPass()
void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &)
void initializeSIPostRABundlerPass(PassRegistry &)
FunctionPass * createAMDGPUAtomicOptimizerPass()
void initializeAMDGPUReplaceLDSUseWithPointerPass(PassRegistry &)
void initializeAMDGPUPromoteAllocaToVectorPass(PassRegistry &)
void initializeAMDGPULowerModuleLDSPass(PassRegistry &)
std::unique_ptr< ScheduleDAGMutation > createLoadClusterDAGMutation(const TargetInstrInfo *TII, const TargetRegisterInfo *TRI)
void initializeSIWholeQuadModePass(PassRegistry &)
FunctionPass * createAMDGPULowerKernelArgumentsPass()
char & AMDGPUInsertDelayAluID
Pass * createAMDGPUAnnotateKernelFeaturesPass()
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:145
std::unique_ptr< ScheduleDAGMutation > createAMDGPUMacroFusionDAGMutation()
Note that you have to add: DAG.addMutation(createAMDGPUMacroFusionDAGMutation()); to AMDGPUPassConfig...
char & StackMapLivenessID
StackMapLiveness - This pass analyses the register live-out set of stackmap/patchpoint intrinsics and...
char & SIOptimizeVGPRLiveRangeID
FunctionPass * createUnifyLoopExitsPass()
char & SIOptimizeExecMaskingPreRAID
FunctionPass * createFixIrreduciblePass()
FunctionPass * createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *)
char & FuncletLayoutID
This pass lays out funclets contiguously.
void initializeSIInsertHardClausesPass(PassRegistry &)
void initializeAMDGPUSimplifyLibCallsPass(PassRegistry &)
char & DetectDeadLanesID
This pass adds dead/undef flags after analyzing subregister lanes.
void initializeAMDGPUPostLegalizerCombinerPass(PassRegistry &)
void initializeSIAnnotateControlFlowPass(PassRegistry &)
ModulePass * createAMDGPUPrintfRuntimeBinding()
void initializeSIMemoryLegalizerPass(PassRegistry &)
Pass * createAlwaysInlinerLegacyPass(bool InsertLifetime=true)
Create a legacy pass manager instance of a pass to inline and remove functions marked as "always_inli...
Target & getTheAMDGPUTarget()
The target which supports all AMD GPUs.
void initializeR600ControlFlowFinalizerPass(PassRegistry &)
ModulePass * createAMDGPUReplaceLDSUseWithPointerPass()
void initializeAMDGPUPropagateAttributesEarlyPass(PassRegistry &)
FunctionPass * createAMDGPUAnnotateUniformValues()
void initializeAMDGPUPreLegalizerCombinerPass(PassRegistry &)
FunctionPass * createAMDGPUPromoteAlloca()
FunctionPass * createSeparateConstOffsetFromGEPPass(bool LowerGEP=false)
char & EarlyIfConverterID
EarlyIfConverter - This pass performs if-conversion on SSA form by inserting cmov instructions.
char & SIPreEmitPeepholeID
ModulePass * createAMDGPURemoveIncompatibleFunctionsPass(const TargetMachine *)
FunctionPass * createSILowerI1CopiesPass()
void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &)
FunctionPass * createBasicRegisterAllocator()
BasicRegisterAllocation Pass - This pass implements a degenerate global register allocator using the ...
void initializeGlobalISel(PassRegistry &)
Initialize all passes linked into the GlobalISel library.
Definition: GlobalISel.cpp:17
void initializeSIPreAllocateWWMRegsPass(PassRegistry &)
ModulePass * createR600OpenCLImageTypeLoweringPass()
FunctionPass * createAMDGPUCodeGenPreparePass()
Target & getTheGCNTarget()
The target for GCN GPUs.
void initializeAMDGPUAtomicOptimizerPass(PassRegistry &)
ModulePass * createBarrierNoopPass()
createBarrierNoopPass - This pass is purely a module pass barrier in a pass manager.
char & MachineCSEID
MachineCSE - This pass performs global CSE on machine instructions.
Definition: MachineCSE.cpp:162
char & GCNDPPCombineID
void initializeAMDGPULowerIntrinsicsPass(PassRegistry &)
FunctionPass * createAMDGPURewriteUndefForPHIPass()
FunctionPass * createAMDGPURegBankCombiner(bool IsOptNone)
char & SIWholeQuadModeID
void initializeSIOptimizeExecMaskingPreRAPass(PassRegistry &)
char & LiveVariablesID
LiveVariables pass - This pass computes the set of blocks in which each variable is live and sets mac...
void initializeAMDGPUCodeGenPreparePass(PassRegistry &)
FunctionPass * createGVNPass(bool NoMemDepAnalysis=false)
Create a legacy GVN pass.
Definition: GVN.cpp:3211
void call_once(once_flag &flag, Function &&F, Args &&... ArgList)
Execute the function specified as a parameter once.
Definition: Threading.h:87
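once_flag/call_once is the idiom behind the "initialize the default register allocator exactly once" flags earlier in this file. A sketch (function name hypothetical):
  #include "llvm/Support/Threading.h"
  static llvm::once_flag InitFlag;
  void ensureInit() {
    // The callable runs exactly once, even under concurrent callers.
    llvm::call_once(InitFlag, [] { /* one-time setup */ });
  }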
void initializeSILowerSGPRSpillsPass(PassRegistry &)
void initializeAMDGPULowerKernelAttributesPass(PassRegistry &)
char & SIInsertHardClausesID
FunctionPass * createAMDGPUMachineCFGStructurizerPass()
void initializeAMDGPUResourceUsageAnalysisPass(PassRegistry &)
void initializeSIFixSGPRCopiesPass(PassRegistry &)
char & GCNCreateVOPDID
FunctionPass * createInferAddressSpacesPass(unsigned AddressSpace=~0u)
Pass * createAMDGPUAttributorPass()
char & VirtRegRewriterID
VirtRegRewriter pass.
Definition: VirtRegMap.cpp:227
void initializeSILowerI1CopiesPass(PassRegistry &)
char & SILowerControlFlowID
FunctionPass * createLowerSwitchPass()
FunctionPass * createVirtRegRewriter(bool ClearVirtRegs=true)
Definition: VirtRegMap.cpp:646
void initializeR600VectorRegMergerPass(PassRegistry &)
ImmutablePass * createExternalAAWrapperPass(std::function< void(Pass &, Function &, AAResults &)> Callback)
A wrapper pass around a callback which can be used to populate the AAResults in the AAResultsWrapperP...
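This wrapper is how a target splices its own alias analysis into the legacy AAResults chain; the callback runs whenever AA results are gathered. A sketch in the shape the AMDGPU backend uses, as it would appear inside a TargetPassConfig::addIRPasses override (AMDGPUAAWrapperPass appears elsewhere in this index):
  // Append the AMDGPU AA result when the wrapper analysis is available.
  addPass(createExternalAAWrapperPass(
      [](Pass &P, Function &, AAResults &AAR) {
        if (auto *WrapperPass =
                P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
      }));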
void initializeSIOptimizeExecMaskingPass(PassRegistry &)
FunctionPass * createSIMemoryLegalizerPass()
void initializeSIFoldOperandsPass(PassRegistry &)
void initializeSILowerControlFlowPass(PassRegistry &)
char & SIPeepholeSDWAID
char & SIFixVGPRCopiesID
char & TwoAddressInstructionPassID
TwoAddressInstruction - This pass reduces two-address instructions to use two operands.
void initializeAMDGPUReleaseVGPRsPass(PassRegistry &)
void initializeAMDGPURegBankSelectPass(PassRegistry &)
MCRegisterInfo * createGCNMCRegisterInfo(AMDGPUDwarfFlavour DwarfFlavour)
FunctionPass * createStraightLineStrengthReducePass()
void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry &)
FunctionPass * createSIInsertWaitcntsPass()
FunctionPass * createEarlyCSEPass(bool UseMemorySSA=false)
Definition: EarlyCSE.cpp:1793
void initializeGCNDPPCombinePass(PassRegistry &)
char & PHIEliminationID
PHIElimination - This pass eliminates machine instruction PHI nodes by inserting copy instructions.
bool parseNamedRegisterReference(PerFunctionMIParsingState &PFS, Register &Reg, StringRef Src, SMDiagnostic &Error)
Definition: MIParser.cpp:3577
FunctionPass * createAMDGPULateCodeGenPreparePass()
FunctionPass * createSROAPass(bool PreserveCFG=true)
Definition: SROA.cpp:5163
char & RenameIndependentSubregsID
This pass detects subregister lanes in a virtual register that are used independently of other lanes ...
std::unique_ptr< ScheduleDAGMutation > createAMDGPUExportClusteringDAGMutation()
void initializeAMDGPUPrintfRuntimeBindingPass(PassRegistry &)
void initializeAMDGPUPromoteAllocaPass(PassRegistry &)
void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &)
void initializeAMDGPUInsertDelayAluPass(PassRegistry &)
char & SIOptimizeExecMaskingID
void initializeAMDGPUUnifyMetadataPass(PassRegistry &)
char & SIFixSGPRCopiesID
FunctionPass * createSIAnnotateControlFlowPass()
Create the annotation pass.
void initializeAMDGPUAlwaysInlinePass(PassRegistry &)
char & DeadMachineInstructionElimID
DeadMachineInstructionElim - This pass removes dead machine instructions.
void initializeAMDGPUPromoteKernelArgumentsPass(PassRegistry &)
#define N
static constexpr ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
static constexpr ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static constexpr ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ IEEE
IEEE-754 denormal numbers preserved.
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environme...
MachineFunctionInfo - This class can be derived from and used by targets to hold private target-speci...
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instanti...
RegisterTargetMachine - Helper template for registering a target machine implementation,...
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
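These fields mirror the hardware MODE register defaults; a DenormalMode pairs an Output kind with an Input kind. A sketch using the DenormalMode helpers from FloatingPointMode.h (the f32-flushes/f64-IEEE split shown is illustrative, not a statement of any particular subtarget's default):
  #include "llvm/ADT/FloatingPointMode.h"
  // f32 flushes denormals preserving sign; f64/f16 keep IEEE behavior.
  llvm::DenormalMode FP32 = llvm::DenormalMode::getPreserveSign();
  llvm::DenormalMode FP64FP16 = llvm::DenormalMode::getIEEE();
  bool Flushes = (FP32.Output == llvm::DenormalMode::PreserveSign); // true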
The llvm::once_flag structure.
Definition: Threading.h:68
Targets should override this in a way that mirrors the implementation of llvm::MachineFunctionInfo.
SmallVector< StringValue > WWMReservedRegs
std::optional< SIArgumentInfo > ArgInfo
A wrapper around std::string which contains a source range that's being set during parsing.