LLVM 20.0.0git
PassBuilderPipelines.cpp
Go to the documentation of this file.
1//===- Construction of pass pipelines -------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9///
10/// This file provides the implementation of the PassBuilder based on our
11/// static pass registry as well as related functionality. It also provides
12/// helpers to aid in analyzing, debugging, and testing passes and pass
13/// pipelines.
14///
15//===----------------------------------------------------------------------===//
16
17#include "llvm/ADT/Statistic.h"
26#include "llvm/IR/PassManager.h"
142
143using namespace llvm;
144
// NOTE(review): this listing appears to be a doxygen/HTML extraction with
// many original source lines dropped (the embedded line numbering is
// non-contiguous). Several cl::opt definitions below are missing their
// opening "static cl::opt<T> Name(" line. Verify against the upstream
// PassBuilderPipelines.cpp before editing any of these flags.
//
// Command-line flags that tune the default (new pass manager) pipelines.
146    "enable-ml-inliner", cl::init(InliningAdvisorMode::Default), cl::Hidden,
147    cl::desc("Enable ML policy for inliner. Currently trained for -Oz only"),
148    cl::values(clEnumValN(InliningAdvisorMode::Default, "default",
149                          "Heuristics-based inliner version"),
150               clEnumValN(InliningAdvisorMode::Development, "development",
151                          "Use development mode (runtime-loadable model)"),
152               clEnumValN(InliningAdvisorMode::Release, "release",
153                          "Use release mode (AOT-compiled model)")));
154
156    "enable-npm-synthetic-counts", cl::Hidden,
157    cl::desc("Run synthetic function entry count generation "
158             "pass"));
159
160/// Flag to enable inline deferral during PGO.
161static cl::opt<bool>
162    EnablePGOInlineDeferral("enable-npm-pgo-inline-deferral", cl::init(true),
164                            cl::desc("Enable inline deferral during PGO"));
165
166static cl::opt<bool> EnableModuleInliner("enable-module-inliner",
167                                         cl::init(false), cl::Hidden,
168                                         cl::desc("Enable module inliner"));
169
171    "mandatory-inlining-first", cl::init(false), cl::Hidden,
172    cl::desc("Perform mandatory inlinings module-wide, before performing "
173             "inlining"));
174
176    "eagerly-invalidate-analyses", cl::init(true), cl::Hidden,
177    cl::desc("Eagerly invalidate more analyses in default pipelines"));
178
180    "enable-merge-functions", cl::init(false), cl::Hidden,
181    cl::desc("Enable function merging as part of the optimization pipeline"));
182
184    "enable-post-pgo-loop-rotation", cl::init(true), cl::Hidden,
185    cl::desc("Run the loop rotation transformation after PGO instrumentation"));
186
188    "enable-global-analyses", cl::init(true), cl::Hidden,
189    cl::desc("Enable inter-procedural analyses"));
190
191static cl::opt<bool>
// NOTE(review): the desc string below misspells "inlining"; it is a runtime
// string, so it is left untouched here — fix it in a code change if desired.
192    RunPartialInlining("enable-partial-inlining", cl::init(false), cl::Hidden,
193                       cl::desc("Run Partial inlinining pass"));
194
196    "extra-vectorizer-passes", cl::init(false), cl::Hidden,
197    cl::desc("Run cleanup optimization passes after vectorization"));
198
199static cl::opt<bool> RunNewGVN("enable-newgvn", cl::init(false), cl::Hidden,
200                               cl::desc("Run the NewGVN pass"));
201
203    "enable-loopinterchange", cl::init(false), cl::Hidden,
204    cl::desc("Enable the experimental LoopInterchange Pass"));
205
206static cl::opt<bool> EnableUnrollAndJam("enable-unroll-and-jam",
207                                        cl::init(false), cl::Hidden,
208                                        cl::desc("Enable Unroll And Jam Pass"));
209
210static cl::opt<bool> EnableLoopFlatten("enable-loop-flatten", cl::init(false),
212                                       cl::desc("Enable the LoopFlatten Pass"));
213
214// Experimentally allow loop header duplication. This should allow for better
215// optimization at Oz, since loop-idiom recognition can then recognize things
216// like memcpy. If this ends up being useful for many targets, we should drop
217// this flag and make a code generation option that can be controlled
218// independent of the opt level and exposed through the frontend.
220    "enable-loop-header-duplication", cl::init(false), cl::Hidden,
221    cl::desc("Enable loop header duplication at any optimization level"));
222
223static cl::opt<bool>
224    EnableDFAJumpThreading("enable-dfa-jump-thread",
225                           cl::desc("Enable DFA jump threading"),
226                           cl::init(false), cl::Hidden);
227
228// TODO: turn on and remove flag
230    "enable-pgo-force-function-attrs",
231    cl::desc("Enable pass to set function attributes based on PGO profiles"),
232    cl::init(false));
233
234static cl::opt<bool>
235    EnableHotColdSplit("hot-cold-split",
236                       cl::desc("Enable hot-cold splitting pass"));
237
238static cl::opt<bool> EnableIROutliner("ir-outliner", cl::init(false),
240                                      cl::desc("Enable ir outliner pass"));
241
242static cl::opt<bool>
243    DisablePreInliner("disable-preinline", cl::init(false), cl::Hidden,
244                      cl::desc("Disable pre-instrumentation inliner"));
245
247    "preinline-threshold", cl::Hidden, cl::init(75),
248    cl::desc("Control the amount of inlining in pre-instrumentation inliner "
249             "(default = 75)"));
250
251static cl::opt<bool>
252    EnableGVNHoist("enable-gvn-hoist",
253                   cl::desc("Enable the GVN hoisting pass (default = off)"));
254
255static cl::opt<bool>
256    EnableGVNSink("enable-gvn-sink",
257                  cl::desc("Enable the GVN sinking pass (default = off)"));
258
260    "enable-jump-table-to-switch",
261    cl::desc("Enable JumpTableToSwitch pass (default = off)"));
262
263// This option is used in simplifying testing SampleFDO optimizations for
264// profile loading.
265static cl::opt<bool>
266    EnableCHR("enable-chr", cl::init(true), cl::Hidden,
267              cl::desc("Enable control height reduction optimization (CHR)"));
268
// NOTE(review): the desc string below misspells "hierarchy"; it is a runtime
// string, so it is left untouched here.
270    "flattened-profile-used", cl::init(false), cl::Hidden,
271    cl::desc("Indicate the sample profile being used is flattened, i.e., "
272             "no inline hierachy exists in the profile"));
273
275    "enable-order-file-instrumentation", cl::init(false), cl::Hidden,
276    cl::desc("Enable order file instrumentation (default = off)"));
277
278static cl::opt<bool>
279    EnableMatrix("enable-matrix", cl::init(false), cl::Hidden,
280                 cl::desc("Enable lowering of the matrix intrinsics"));
281
283    "enable-constraint-elimination", cl::init(true), cl::Hidden,
284    cl::desc(
285        "Enable pass to eliminate conditions based on linear constraints"));
286
288    "attributor-enable", cl::Hidden, cl::init(AttributorRunOption::NONE),
289    cl::desc("Enable the attributor inter-procedural deduction pass"),
290    cl::values(clEnumValN(AttributorRunOption::ALL, "all",
291                          "enable all attributor runs"),
292               clEnumValN(AttributorRunOption::MODULE, "module",
293                          "enable module-wide attributor runs"),
294               clEnumValN(AttributorRunOption::CGSCC, "cgscc",
295                          "enable call graph SCC attributor runs"),
296               clEnumValN(AttributorRunOption::NONE, "none",
297                          "disable attributor runs")));
298
300    "enable-sampled-instrumentation", cl::init(false), cl::Hidden,
301    cl::desc("Enable profile instrumentation sampling (default = off)"));
// NOTE(review): the declaration opener for the option below (doxygen line
// 302) was dropped by the extraction — two options run together here.
303    "enable-loop-versioning-licm", cl::init(false), cl::Hidden,
304    cl::desc("Enable the experimental Loop Versioning LICM pass"));
305
306namespace llvm {
// NOTE(review): the contents of this namespace block (doxygen lines 307-309)
// were dropped by the extraction.
308
310} // namespace llvm
311
// Constructor body of PipelineTuningOptions — its signature line (doxygen
// line 312) is missing from this listing. Establishes the default pipeline
// tuning: loop interleaving/vectorization/unrolling on, SLP vectorization
// off here, call-graph profile on, unified LTO off, and InlinerThreshold=-1
// (later code treats -1 as "derive inline params from the opt level").
// NOTE(review): several assignments (doxygen lines 317-319, 322, 324) are
// also missing from this listing.
313  LoopInterleaving = true;
314  LoopVectorization = true;
315  SLPVectorization = false;
316  LoopUnrolling = true;
320  CallGraphProfile = true;
321  UnifiedLTO = false;
323  InlinerThreshold = -1;
325}
326
327namespace llvm {
// NOTE(review): contents of this namespace block (doxygen line 328) were
// dropped by the extraction.
329} // namespace llvm
330
// PassBuilder::invoke*EPCallbacks helpers: each one runs the registered
// extension-point callbacks of a single kind, in registration order, passing
// the pass manager being built (FPM/LPM/CGPM/MPM) and the optimization level.
// NOTE(review): most of the function signature lines were dropped by the
// extraction; only parameter tails and bodies remain below. Confirm each
// helper's full signature against upstream before editing.
332                                           OptimizationLevel Level) {
333  for (auto &C : PeepholeEPCallbacks)
334    C(FPM, Level);
335}
338  for (auto &C : LateLoopOptimizationsEPCallbacks)
339    C(LPM, Level);
340}
342                                                 OptimizationLevel Level) {
343  for (auto &C : LoopOptimizerEndEPCallbacks)
344    C(LPM, Level);
345}
348  for (auto &C : ScalarOptimizerLateEPCallbacks)
349    C(FPM, Level);
350}
352                                                 OptimizationLevel Level) {
353  for (auto &C : CGSCCOptimizerLateEPCallbacks)
354    C(CGPM, Level);
355}
357                                                OptimizationLevel Level) {
358  for (auto &C : VectorizerStartEPCallbacks)
359    C(FPM, Level);
360}
362                                              OptimizationLevel Level) {
363  for (auto &C : OptimizerEarlyEPCallbacks)
364    C(MPM, Level);
365}
367                                             OptimizationLevel Level) {
368  for (auto &C : OptimizerLastEPCallbacks)
369    C(MPM, Level);
370}
373  for (auto &C : FullLinkTimeOptimizationEarlyEPCallbacks)
374    C(MPM, Level);
375}
378  for (auto &C : FullLinkTimeOptimizationLastEPCallbacks)
379    C(MPM, Level);
380}
382                                              OptimizationLevel Level) {
383  for (auto &C : PipelineStartEPCallbacks)
384    C(MPM, Level);
385}
388  for (auto &C : PipelineEarlySimplificationEPCallbacks)
389    C(MPM, Level);
390}
391
392// Helper to add AnnotationRemarksPass.
// NOTE(review): the helper's signature and body (doxygen lines 393-394) are
// missing from this listing; only the closing brace survives.
395}
396
397// Helper to check if the current compilation phase is preparing for LTO
// NOTE(review): the helper's signature and body (doxygen lines 398-400) are
// missing from this listing; only the closing brace survives.
401}
402
403// TODO: Investigate the cost/benefit of tail call elimination on debugging.
// Builds the -O1 function-simplification pipeline: SROA/EarlyCSE/SimplifyCFG
// style cleanup, two nested loop-pass pipelines (LPM1 run with MemorySSA and
// block-frequency info, LPM2 without), then MemCpyOpt, SCCP, BDCE, a
// peephole round, CoroElide, and a final ADCE + SimplifyCFG cleanup before
// returning the populated FunctionPassManager.
// NOTE(review): many pass-insertion lines were dropped by the extraction
// (doxygen numbering skips e.g. 408-411, 415, 423-425, 435, 451-452, 460,
// 464-469, 472-480, 491-493, 500, 508, 525, 530, 538); do not treat this
// listing as the complete pipeline.
405PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level,
407
409
412
413  // Form SSA out of local memory accesses after breaking apart aggregates into
414  // scalars.
416
417  // Catch trivial redundancies
418  FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */));
419
420  // Hoisting of scalars and load expressions.
421  FPM.addPass(
422      SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));
424
426
427  invokePeepholeEPCallbacks(FPM, Level);
428
429  FPM.addPass(
430      SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));
431
432  // Form canonically associated expression trees, and simplify the trees using
433  // basic mathematical properties. For example, this will form (nearly)
434  // minimal multiplication trees.
436
437  // Add the primary loop simplification pipeline.
438  // FIXME: Currently this is split into two loop pass pipelines because we run
439  // some function passes in between them. These can and should be removed
440  // and/or replaced by scheduling the loop pass equivalents in the correct
441  // positions. But those equivalent passes aren't powerful enough yet.
442  // Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still
443  // used. We have `LoopSimplifyCFGPass` which isn't yet powerful enough yet to
444  // fully replace `SimplifyCFGPass`, and the closest to the other we have is
445  // `LoopInstSimplify`.
446  LoopPassManager LPM1, LPM2;
447
448  // Simplify the loop body. We do this initially to clean up after other loop
449  // passes run, either when iterating on a loop or on inner loops with
450  // implications on the outer loop.
453
454  // Try to remove as much code from the loop header as possible,
455  // to reduce amount of IR that will have to be duplicated. However,
456  // do not perform speculative hoisting the first time as LICM
457  // will destroy metadata that may not need to be destroyed if run
458  // after loop rotation.
459  // TODO: Investigate promotion cap for O1.
461                        /*AllowSpeculation=*/false));
462
463  LPM1.addPass(LoopRotatePass(/* Disable header duplication */ true,
465  // TODO: Investigate promotion cap for O1.
467                        /*AllowSpeculation=*/true));
470    LPM1.addPass(LoopFlattenPass());
471
474
476
478
481
482  // Do not enable unrolling in PreLinkThinLTO phase during sample PGO
483  // because it changes IR to makes profile annotation in back compile
484  // inaccurate. The normal unroller doesn't pay attention to forced full unroll
485  // attributes so we need to make sure and allow the full unroll pass to pay
486  // attention to it.
487  if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt ||
488      PGOOpt->Action != PGOOptions::SampleUse)
489    LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
490                                    /* OnlyWhenForced= */ !PTO.LoopUnrolling,
492
494
495  FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1),
496                                              /*UseMemorySSA=*/true,
497                                              /*UseBlockFrequencyInfo=*/true));
498  FPM.addPass(
499      SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));
501  // The loop passes in LPM2 (LoopFullUnrollPass) do not preserve MemorySSA.
502  // *All* loop passes must preserve it, in order to be able to use it.
503  FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2),
504                                              /*UseMemorySSA=*/false,
505                                              /*UseBlockFrequencyInfo=*/false));
506
507  // Delete small array after loop unroll.
509
510  // Specially optimize memory movement as it doesn't look like dataflow in SSA.
511  FPM.addPass(MemCpyOptPass());
512
513  // Sparse conditional constant propagation.
514  // FIXME: It isn't clear why we do this *after* loop passes rather than
515  // before...
516  FPM.addPass(SCCPPass());
517
518  // Delete dead bit computations (instcombine runs after to fold away the dead
519  // computations, and then ADCE will run later to exploit any new DCE
520  // opportunities that creates).
521  FPM.addPass(BDCEPass());
522
523  // Run instcombine after redundancy and dead bit elimination to exploit
524  // opportunities opened up by them.
526  invokePeepholeEPCallbacks(FPM, Level);
527
528  FPM.addPass(CoroElidePass());
529
531
532  // Finally, do an expensive DCE pass to catch all the dead code exposed by
533  // the simplifications and basic cleanup after all the simplifications.
534  // TODO: Investigate if this is too expensive.
535  FPM.addPass(ADCEPass());
536  FPM.addPass(
537      SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));
539  invokePeepholeEPCallbacks(FPM, Level);
540
541  return FPM;
542}
543
// Body of the general function-simplification pipeline builder for
// -O2/-O3/-Os/-Oz (-O1 is delegated to the dedicated O1 builder below the
// speedup-level check). Visible stages: EarlyCSE (MemorySSA), optional GVN
// hoist/sink, speculative execution, the two-stage loop pipeline, early
// VectorCombine, GVN (or NewGVN under -enable-newgvn), SCCP, BDCE, a
// peephole round, ADCE, MemCpyOpt, DSE, a MemorySSA LICM run, CoroElide,
// and a final SimplifyCFG with common-instruction hoisting and sinking.
// NOTE(review): the function signature (doxygen lines 544-546) and many
// pass-insertion lines were dropped by the extraction (numbering is
// non-contiguous); treat this listing as an excerpt, not the full pipeline.
547  assert(Level != OptimizationLevel::O0 && "Must request optimizations!");
548
549  // The O1 pipeline has a separate pipeline creation function to simplify
550  // construction readability.
551  if (Level.getSpeedupLevel() == 1)
552    return buildO1FunctionSimplificationPipeline(Level, Phase);
553
555
558
559  // Form SSA out of local memory accesses after breaking apart aggregates into
560  // scalars.
562
563  // Catch trivial redundancies
564  FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */));
567
568  // Hoisting of scalars and load expressions.
569  if (EnableGVNHoist)
570    FPM.addPass(GVNHoistPass());
571
572  // Global value numbering based sinking.
573  if (EnableGVNSink) {
574    FPM.addPass(GVNSinkPass());
575    FPM.addPass(
576        SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));
577  }
578
579  // Speculative execution if the target has divergent branches; otherwise nop.
580  FPM.addPass(SpeculativeExecutionPass(/* OnlyIfDivergentTarget =*/true));
581
582  // Optimize based on known information about branches, and cleanup afterward.
585
586  // Jump table to switch conversion.
589
590  FPM.addPass(
591      SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));
594
595  if (!Level.isOptimizingForSize())
597
598  invokePeepholeEPCallbacks(FPM, Level);
599
600  // For PGO use pipeline, try to optimize memory intrinsics such as memcpy
601  // using the size value profile. Don't perform this when optimizing for size.
602  if (PGOOpt && PGOOpt->Action == PGOOptions::IRUse &&
603      !Level.isOptimizingForSize())
605
607  FPM.addPass(
608      SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));
609
610  // Form canonically associated expression trees, and simplify the trees using
611  // basic mathematical properties. For example, this will form (nearly)
612  // minimal multiplication trees.
614
617
618  // Add the primary loop simplification pipeline.
619  // FIXME: Currently this is split into two loop pass pipelines because we run
620  // some function passes in between them. These can and should be removed
621  // and/or replaced by scheduling the loop pass equivalents in the correct
622  // positions. But those equivalent passes aren't powerful enough yet.
623  // Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still
624  // used. We have `LoopSimplifyCFGPass` which isn't yet powerful enough yet to
625  // fully replace `SimplifyCFGPass`, and the closest to the other we have is
626  // `LoopInstSimplify`.
627  LoopPassManager LPM1, LPM2;
628
629  // Simplify the loop body. We do this initially to clean up after other loop
630  // passes run, either when iterating on a loop or on inner loops with
631  // implications on the outer loop.
634
635  // Try to remove as much code from the loop header as possible,
636  // to reduce amount of IR that will have to be duplicated. However,
637  // do not perform speculative hoisting the first time as LICM
638  // will destroy metadata that may not need to be destroyed if run
639  // after loop rotation.
640  // TODO: Investigate promotion cap for O1.
642                        /*AllowSpeculation=*/false));
643
644  // Disable header duplication in loop rotation at -Oz.
646                                  Level != OptimizationLevel::Oz,
648  // TODO: Investigate promotion cap for O1.
650                        /*AllowSpeculation=*/true));
651  LPM1.addPass(
652      SimpleLoopUnswitchPass(/* NonTrivial */ Level == OptimizationLevel::O3));
654    LPM1.addPass(LoopFlattenPass());
655
658
659  {
661    ExtraPasses.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level ==
663    LPM2.addPass(std::move(ExtraPasses));
664  }
665
667
669
672
673  // Do not enable unrolling in PreLinkThinLTO phase during sample PGO
674  // because it changes IR to makes profile annotation in back compile
675  // inaccurate. The normal unroller doesn't pay attention to forced full unroll
676  // attributes so we need to make sure and allow the full unroll pass to pay
677  // attention to it.
678  if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt ||
679      PGOOpt->Action != PGOOptions::SampleUse)
680    LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
681                                    /* OnlyWhenForced= */ !PTO.LoopUnrolling,
683
685
686  FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1),
687                                              /*UseMemorySSA=*/true,
688                                              /*UseBlockFrequencyInfo=*/true));
689  FPM.addPass(
690      SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));
692  // The loop passes in LPM2 (LoopIdiomRecognizePass, IndVarSimplifyPass,
693  // LoopDeletionPass and LoopFullUnrollPass) do not preserve MemorySSA.
694  // *All* loop passes must preserve it, in order to be able to use it.
695  FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2),
696                                              /*UseMemorySSA=*/false,
697                                              /*UseBlockFrequencyInfo=*/false));
698
699  // Delete small array after loop unroll.
701
702  // Try vectorization/scalarization transforms that are both improvements
703  // themselves and can allow further folds with GVN and InstCombine.
704  FPM.addPass(VectorCombinePass(/*TryEarlyFoldsOnly=*/true));
705
706  // Eliminate redundancies.
708  if (RunNewGVN)
709    FPM.addPass(NewGVNPass());
710  else
711    FPM.addPass(GVNPass());
712
713  // Sparse conditional constant propagation.
714  // FIXME: It isn't clear why we do this *after* loop passes rather than
715  // before...
716  FPM.addPass(SCCPPass());
717
718  // Delete dead bit computations (instcombine runs after to fold away the dead
719  // computations, and then ADCE will run later to exploit any new DCE
720  // opportunities that creates).
721  FPM.addPass(BDCEPass());
722
723  // Run instcombine after redundancy and dead bit elimination to exploit
724  // opportunities opened up by them.
726  invokePeepholeEPCallbacks(FPM, Level);
727
728  // Re-consider control flow based optimizations after redundancy elimination,
729  // redo DCE, etc.
732
735
736  // Finally, do an expensive DCE pass to catch all the dead code exposed by
737  // the simplifications and basic cleanup after all the simplifications.
738  // TODO: Investigate if this is too expensive.
739  FPM.addPass(ADCEPass());
740
741  // Specially optimize memory movement as it doesn't look like dataflow in SSA.
742  FPM.addPass(MemCpyOptPass());
743
744  FPM.addPass(DSEPass());
746
749                        /*AllowSpeculation=*/true),
750      /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false));
751
752  FPM.addPass(CoroElidePass());
753
755
757                                  .convertSwitchRangeToICmp(true)
758                                  .hoistCommonInsts(true)
759                                  .sinkCommonInsts(true)));
761  invokePeepholeEPCallbacks(FPM, Level);
762
763  return FPM;
764}
765
766void PassBuilder::addRequiredLTOPreLinkPasses(ModulePassManager &MPM) {
  // NOTE(review): the function body (doxygen lines 767-768) was dropped by
  // the extraction; confirm against upstream before editing.
769}
770
/// Adds the pre-instrumentation inliner used before PGO instrumentation: a
/// mandatory-first ModuleInlinerWrapperPass whose CGSCC pipeline cleans up
/// each function with EarlyCSE, SimplifyCFG and InstCombine plus the
/// peephole extension-point callbacks. Asserted to never run at O0.
/// NOTE(review): the extraction dropped several lines (doxygen 775, 779,
/// 786, 788, 791-792, 807) — including the condition guarding the early
/// `return;` (presumably the DisablePreInliner check) and the
/// ModuleInlinerWrapperPass construction. Confirm against upstream.
771void PassBuilder::addPreInlinerPasses(ModulePassManager &MPM,
772                                      OptimizationLevel Level,
773                                      ThinOrFullLTOPhase LTOPhase) {
774  assert(Level != OptimizationLevel::O0 && "Not expecting O0 here!");
776    return;
777  InlineParams IP;
778
780
781  // FIXME: The hint threshold has the same value used by the regular inliner
782  // when not optimizing for size. This should probably be lowered after
783  // performance testing.
784  // FIXME: this comment is cargo culted from the old pass manager, revisit).
785  IP.HintThreshold = Level.isOptimizingForSize() ? PreInlineThreshold : 325;
787      IP, /* MandatoryFirst */ true,
789  CGSCCPassManager &CGPipeline = MIWP.getPM();
790
793  FPM.addPass(EarlyCSEPass()); // Catch trivial redundancies.
794  FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(
795      true)));                    // Merge & remove basic blocks.
796  FPM.addPass(InstCombinePass()); // Combine silly sequences.
797  invokePeepholeEPCallbacks(FPM, Level);
798
799  CGPipeline.addPass(createCGSCCToFunctionPassAdaptor(
800      std::move(FPM), PTO.EagerlyInvalidateAnalyses));
801
802  MPM.addPass(std::move(MIWP));
803
804  // Delete anything that is now dead to make sure that we don't instrument
805  // dead code. Instrumentation can end up keeping dead code around and
806  // dramatically increase code size.
808}
809
// Re-runs loop rotation after PGO instrumentation, with header duplication
// disabled at -Oz. NOTE(review): the guard condition opening this body
// (doxygen line 812, presumably testing EnablePostPGOLoopRotation) and the
// pass-construction lines 814-816/820 were dropped by the extraction.
810void PassBuilder::addPostPGOLoopRotation(ModulePassManager &MPM,
811                                         OptimizationLevel Level) {
813    // Disable header duplication in loop rotation at -Oz.
817                           Level != OptimizationLevel::Oz),
818            /*UseMemorySSA=*/false,
819            /*UseBlockFrequencyInfo=*/false),
821  }
822}
823
/// Adds the IR-based PGO passes (never at O0, asserted). In use mode
/// (!RunProfileGen) this adds PGOInstrumentationUse and returns early; in
/// gen mode it adds instrumentation, post-PGO loop rotation, and the
/// profile-lowering pass configured here: counter promotion enabled unless
/// sampled instrumentation is requested, BFI-guided promotion for CS
/// profiles, atomic counter updates per AtomicCounterUpdate.
/// NOTE(review): the trailing parameter(s) of the signature (doxygen line
/// 829, including the `FS` file-system argument used below) and a few body
/// lines (838, 843, 847, 860) were dropped by the extraction.
824void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM,
825                                    OptimizationLevel Level, bool RunProfileGen,
826                                    bool IsCS, bool AtomicCounterUpdate,
827                                    std::string ProfileFile,
828                                    std::string ProfileRemappingFile,
830  assert(Level != OptimizationLevel::O0 && "Not expecting O0 here!");
831
832  if (!RunProfileGen) {
833    assert(!ProfileFile.empty() && "Profile use expecting a profile file!");
834    MPM.addPass(
835        PGOInstrumentationUse(ProfileFile, ProfileRemappingFile, IsCS, FS));
836    // Cache ProfileSummaryAnalysis once to avoid the potential need to insert
837    // RequireAnalysisPass for PSI before subsequent non-module passes.
839    return;
840  }
841
842  // Perform PGO instrumentation.
844
845  addPostPGOLoopRotation(MPM, Level);
846  // Add the profile lowering pass.
848  if (!ProfileFile.empty())
849    Options.InstrProfileOutput = ProfileFile;
850  // Do counter promotion at Level greater than O0.
851  Options.DoCounterPromotion = true;
852  Options.UseBFIInPromotion = IsCS;
853  if (EnableSampledInstr) {
854    Options.Sampling = true;
855    // With sampling, there is little benefit to enable counter promotion.
856    // But note that sampling does work with counter promotion.
857    Options.DoCounterPromotion = false;
858  }
859  Options.Atomic = AtomicCounterUpdate;
861}
862
// O0 variant of addPGOInstrPasses: same use/gen split, but counter promotion
// is always disabled. NOTE(review): the opening line of the signature
// (doxygen line 863, carrying the function name — presumably
// addPGOInstrPassesForO0) and body lines 873, 878, 880, 887 were dropped by
// the extraction.
864    ModulePassManager &MPM, bool RunProfileGen, bool IsCS,
865    bool AtomicCounterUpdate, std::string ProfileFile,
866    std::string ProfileRemappingFile, IntrusiveRefCntPtr<vfs::FileSystem> FS) {
867  if (!RunProfileGen) {
868    assert(!ProfileFile.empty() && "Profile use expecting a profile file!");
869    MPM.addPass(
870        PGOInstrumentationUse(ProfileFile, ProfileRemappingFile, IsCS, FS));
871    // Cache ProfileSummaryAnalysis once to avoid the potential need to insert
872    // RequireAnalysisPass for PSI before subsequent non-module passes.
874    return;
875  }
876
877  // Perform PGO instrumentation.
879  // Add the profile lowering pass.
881  if (!ProfileFile.empty())
882    Options.InstrProfileOutput = ProfileFile;
883  // Do not do counter promotion at O0.
884  Options.DoCounterPromotion = false;
885  Options.UseBFIInPromotion = IsCS;
886  Options.Atomic = AtomicCounterUpdate;
888}
889
// Maps an OptimizationLevel to InlineParams via its speedup and size levels.
// NOTE(review): the function signature line (doxygen line 890) is missing
// from this listing.
891  return getInlineParams(Level.getSpeedupLevel(), Level.getSizeLevel());
892}
893
// Body of the (SCC) inliner-pipeline builder — its signature (doxygen lines
// 894-896) is missing from this listing. Configures InlineParams (opt-level
// defaults when PTO.InlinerThreshold == -1), zeroes the hot-callsite
// threshold for SamplePGO ThinLTO pre-link, requires GlobalsAA and
// ProfileSummaryAnalysis on the wrapper, and populates the main post-order
// CGSCC pipeline: optional AttributorCGSCC, recursive-only function-attrs,
// ArgumentPromotion at O3, OpenMPOpt at O2/O3, the late-CGSCC extension
// callbacks, the nested function-simplification pipeline, a final
// function-attrs run, and CoroSplit — then returns the wrapper pass.
// NOTE(review): numerous lines were dropped by the extraction (doxygen
// 901, 910, 913, 915-917, 921-922, 926, 931, 944, 967-968, 978-979, 985).
897  InlineParams IP;
898  if (PTO.InlinerThreshold == -1)
899    IP = getInlineParamsFromOptLevel(Level);
900  else
902  // For PreLinkThinLTO + SamplePGO, set hot-caller threshold to 0 to
903  // disable hot callsite inline (as much as possible [1]) because it makes
904  // profile annotation in the backend inaccurate.
905  //
906  // [1] Note the cost of a function could be below zero due to erased
907  // prologue / epilogue.
908  if (Phase == ThinOrFullLTOPhase::ThinLTOPreLink && PGOOpt &&
909      PGOOpt->Action == PGOOptions::SampleUse)
911
912  if (PGOOpt)
914
918
919  // Require the GlobalsAA analysis for the module so we can query it within
920  // the CGSCC pipeline.
923    // Invalidate AAManager so it can be recreated and pick up the newly
924    // available GlobalsAA.
925    MIWP.addModulePass(
927  }
928
929  // Require the ProfileSummaryAnalysis for the module so we can query it within
930  // the inliner pass.
932
933  // Now begin the main postorder CGSCC pipeline.
934  // FIXME: The current CGSCC pipeline has its origins in the legacy pass
935  // manager and trying to emulate its precise behavior. Much of this doesn't
936  // make a lot of sense and we should revisit the core CGSCC structure.
937  CGSCCPassManager &MainCGPipeline = MIWP.getPM();
938
939  // Note: historically, the PruneEH pass was run first to deduce nounwind and
940  // generally clean up exception handling overhead. It isn't clear this is
941  // valuable as the inliner doesn't currently care whether it is inlining an
942  // invoke or a call.
943
945    MainCGPipeline.addPass(AttributorCGSCCPass());
946
947  // Deduce function attributes. We do another run of this after the function
948  // simplification pipeline, so this only needs to run when it could affect the
949  // function simplification pipeline, which is only the case with recursive
950  // functions.
951  MainCGPipeline.addPass(PostOrderFunctionAttrsPass(/*SkipNonRecursive*/ true));
952
953  // When at O3 add argument promotion to the pass pipeline.
954  // FIXME: It isn't at all clear why this should be limited to O3.
955  if (Level == OptimizationLevel::O3)
956    MainCGPipeline.addPass(ArgumentPromotionPass());
957
958  // Try to perform OpenMP specific optimizations. This is a (quick!) no-op if
959  // there are no OpenMP runtime calls present in the module.
960  if (Level == OptimizationLevel::O2 || Level == OptimizationLevel::O3)
961    MainCGPipeline.addPass(OpenMPOptCGSCCPass());
962
963  invokeCGSCCOptimizerLateEPCallbacks(MainCGPipeline, Level);
964
965  // Add the core function simplification pipeline nested inside the
966  // CGSCC walk.
969      PTO.EagerlyInvalidateAnalyses, /*NoRerun=*/true));
970
971  // Finally, deduce any function attributes based on the fully simplified
972  // function.
973  MainCGPipeline.addPass(PostOrderFunctionAttrsPass());
974
975  // Mark that the function is fully simplified and that it shouldn't be
976  // simplified again if we somehow revisit it due to CGSCC mutations unless
977  // it's been modified since.
980
981  MainCGPipeline.addPass(CoroSplitPass(Level != OptimizationLevel::O0));
982
983  // Make sure we don't affect potential future NoRerun CGSCC adaptors.
984  MIWP.addLateModulePass(createModuleToFunctionPassAdaptor(
986
987  return MIWP;
988}
989
994
// Body of the module-inliner pipeline builder — its signature and opening
// statements (doxygen lines 989-993, 995) are missing from this listing.
// Configures InlineParams like the SCC inliner builder, but explicitly
// disables inline deferral: deferral only helps the bottom-up SCC inliner,
// while the module inliner visits call sites in priority order.
// NOTE(review): lines 1007, 1017, 1019-1021, 1023-1024 (including the pass
// additions into MPM) were dropped by the extraction.
996  // For PreLinkThinLTO + SamplePGO, set hot-caller threshold to 0 to
997  // disable hot callsite inline (as much as possible [1]) because it makes
998  // profile annotation in the backend inaccurate.
999  //
1000  // [1] Note the cost of a function could be below zero due to erased
1001  // prologue / epilogue.
1002  if (Phase == ThinOrFullLTOPhase::ThinLTOPreLink && PGOOpt &&
1003      PGOOpt->Action == PGOOptions::SampleUse)
1004    IP.HotCallSiteThreshold = 0;
1005
1006  if (PGOOpt)
1008
1009  // The inline deferral logic is used to avoid losing some
1010  // inlining chance in future. It is helpful in SCC inliner, in which
1011  // inlining is processed in bottom-up order.
1012  // While in module inliner, the inlining order is a priority-based order
1013  // by default. The inline deferral is unnecessary there. So we disable the
1014  // inline deferral logic in module inliner.
1015  IP.EnableDeferral = false;
1016
1018
1022
1025
1026  return MPM;
1027}
1028
1032 assert(Level != OptimizationLevel::O0 &&
1033 "Should not be used for O0 pipeline");
1034
1036 "FullLTOPostLink shouldn't call buildModuleSimplificationPipeline!");
1037
1039
1040 // Place pseudo probe instrumentation as the first pass of the pipeline to
1041 // minimize the impact of optimization changes.
1042 if (PGOOpt && PGOOpt->PseudoProbeForProfiling &&
1045
1046 bool HasSampleProfile = PGOOpt && (PGOOpt->Action == PGOOptions::SampleUse);
1047
1048 // In ThinLTO mode, when flattened profile is used, all the available
1049 // profile information will be annotated in PreLink phase so there is
1050 // no need to load the profile again in PostLink.
1051 bool LoadSampleProfile =
1052 HasSampleProfile &&
1054
1055 // During the ThinLTO backend phase we perform early indirect call promotion
1056 // here, before globalopt. Otherwise imported available_externally functions
1057 // look unreferenced and are removed. If we are going to load the sample
1058 // profile then defer until later.
1059 // TODO: See if we can move later and consolidate with the location where
1060 // we perform ICP when we are loading a sample profile.
1061 // TODO: We pass HasSampleProfile (whether there was a sample profile file
1062 // passed to the compile) to the SamplePGO flag of ICP. This is used to
1063 // determine whether the new direct calls are annotated with prof metadata.
1064 // Ideally this should be determined from whether the IR is annotated with
1065 // sample profile, and not whether the a sample profile was provided on the
1066 // command line. E.g. for flattened profiles where we will not be reloading
1067 // the sample profile in the ThinLTO backend, we ideally shouldn't have to
1068 // provide the sample profile file.
1069 if (Phase == ThinOrFullLTOPhase::ThinLTOPostLink && !LoadSampleProfile)
1070 MPM.addPass(PGOIndirectCallPromotion(true /* InLTO */, HasSampleProfile));
1071
1072 // Create an early function pass manager to cleanup the output of the
1073 // frontend. Not necessary with LTO post link pipelines since the pre link
1074 // pipeline already cleaned up the frontend output.
1076 // Do basic inference of function attributes from known properties of system
1077 // libraries and other oracles.
1080
1081 FunctionPassManager EarlyFPM;
1082 EarlyFPM.addPass(EntryExitInstrumenterPass(/*PostInlining=*/false));
1083 // Lower llvm.expect to metadata before attempting transforms.
1084 // Compare/branch metadata may alter the behavior of passes like
1085 // SimplifyCFG.
1087 EarlyFPM.addPass(SimplifyCFGPass());
1089 EarlyFPM.addPass(EarlyCSEPass());
1090 if (Level == OptimizationLevel::O3)
1091 EarlyFPM.addPass(CallSiteSplittingPass());
1093 std::move(EarlyFPM), PTO.EagerlyInvalidateAnalyses));
1094 }
1095
1096 if (LoadSampleProfile) {
1097 // Annotate sample profile right after early FPM to ensure freshness of
1098 // the debug info.
1099 MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile,
1100 PGOOpt->ProfileRemappingFile, Phase));
1101 // Cache ProfileSummaryAnalysis once to avoid the potential need to insert
1102 // RequireAnalysisPass for PSI before subsequent non-module passes.
1104 // Do not invoke ICP in the LTOPrelink phase as it makes it hard
1105 // for the profile annotation to be accurate in the LTO backend.
1106 if (!isLTOPreLink(Phase))
1107 // We perform early indirect call promotion here, before globalopt.
1108 // This is important for the ThinLTO backend phase because otherwise
1109 // imported available_externally functions look unreferenced and are
1110 // removed.
1111 MPM.addPass(
1112 PGOIndirectCallPromotion(true /* IsInLTO */, true /* SamplePGO */));
1113 }
1114
1115 // Try to perform OpenMP specific optimizations on the module. This is a
1116 // (quick!) no-op if there are no OpenMP runtime calls present in the module.
1118
1121
1122 // Lower type metadata and the type.test intrinsic in the ThinLTO
1123 // post link pipeline after ICP. This is to enable usage of the type
1124 // tests in ICP sequences.
1126 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));
1127
1129
1130 // Interprocedural constant propagation now that basic cleanup has occurred
1131 // and prior to optimizing globals.
1132 // FIXME: This position in the pipeline hasn't been carefully considered in
1133 // years, it should be re-analyzed.
1135 IPSCCPOptions(/*AllowFuncSpec=*/
1136 Level != OptimizationLevel::Os &&
1137 Level != OptimizationLevel::Oz &&
1138 !isLTOPreLink(Phase))));
1139
1140 // Attach metadata to indirect call sites indicating the set of functions
1141 // they may target at run-time. This should follow IPSCCP.
1143
1144 // Optimize globals to try and fold them into constants.
1146
1147 // Create a small function pass pipeline to cleanup after all the global
1148 // optimizations.
1149 FunctionPassManager GlobalCleanupPM;
1150 // FIXME: Should this instead by a run of SROA?
1151 GlobalCleanupPM.addPass(PromotePass());
1152 GlobalCleanupPM.addPass(InstCombinePass());
1153 invokePeepholeEPCallbacks(GlobalCleanupPM, Level);
1154 GlobalCleanupPM.addPass(
1155 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));
1156 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(GlobalCleanupPM),
1158
1159 // We already asserted this happens in non-FullLTOPostLink earlier.
1160 const bool IsPreLink = Phase != ThinOrFullLTOPhase::ThinLTOPostLink;
1161 const bool IsPGOPreLink = PGOOpt && IsPreLink;
1162 const bool IsPGOInstrGen =
1163 IsPGOPreLink && PGOOpt->Action == PGOOptions::IRInstr;
1164 const bool IsPGOInstrUse =
1165 IsPGOPreLink && PGOOpt->Action == PGOOptions::IRUse;
1166 const bool IsMemprofUse = IsPGOPreLink && !PGOOpt->MemoryProfile.empty();
1167 // We don't want to mix pgo ctx gen and pgo gen; we also don't currently
1168 // enable ctx profiling from the frontend.
1169 assert(
1171 "Enabling both instrumented FDO and contextual instrumentation is not "
1172 "supported.");
1173 // Enable contextual profiling instrumentation.
1174 const bool IsCtxProfGen = !IsPGOInstrGen && IsPreLink &&
1176
1177 if (IsPGOInstrGen || IsPGOInstrUse || IsMemprofUse || IsCtxProfGen)
1178 addPreInlinerPasses(MPM, Level, Phase);
1179
1180 // Add all the requested passes for instrumentation PGO, if requested.
1181 if (IsPGOInstrGen || IsPGOInstrUse) {
1182 addPGOInstrPasses(MPM, Level,
1183 /*RunProfileGen=*/IsPGOInstrGen,
1184 /*IsCS=*/false, PGOOpt->AtomicCounterUpdate,
1185 PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile,
1186 PGOOpt->FS);
1187 } else if (IsCtxProfGen) {
1189 addPostPGOLoopRotation(MPM, Level);
1191 }
1192
1193 if (IsPGOInstrGen || IsPGOInstrUse || IsCtxProfGen)
1194 MPM.addPass(PGOIndirectCallPromotion(false, false));
1195
1196 if (IsPGOPreLink && PGOOpt->CSAction == PGOOptions::CSIRInstr)
1197 MPM.addPass(PGOInstrumentationGenCreateVar(PGOOpt->CSProfileGenFile,
1199
1200 if (IsMemprofUse)
1201 MPM.addPass(MemProfUsePass(PGOOpt->MemoryProfile, PGOOpt->FS));
1202
1203 // Synthesize function entry counts for non-PGO compilation.
1204 if (EnableSyntheticCounts && !PGOOpt)
1206
1207 if (EnablePGOForceFunctionAttrs && PGOOpt)
1208 MPM.addPass(PGOForceFunctionAttrsPass(PGOOpt->ColdOptType));
1209
1210 MPM.addPass(AlwaysInlinerPass(/*InsertLifetimeIntrinsics=*/true));
1211
1214 else
1216
1217 // Remove any dead arguments exposed by cleanups, constant folding globals,
1218 // and argument promotion.
1220
1222
1223 // Optimize globals now that functions are fully simplified.
1226
1227 return MPM;
1228}
1229
1230 /// TODO: Should LTO cause any differences to this set of passes?
1231 void PassBuilder::addVectorPasses(OptimizationLevel Level,
1232                                   FunctionPassManager &FPM, bool IsFullLTO) {
1235
1238   if (IsFullLTO) {
1239     // The vectorizer may have significantly shortened a loop body; unroll
1240     // again. Unroll small loops to hide loop backedge latency and saturate any
1241     // parallel execution resources of an out-of-order processor. We also then
1242     // need to clean up redundancies and loop invariant code.
1243     // FIXME: It would be really good to use a loop-integrated instruction
1244     // combiner for cleanup here so that the unrolling and LICM can be pipelined
1245     // across the loop nests.
1246     // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
1249         LoopUnrollAndJamPass(Level.getSpeedupLevel())));
1251         Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
1254     // Now that we are done with loop unrolling, be it either by LoopVectorizer,
1255     // or LoopUnroll passes, some variable-offset GEP's into alloca's could have
1256     // become constant-offset, thus enabling SROA and alloca promotion. Do so.
1257     // NOTE: we are very late in the pipeline, and we don't have any LICM
1258     // or SimplifyCFG passes scheduled after us, that would cleanup
1259     // the CFG mess this may have created if allowed to modify CFG, so forbid that.
1261   }
1262
1263   if (!IsFullLTO) {
1264     // Eliminate loads by forwarding stores from the previous iteration to loads
1265     // of the current iteration.
1267   }
1268   // Cleanup after the loop optimization passes.
1269   FPM.addPass(InstCombinePass());
1270
1271   if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
1272     ExtraVectorPassManager ExtraPasses;
1273     // At higher optimization levels, try to clean up any runtime overlap and
1274     // alignment checks inserted by the vectorizer. We want to track correlated
1275     // runtime checks for two inner loops in the same outer loop, fold any
1276     // common computations, hoist loop-invariant aspects out of any outer loop,
1277     // and unswitch the runtime checks if possible. Once hoisted, we may have
1278     // dead (or speculatable) control flows or more combining opportunities.
1279     ExtraPasses.addPass(EarlyCSEPass());
1281     ExtraPasses.addPass(InstCombinePass());
1282     LoopPassManager LPM;
1284                            /*AllowSpeculation=*/true));
1285     LPM.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level ==
1287     ExtraPasses.addPass(
1288         createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/true,
1289                                         /*UseBlockFrequencyInfo=*/true));
1290     ExtraPasses.addPass(
1291         SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true)));
1292     ExtraPasses.addPass(InstCombinePass());
1293     FPM.addPass(std::move(ExtraPasses));
1294   }
1295
1296   // Now that we've formed fast to execute loop structures, we do further
1297   // optimizations. These are run afterward as they might block doing complex
1298   // analyses and transforms such as what are needed for loop vectorization.
1299
1300   // Cleanup after loop vectorization, etc. Simplification passes like CVP and
1301   // GVN, loop transforms, and others have already run, so it's now better to
1302   // convert to more optimized IR using more aggressive simplify CFG options.
1303   // The extra sinking transform can create larger basic blocks, so do this
1304   // before SLP vectorization.
1306                   .forwardSwitchCondToPhi(true)
1307                   .convertSwitchRangeToICmp(true)
1308                   .convertSwitchToLookupTable(true)
1309                   .needCanonicalLoops(false)
1310                   .hoistCommonInsts(true)
1311                   .sinkCommonInsts(true)));
1312
1313   if (IsFullLTO) {
1314     FPM.addPass(SCCPPass());
1315     FPM.addPass(InstCombinePass());
1316     FPM.addPass(BDCEPass());
1317   }
1318
1319   // Optimize parallel scalar instruction chains into SIMD instructions.
1320   if (PTO.SLPVectorization) {
1322     if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
1323       FPM.addPass(EarlyCSEPass());
1324     }
1325   }
1326   // Enhance/cleanup vector code.
1328
1329   if (!IsFullLTO) {
1330     FPM.addPass(InstCombinePass());
1331     // Unroll small loops to hide loop backedge latency and saturate any
1332     // parallel execution resources of an out-of-order processor. We also then
1333     // need to clean up redundancies and loop invariant code.
1334     // FIXME: It would be really good to use a loop-integrated instruction
1335     // combiner for cleanup here so that the unrolling and LICM can be pipelined
1336     // across the loop nests.
1337     // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
1338     if (EnableUnrollAndJam && PTO.LoopUnrolling) {
1340           LoopUnrollAndJamPass(Level.getSpeedupLevel())));
1341     }
1343         Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
1346     // Now that we are done with loop unrolling, be it either by LoopVectorizer,
1347     // or LoopUnroll passes, some variable-offset GEP's into alloca's could have
1348     // become constant-offset, thus enabling SROA and alloca promotion. Do so.
1349     // NOTE: we are very late in the pipeline, and we don't have any LICM
1350     // or SimplifyCFG passes scheduled after us, that would cleanup
1351     // the CFG mess this may have created if allowed to modify CFG, so forbid that.
1353   }
1354
1357   FPM.addPass(InstCombinePass());
1358
1359   // This is needed for two reasons:
1360   // 1. It works around problems that instcombine introduces, such as sinking
1361   //    expensive FP divides into loops containing multiplications using the
1362   //    divide result.
1363   // 2. It helps to clean up some loop-invariant code created by the loop
1364   //    unroll pass when IsFullLTO=false.
1367                       /*AllowSpeculation=*/true),
1368       /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false));
1369
1370   // Now that we've vectorized and unrolled loops, we may have more refined
1371   // alignment information, try to re-derive it here.
1373 }
1374
// NOTE(review): the opening signature line of this builder was elided by the
// doc extraction; only the trailing parameter is visible below.
1377                                              ThinOrFullLTOPhase LTOPhase) {
1378   const bool LTOPreLink = isLTOPreLink(LTOPhase);
1380
1381   // Run partial inlining pass to partially inline functions that have
1382   // large bodies.
1385
1386   // Remove avail extern fns and globals definitions since we aren't compiling
1387   // an object file for later LTO. For LTO we want to preserve these so they
1388   // are eligible for inlining at link-time. Note if they are unreferenced they
1389   // will be removed by GlobalDCE later, so this only impacts referenced
1390   // available externally globals. Eventually they will be suppressed during
1391   // codegen, but eliminating here enables more opportunity for GlobalDCE as it
1392   // may make globals referenced by available external functions dead and saves
1393   // running remaining passes on the eliminated functions. These should be
1394   // preserved during prelinking for link-time inlining decisions.
1395   if (!LTOPreLink)
1397
1400
1401   // Do RPO function attribute inference across the module to forward-propagate
1402   // attributes where applicable.
1403   // FIXME: Is this really an optimization rather than a canonicalization?
1405
1406   // Do a post inline PGO instrumentation and use pass. This is a context
1407   // sensitive PGO pass. We don't want to do this in LTOPreLink phase as
1408   // cross-module inline has not been done yet. The context sensitive
1409   // instrumentation is after all the inlines are done.
1410   if (!LTOPreLink && PGOOpt) {
1411     if (PGOOpt->CSAction == PGOOptions::CSIRInstr)
1412       addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/true,
1413                         /*IsCS=*/true, PGOOpt->AtomicCounterUpdate,
1414                         PGOOpt->CSProfileGenFile, PGOOpt->ProfileRemappingFile,
1415                         PGOOpt->FS);
1416     else if (PGOOpt->CSAction == PGOOptions::CSIRUse)
1417       addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/false,
1418                         /*IsCS=*/true, PGOOpt->AtomicCounterUpdate,
1419                         PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile,
1420                         PGOOpt->FS);
1421   }
1422
1423   // Re-compute GlobalsAA here prior to function passes. This is particularly
1424   // useful as the above will have inlined, DCE'ed, and function-attr
1425   // propagated everything. We should at this point have a reasonably minimal
1426   // and richly annotated call graph. By computing aliasing and mod/ref
1427   // information for all local globals here, the late loop passes and notably
1428   // the vectorizer will be able to use them to help recognize vectorizable
1429   // memory operations.
1432
1434
1435   FunctionPassManager OptimizePM;
1436   // Scheduling LoopVersioningLICM when inlining is over, because after that
1437   // we may see more accurate aliasing. Reason to run this late is that too
1438   // early versioning may prevent further inlining due to increase of code
1439   // size. Other optimizations which run later might benefit from no-alias
1440   // assumption in clone loop.
1442     OptimizePM.addPass(
1444     // LoopVersioningLICM pass might increase new LICM opportunities.
1447                              /*AllowSpeculation=*/true),
1448         /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false));
1449   }
1450
1451   OptimizePM.addPass(Float2IntPass());
1453
1454   if (EnableMatrix) {
1455     OptimizePM.addPass(LowerMatrixIntrinsicsPass());
1456     OptimizePM.addPass(EarlyCSEPass());
1457   }
1458
1459   // CHR pass should only be applied with the profile information.
1460   // The check is to check the profile summary information in CHR.
1461   if (EnableCHR && Level == OptimizationLevel::O3)
1462     OptimizePM.addPass(ControlHeightReductionPass());
1463
1464   // FIXME: We need to run some loop optimizations to re-rotate loops after
1465   // simplifycfg and others undo their rotation.
1466
1467   // Optimize the loop execution. These passes operate on entire loop nests
1468   // rather than on each loop in an inside-out manner, and so they are actually
1469   // function passes.
1470
1471   invokeVectorizerStartEPCallbacks(OptimizePM, Level);
1472
1473   LoopPassManager LPM;
1474   // First rotate loops that may have been un-rotated by prior passes.
1475   // Disable header duplication at -Oz.
1477                            Level != OptimizationLevel::Oz,
1478                            LTOPreLink));
1479   // Some loops may have become dead by now. Try to delete them.
1480   // FIXME: see discussion in https://reviews.llvm.org/D112851,
1481   //        this may need to be revisited once we run GVN before loop deletion
1482   //        in the simplification pipeline.
1485       std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false));
1486
1487   // Distribute loops to allow partial vectorization. I.e. isolate dependences
1488   // into separate loop that would otherwise inhibit vectorization. This is
1489   // currently only performed for loops marked with the metadata
1490   // llvm.loop.distribute=true or when -enable-loop-distribute is specified.
1491   OptimizePM.addPass(LoopDistributePass());
1492
1493   // Populates the VFABI attribute with the scalar-to-vector mappings
1494   // from the TargetLibraryInfo.
1495   OptimizePM.addPass(InjectTLIMappings());
1496
1497   addVectorPasses(Level, OptimizePM, /* IsFullLTO */ false);
1498
1499   // LoopSink pass sinks instructions hoisted by LICM, which serves as a
1500   // canonicalization pass that enables other optimizations. As a result,
1501   // LoopSink pass needs to be a very late IR pass to avoid undoing LICM
1502   // result too early.
1503   OptimizePM.addPass(LoopSinkPass());
1504
1505   // And finally clean up LCSSA form before generating code.
1506   OptimizePM.addPass(InstSimplifyPass());
1507
1508   // This hoists/decomposes div/rem ops. It should run after other sink/hoist
1509   // passes to avoid re-sinking, but before SimplifyCFG because it can allow
1510   // flattening of blocks.
1511   OptimizePM.addPass(DivRemPairsPass());
1512
1513   // Try to annotate calls that were created during optimization.
1514   OptimizePM.addPass(TailCallElimPass());
1515
1516   // LoopSink (and other loop passes since the last simplifyCFG) might have
1517   // resulted in single-entry-single-exit or empty blocks. Clean up the CFG.
1519                            .convertSwitchRangeToICmp(true)
1520                            .speculateUnpredictables(true)));
1521
1522   // Add the core optimizing pipeline.
1523   MPM.addPass(createModuleToFunctionPassAdaptor(std::move(OptimizePM),
1525
1527
1528   // Split out cold code. Splitting is done late to avoid hiding context from
1529   // other optimizations and inadvertently regressing performance. The tradeoff
1530   // is that this has a higher code size cost than splitting early.
1531   if (EnableHotColdSplit && !LTOPreLink)
1533
1534   // Search the code for similar regions of code. If enough similar regions can
1535   // be found where extracting the regions into their own function will decrease
1536   // the size of the program, we extract the regions, and deduplicate the
1537   // structurally similar regions.
1538   if (EnableIROutliner)
1540
1541   // Now we need to do some global optimization transforms.
1542   // FIXME: It would seem like these should come first in the optimization
1543   // pipeline and maybe be the bottom of the canonicalization pipeline? Weird
1544   // ordering here.
1547
1548   // Merge functions if requested. It has a better chance to merge functions
1549   // after ConstantMerge folded jump tables.
1550   if (PTO.MergeFunctions)
1552
1553   if (PTO.CallGraphProfile && !LTOPreLink)
1556
1557   // TODO: Relative lookup table converter pass caused an issue when full lto is
1558   // enabled. See https://reviews.llvm.org/D94355 for more details.
1559   // Until the issue is fixed, disable this pass during pre-linking phase.
1560   if (!LTOPreLink)
1562
1563   return MPM;
1564 }
1565
// Build the default per-module (non-LTO, or LTO pre-link) pipeline: O0 is
// delegated to buildO0DefaultPipeline; otherwise the simplification pipeline
// runs first, then the optimization pipeline, and LTO pre-link appends the
// required pre-link passes.
1568                                                   bool LTOPreLink) {
1569   if (Level == OptimizationLevel::O0)
1570     return buildO0DefaultPipeline(Level, LTOPreLink);
1573
1574   // Convert @llvm.global.annotations to !annotation metadata.
1576
1577   // Force any function attributes we want the rest of the pipeline to observe.
1579
1580   if (PGOOpt && PGOOpt->DebugInfoForProfiling)
1582
1583   // Apply module pipeline start EP callback.
1585
1586   const ThinOrFullLTOPhase LTOPhase = LTOPreLink
1589   // Add the core simplification pipeline.
1591
1592   // Now add the optimization pipeline.
1594
1595   if (PGOOpt && PGOOpt->PseudoProbeForProfiling &&
1596       PGOOpt->Action == PGOOptions::SampleUse)
1598
1599   // Emit annotation remarks.
1601
1602   if (LTOPreLink)
1603     addRequiredLTOPreLinkPasses(MPM);
1604   return MPM;
1605 }
1606
// Build the FatLTO pipeline: embed the (Thin)LTO bitcode into the module via
// EmbedBitcodePass, then run either the ThinLTO post-link pipeline (when
// sample profiling is in use) or the regular module optimization pipeline.
1609                                                bool EmitSummary) {
1611   if (ThinLTO)
1613   else
1615   MPM.addPass(EmbedBitcodePass(ThinLTO, EmitSummary));
1616
1617   // Use the ThinLTO post-link pipeline with sample profiling.
1618   if (ThinLTO && PGOOpt && PGOOpt->Action == PGOOptions::SampleUse)
1619     MPM.addPass(buildThinLTODefaultPipeline(Level, /*ImportSummary=*/nullptr));
1620   else {
1621     // Otherwise, just use module optimization.
1622     MPM.addPass(
1624     // Emit annotation remarks.
1626   }
1627   return MPM;
1628 }
1629
// Build the ThinLTO pre-link pipeline: simplify only (no unrolling or
// vectorization), since full optimization happens after the thin link.
// O0 is delegated to buildO0DefaultPipeline with LTOPreLink=true.
1632   if (Level == OptimizationLevel::O0)
1633     return buildO0DefaultPipeline(Level, /*LTOPreLink*/true);
1636
1637   // Convert @llvm.global.annotations to !annotation metadata.
1639
1640   // Force any function attributes we want the rest of the pipeline to observe.
1642
1643   if (PGOOpt && PGOOpt->DebugInfoForProfiling)
1645
1646   // Apply module pipeline start EP callback.
1648
1649   // If we are planning to perform ThinLTO later, we don't bloat the code with
1650   // unrolling/vectorization/... now. Just simplify the module as much as we
1651   // can.
1654
1655   // Run partial inlining pass to partially inline functions that have
1656   // large bodies.
1657   // FIXME: It isn't clear whether this is really the right place to run this
1658   // in ThinLTO. Because there is another canonicalization and simplification
1659   // phase that will run after the thin link, running this here ends up with
1660   // less information than will be available later and it may grow functions in
1661   // ways that aren't beneficial.
1664
1665   if (PGOOpt && PGOOpt->PseudoProbeForProfiling &&
1666       PGOOpt->Action == PGOOptions::SampleUse)
1668
1669   // Handle Optimizer{Early,Last}EPCallbacks added by clang on PreLink. Actual
1670   // optimization is going to be done in PostLink stage, but clang can't add
1671   // callbacks there in case of in-process ThinLTO called by linker.
1674
1675   // Emit annotation remarks.
1677
1678   addRequiredLTOPreLinkPasses(MPM);
1679
1680   return MPM;
1681 }
1682
// Build the ThinLTO post-link (backend) pipeline. When an import summary is
// available, WPD/LowerTypeTests run first so their instruction patterns are
// intact; at O0 only the minimal type-test cleanup and dead-global handling
// run, otherwise the full simplification + optimization pipelines follow.
1684     OptimizationLevel Level, const ModuleSummaryIndex *ImportSummary) {
1686
1687   if (ImportSummary) {
1688     // For ThinLTO we must apply the context disambiguation decisions early, to
1689     // ensure we can correctly match the callsites to summary data.
1692
1693     // These passes import type identifier resolutions for whole-program
1694     // devirtualization and CFI. They must run early because other passes may
1695     // disturb the specific instruction patterns that these passes look for,
1696     // creating dependencies on resolutions that may not appear in the summary.
1697     //
1698     // For example, GVN may transform the pattern assume(type.test) appearing in
1699     // two basic blocks into assume(phi(type.test, type.test)), which would
1700     // transform a dependency on a WPD resolution into a dependency on a type
1701     // identifier resolution for CFI.
1702     //
1703     // Also, WPD has access to more precise information than ICP and can
1704     // devirtualize more effectively, so it should operate on the IR first.
1705     //
1706     // The WPD and LowerTypeTest passes need to run at -O0 to lower type
1707     // metadata and intrinsics.
1708     MPM.addPass(WholeProgramDevirtPass(nullptr, ImportSummary));
1709     MPM.addPass(LowerTypeTestsPass(nullptr, ImportSummary));
1710   }
1711
1712   if (Level == OptimizationLevel::O0) {
1713     // Run a second time to clean up any type tests left behind by WPD for use
1714     // in ICP.
1715     MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));
1716     // Drop available_externally and unreferenced globals. This is necessary
1717     // with ThinLTO in order to avoid leaving undefined references to dead
1718     // globals in the object file.
1721     return MPM;
1722   }
1723
1724   // Add the core simplification pipeline.
1727
1728   // Now add the optimization pipeline.
1731
1732   // Emit annotation remarks.
1734
1735   return MPM;
1736 }
1737
// The full-LTO pre-link pipeline currently just reuses the per-module
// default pipeline with LTOPreLink enabled.
1740   // FIXME: We should use a customized pre-link pipeline!
1741   return buildPerModuleDefaultPipeline(Level,
1742                                        /* LTOPreLink */ true);
1743 }
1744
// Build the full-LTO post-link pipeline: cross-DSO CFI setup, WPD and type
// test lowering, interprocedural optimization (ICP, IPSCCP, inlining,
// globalopt), a main function-level scalar/loop/vector pipeline, and late
// cleanup. O0 and O1 take shortened early-return paths.
1747                                                 ModuleSummaryIndex *ExportSummary) {
1749
1751
1752   // Create a function that performs CFI checks for cross-DSO calls with targets
1753   // in the current module.
1755
1756   if (Level == OptimizationLevel::O0) {
1757     // The WPD and LowerTypeTest passes need to run at -O0 to lower type
1758     // metadata and intrinsics.
1759     MPM.addPass(WholeProgramDevirtPass(ExportSummary, nullptr));
1760     MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr));
1761     // Run a second time to clean up any type tests left behind by WPD for use
1762     // in ICP.
1763     MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));
1764
1766
1767     // Emit annotation remarks.
1769
1770     return MPM;
1771   }
1772
1773   if (PGOOpt && PGOOpt->Action == PGOOptions::SampleUse) {
1774     // Load sample profile before running the LTO optimization pipeline.
1775     MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile,
1776                                         PGOOpt->ProfileRemappingFile,
1778     // Cache ProfileSummaryAnalysis once to avoid the potential need to insert
1779     // RequireAnalysisPass for PSI before subsequent non-module passes.
1781   }
1782
1783   // Try to run OpenMP optimizations, quick no-op if no OpenMP metadata present.
1785
1786   // Remove unused virtual tables to improve the quality of code generated by
1787   // whole-program devirtualization and bitset lowering.
1788   MPM.addPass(GlobalDCEPass(/*InLTOPostLink=*/true));
1789
1790   // Do basic inference of function attributes from known properties of system
1791   // libraries and other oracles.
1793
1794   if (Level.getSpeedupLevel() > 1) {
1797
1798     // Indirect call promotion. This should promote all the targets that are
1799     // left by the earlier promotion pass that promotes intra-module targets.
1800     // This two-step promotion is to save the compile time. For LTO, it should
1801     // produce the same result as if we only do promotion here.
1803         true /* InLTO */, PGOOpt && PGOOpt->Action == PGOOptions::SampleUse));
1804
1805     // Propagate constants at call sites into the functions they call. This
1806     // opens opportunities for globalopt (and inlining) by substituting function
1807     // pointers passed as arguments to direct uses of functions.
1808     MPM.addPass(IPSCCPPass(IPSCCPOptions(/*AllowFuncSpec=*/
1809                                          Level != OptimizationLevel::Os &&
1810                                          Level != OptimizationLevel::Oz)));
1811
1812     // Attach metadata to indirect call sites indicating the set of functions
1813     // they may target at run-time. This should follow IPSCCP.
1815   }
1816
1817   // Now deduce any function attributes based on the current code.
1818   MPM.addPass(
1820
1821   // Do RPO function attribute inference across the module to forward-propagate
1822   // attributes where applicable.
1823   // FIXME: Is this really an optimization rather than a canonicalization?
1825
1826   // Use in-range annotations on GEP indices to split globals where beneficial.
1828
1829   // Run whole program optimization of virtual call when the list of callees
1830   // is fixed.
1831   MPM.addPass(WholeProgramDevirtPass(ExportSummary, nullptr));
1832
1833   // Stop here at -O1.
1834   if (Level == OptimizationLevel::O1) {
1835     // The LowerTypeTestsPass needs to run to lower type metadata and the
1836     // type.test intrinsics. The pass does nothing if CFI is disabled.
1837     MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr));
1838     // Run a second time to clean up any type tests left behind by WPD for use
1839     // in ICP (which is performed earlier than this in the regular LTO
1840     // pipeline).
1841     MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));
1842
1844
1845     // Emit annotation remarks.
1847
1848     return MPM;
1849   }
1850
1851   // Optimize globals to try and fold them into constants.
1853
1854   // Promote any localized globals to SSA registers.
1856
1857   // Linking modules together can lead to duplicate global constants, only
1858   // keep one copy of each constant.
1860
1861   // Remove unused arguments from functions.
1863
1864   // Reduce the code after globalopt and ipsccp. Both can open up significant
1865   // simplification opportunities, and both can propagate functions through
1866   // function pointers. When this happens, we often have to resolve varargs
1867   // calls, etc, so let instcombine do this.
1868   FunctionPassManager PeepholeFPM;
1869   PeepholeFPM.addPass(InstCombinePass());
1870   if (Level.getSpeedupLevel() > 1)
1871     PeepholeFPM.addPass(AggressiveInstCombinePass());
1872   invokePeepholeEPCallbacks(PeepholeFPM, Level);
1873
1874   MPM.addPass(createModuleToFunctionPassAdaptor(std::move(PeepholeFPM),
1876
1877   // Note: historically, the PruneEH pass was run first to deduce nounwind and
1878   // generally clean up exception handling overhead. It isn't clear this is
1879   // valuable as the inliner doesn't currently care whether it is inlining an
1880   // invoke or a call.
1881   // Run the inliner now.
1882   if (EnableModuleInliner) {
1886   } else {
1889                                  /* MandatoryFirst */ true,
1892   }
1893
1894   // Perform context disambiguation after inlining, since that would reduce the
1895   // amount of additional cloning required to distinguish the allocation
1896   // contexts.
1899
1900   // Optimize globals again after we ran the inliner.
1902
1903   // Run the OpenMPOpt pass again after global optimizations.
1905
1906   // Garbage collect dead functions.
1907   MPM.addPass(GlobalDCEPass(/*InLTOPostLink=*/true));
1908
1909   // If we didn't decide to inline a function, check to see if we can
1910   // transform it to pass arguments by value instead of by reference.
1912
1914   // The IPO Passes may leave cruft around. Clean up after them.
1915   FPM.addPass(InstCombinePass());
1916   invokePeepholeEPCallbacks(FPM, Level);
1917
1920
1922
1923   // Do a post inline PGO instrumentation and use pass. This is a context
1924   // sensitive PGO pass.
1925   if (PGOOpt) {
1926     if (PGOOpt->CSAction == PGOOptions::CSIRInstr)
1927       addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/true,
1928                         /*IsCS=*/true, PGOOpt->AtomicCounterUpdate,
1929                         PGOOpt->CSProfileGenFile, PGOOpt->ProfileRemappingFile,
1930                         PGOOpt->FS);
1931     else if (PGOOpt->CSAction == PGOOptions::CSIRUse)
1932       addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/false,
1933                         /*IsCS=*/true, PGOOpt->AtomicCounterUpdate,
1934                         PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile,
1935                         PGOOpt->FS);
1936   }
1937
1938   // Break up allocas
1940
1941   // LTO provides additional opportunities for tailcall elimination due to
1942   // link-time inlining, and visibility of nocapture attribute.
1944
1945   // Run a few AA driver optimizations here and now to cleanup the code.
1948
1949   MPM.addPass(
1951
1952   // Require the GlobalsAA analysis for the module so we can query it within
1953   // MainFPM.
1956     // Invalidate AAManager so it can be recreated and pick up the newly
1957     // available GlobalsAA.
1958     MPM.addPass(
1960   }
1961
1962   FunctionPassManager MainFPM;
1965                   /*AllowSpeculation=*/true),
1966       /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false));
1967
1968   if (RunNewGVN)
1969     MainFPM.addPass(NewGVNPass());
1970   else
1971     MainFPM.addPass(GVNPass());
1972
1973   // Remove dead memcpy()'s.
1974   MainFPM.addPass(MemCpyOptPass());
1975
1976   // Nuke dead stores.
1977   MainFPM.addPass(DSEPass());
1978   MainFPM.addPass(MoveAutoInitPass());
1980
1981   LoopPassManager LPM;
1982   if (EnableLoopFlatten && Level.getSpeedupLevel() > 1)
1983     LPM.addPass(LoopFlattenPass());
1986   // FIXME: Add loop interchange.
1987
1988   // Unroll small loops and perform peeling.
1989   LPM.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
1990                                  /* OnlyWhenForced= */ !PTO.LoopUnrolling,
1992   // The loop passes in LPM (LoopFullUnrollPass) do not preserve MemorySSA.
1993   // *All* loop passes must preserve it, in order to be able to use it.
1995       std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/true));
1996
1997   MainFPM.addPass(LoopDistributePass());
1998
1999   addVectorPasses(Level, MainFPM, /* IsFullLTO */ true);
2000
2001   // Run the OpenMPOpt CGSCC pass again late.
2004
2005   invokePeepholeEPCallbacks(MainFPM, Level);
2006   MainFPM.addPass(JumpThreadingPass());
2009
2010   // Lower type metadata and the type.test intrinsic. This pass supports
2011   // clang's control flow integrity mechanisms (-fsanitize=cfi*) and needs
2012   // to be run at link time if CFI is enabled. This pass does nothing if
2013   // CFI is disabled.
2014   MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr));
2015   // Run a second time to clean up any type tests left behind by WPD for use
2016   // in ICP (which is performed earlier than this in the regular LTO pipeline).
2017   MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));
2018
2019   // Enable splitting late in the FullLTO post-link pipeline.
2022
2023   // Add late LTO optimization passes.
2024   FunctionPassManager LateFPM;
2025
2026   // LoopSink pass sinks instructions hoisted by LICM, which serves as a
2027   // canonicalization pass that enables other optimizations. As a result,
2028   // LoopSink pass needs to be a very late IR pass to avoid undoing LICM
2029   // result too early.
2030   LateFPM.addPass(LoopSinkPass());
2031
2032   // This hoists/decomposes div/rem ops. It should run after other sink/hoist
2033   // passes to avoid re-sinking, but before SimplifyCFG because it can allow
2034   // flattening of blocks.
2035   LateFPM.addPass(DivRemPairsPass());
2036
2037   // Delete basic blocks, which optimization passes may have killed.
2039                               .convertSwitchRangeToICmp(true)
2040                               .hoistCommonInsts(true)
2041                               .speculateUnpredictables(true)));
2042   MPM.addPass(createModuleToFunctionPassAdaptor(std::move(LateFPM)));
2043
2044   // Drop bodies of available externally objects to improve GlobalDCE.
2046
2047   // Now that we have optimized the program, discard unreachable functions.
2048   MPM.addPass(GlobalDCEPass(/*InLTOPostLink=*/true));
2049
2050   if (PTO.MergeFunctions)
2052
2053   if (PTO.CallGraphProfile)
2054     MPM.addPass(CGProfilePass(/*InLTOPostLink=*/true));
2055
2057
2058   // Emit annotation remarks.
2060
2061   return MPM;
2062 }
2063
// Build the minimal -O0 pipeline (asserted below): always-inlining plus
// whatever instrumentation (PGO, pseudo probes, entry/exit, matrix lowering,
// coroutine lowering) and user extension-point callbacks are configured.
2065                                                   bool LTOPreLink) {
2066   assert(Level == OptimizationLevel::O0 &&
2067          "buildO0DefaultPipeline should only be used with O0");
2070
2071   // Perform pseudo probe instrumentation in O0 mode. This is for the
2072   // consistency between different build modes. For example, a LTO build can be
2073   // mixed with an O0 prelink and an O2 postlink. Loading a sample profile in
2074   // the postlink will require pseudo probe instrumentation in the prelink.
2075   if (PGOOpt && PGOOpt->PseudoProbeForProfiling)
2077
2078   if (PGOOpt && (PGOOpt->Action == PGOOptions::IRInstr ||
2079                  PGOOpt->Action == PGOOptions::IRUse))
2081         MPM,
2082         /*RunProfileGen=*/(PGOOpt->Action == PGOOptions::IRInstr),
2083         /*IsCS=*/false, PGOOpt->AtomicCounterUpdate, PGOOpt->ProfileFile,
2084         PGOOpt->ProfileRemappingFile, PGOOpt->FS);
2085
2086   // Instrument function entry and exit before all inlining.
2088       EntryExitInstrumenterPass(/*PostInlining=*/false)));
2089
2091
2092   if (PGOOpt && PGOOpt->DebugInfoForProfiling)
2094
2096
2097   // Build a minimal pipeline based on the semantics required by LLVM,
2098   // which is just that always inlining occurs. Further, disable generating
2099   // lifetime intrinsics to avoid enabling further optimizations during
2100   // code generation.
2102                                /*InsertLifetimeIntrinsics=*/false));
2103
2104   if (PTO.MergeFunctions)
2106
2107   if (EnableMatrix)
2108     MPM.addPass(
2110
2111   if (!CGSCCOptimizerLateEPCallbacks.empty()) {
2112     CGSCCPassManager CGPM;
2114     if (!CGPM.isEmpty())
2116   }
2117   if (!LateLoopOptimizationsEPCallbacks.empty()) {
2118     LoopPassManager LPM;
2120     if (!LPM.isEmpty()) {
2122           createFunctionToLoopPassAdaptor(std::move(LPM))));
2123     }
2124   }
2125   if (!LoopOptimizerEndEPCallbacks.empty()) {
2126     LoopPassManager LPM;
2128     if (!LPM.isEmpty()) {
2130           createFunctionToLoopPassAdaptor(std::move(LPM))));
2131     }
2132   }
2133   if (!ScalarOptimizerLateEPCallbacks.empty()) {
2136     if (!FPM.isEmpty())
2138   }
2139
2141
2142   if (!VectorizerStartEPCallbacks.empty()) {
2145     if (!FPM.isEmpty())
2147   }
2148
2149   ModulePassManager CoroPM;
2150   CoroPM.addPass(CoroEarlyPass());
2151   CGSCCPassManager CGPM;
2152   CGPM.addPass(CoroSplitPass());
2153   CoroPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));
2154   CoroPM.addPass(CoroCleanupPass());
2155   CoroPM.addPass(GlobalDCEPass());
2156   MPM.addPass(CoroConditionalWrapper(std::move(CoroPM)));
2157
2159
2160   if (LTOPreLink)
2161     addRequiredLTOPreLinkPasses(MPM);
2162
2164
2165   return MPM;
2166 }
2167
// Build the default AAManager stack. Registration order below is priority
// order: BasicAA first, then the fast IR-embedded AAs, then cached GlobalsAA
// results via a readonly proxy, then any target-specific AAs.
2169   AAManager AA;
2170
2171   // The order in which these are registered determines their priority when
2172   // being queried.
2173
2174   // First we register the basic alias analysis that provides the majority of
2175   // per-function local AA logic. This is a stateless, on-demand local set of
2176   // AA techniques.
2178
2179   // Next we query fast, specialized alias analyses that wrap IR-embedded
2180   // information about aliasing.
2183
2184   // Add support for querying global aliasing information when available.
2185   // Because the `AAManager` is a function analysis and `GlobalsAA` is a module
2186   // analysis, all that the `AAManager` can do is query for any *cached*
2187   // results from `GlobalsAA` through a readonly proxy.
2190
2191   // Add target-specific alias analyses.
2192   if (TM)
2194
2195   return AA;
2196 }
aarch64 falkor hwpf fix Falkor HW Prefetch Fix Late Phase
AggressiveInstCombiner - Combine expression patterns to form expressions with fewer,...
Provides passes to inlining "always_inline" functions.
This is the interface for LLVM's primary stateless and local alias analysis.
This file provides the interface for LLVM's Call Graph Profile pass.
This header provides classes for managing passes over SCCs of the call graph.
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Definition: CommandLine.h:686
This file provides the interface for a simple, fast CSE pass.
This file provides a pass which clones the current module and runs the provided pass pipeline on the ...
Super simple passes to force specific function attrs from the commandline into the IR for debugging p...
Provides passes for computing function attributes based on interprocedural analyses.
This file provides the interface for LLVM's Global Value Numbering pass which eliminates fully redund...
This is the interface for a simple mod/ref and alias analysis over globals.
AcceleratorCodeSelection - Identify all functions reachable from a kernel, removing those that are un...
Interfaces for passes which infer implicit function attributes from the name and signature of functio...
This file provides the primary interface to the instcombine pass.
Defines passes for running instruction simplification across chunks of IR.
This file provides the interface for LLVM's PGO Instrumentation lowering pass.
See the comments on JumpThreadingPass.
static LVOptions Options
Definition: LVOptions.cpp:25
This header defines the LoopLoadEliminationPass object.
This header provides classes for managing a pipeline of passes over loops in LLVM IR.
The header file for the LowerConstantIntrinsics pass as used by the new pass manager.
The header file for the LowerExpectIntrinsic pass as used by the new pass manager.
This pass performs merges of loads and stores on both sides of a diamond (hammock).
This file provides the interface for LLVM's Global Value Numbering pass.
This header enumerates the LLVM-provided high-level optimization levels.
This file provides the interface for IR based instrumentation passes ( (profile-gen,...
Define option tunables for PGO.
ModulePassManager MPM
static cl::opt< bool > EnableMergeFunctions("enable-merge-functions", cl::init(false), cl::Hidden, cl::desc("Enable function merging as part of the optimization pipeline"))
static cl::opt< bool > EnableGlobalAnalyses("enable-global-analyses", cl::init(true), cl::Hidden, cl::desc("Enable inter-procedural analyses"))
static cl::opt< bool > EnableIROutliner("ir-outliner", cl::init(false), cl::Hidden, cl::desc("Enable ir outliner pass"))
static cl::opt< bool > RunNewGVN("enable-newgvn", cl::init(false), cl::Hidden, cl::desc("Run the NewGVN pass"))
static cl::opt< bool > DisablePreInliner("disable-preinline", cl::init(false), cl::Hidden, cl::desc("Disable pre-instrumentation inliner"))
static cl::opt< bool > EnableEagerlyInvalidateAnalyses("eagerly-invalidate-analyses", cl::init(true), cl::Hidden, cl::desc("Eagerly invalidate more analyses in default pipelines"))
static cl::opt< bool > ExtraVectorizerPasses("extra-vectorizer-passes", cl::init(false), cl::Hidden, cl::desc("Run cleanup optimization passes after vectorization"))
static void addAnnotationRemarksPass(ModulePassManager &MPM)
static cl::opt< bool > EnablePostPGOLoopRotation("enable-post-pgo-loop-rotation", cl::init(true), cl::Hidden, cl::desc("Run the loop rotation transformation after PGO instrumentation"))
static InlineParams getInlineParamsFromOptLevel(OptimizationLevel Level)
static cl::opt< bool > EnableGVNSink("enable-gvn-sink", cl::desc("Enable the GVN sinking pass (default = off)"))
static cl::opt< bool > PerformMandatoryInliningsFirst("mandatory-inlining-first", cl::init(false), cl::Hidden, cl::desc("Perform mandatory inlinings module-wide, before performing " "inlining"))
static cl::opt< bool > RunPartialInlining("enable-partial-inlining", cl::init(false), cl::Hidden, cl::desc("Run Partial inlining pass"))
static cl::opt< bool > EnableGVNHoist("enable-gvn-hoist", cl::desc("Enable the GVN hoisting pass (default = off)"))
static cl::opt< bool > EnableDFAJumpThreading("enable-dfa-jump-thread", cl::desc("Enable DFA jump threading"), cl::init(false), cl::Hidden)
static cl::opt< bool > EnableCHR("enable-chr", cl::init(true), cl::Hidden, cl::desc("Enable control height reduction optimization (CHR)"))
static cl::opt< bool > EnableHotColdSplit("hot-cold-split", cl::desc("Enable hot-cold splitting pass"))
static cl::opt< bool > EnableLoopInterchange("enable-loopinterchange", cl::init(false), cl::Hidden, cl::desc("Enable the experimental LoopInterchange Pass"))
static cl::opt< bool > EnableSampledInstr("enable-sampled-instrumentation", cl::init(false), cl::Hidden, cl::desc("Enable profile instrumentation sampling (default = off)"))
static cl::opt< int > PreInlineThreshold("preinline-threshold", cl::Hidden, cl::init(75), cl::desc("Control the amount of inlining in pre-instrumentation inliner " "(default = 75)"))
static cl::opt< bool > EnableLoopHeaderDuplication("enable-loop-header-duplication", cl::init(false), cl::Hidden, cl::desc("Enable loop header duplication at any optimization level"))
static cl::opt< bool > EnablePGOForceFunctionAttrs("enable-pgo-force-function-attrs", cl::desc("Enable pass to set function attributes based on PGO profiles"), cl::init(false))
static cl::opt< bool > EnableUnrollAndJam("enable-unroll-and-jam", cl::init(false), cl::Hidden, cl::desc("Enable Unroll And Jam Pass"))
static cl::opt< bool > EnableModuleInliner("enable-module-inliner", cl::init(false), cl::Hidden, cl::desc("Enable module inliner"))
static cl::opt< bool > EnableMatrix("enable-matrix", cl::init(false), cl::Hidden, cl::desc("Enable lowering of the matrix intrinsics"))
static cl::opt< AttributorRunOption > AttributorRun("attributor-enable", cl::Hidden, cl::init(AttributorRunOption::NONE), cl::desc("Enable the attributor inter-procedural deduction pass"), cl::values(clEnumValN(AttributorRunOption::ALL, "all", "enable all attributor runs"), clEnumValN(AttributorRunOption::MODULE, "module", "enable module-wide attributor runs"), clEnumValN(AttributorRunOption::CGSCC, "cgscc", "enable call graph SCC attributor runs"), clEnumValN(AttributorRunOption::NONE, "none", "disable attributor runs")))
static cl::opt< bool > EnableOrderFileInstrumentation("enable-order-file-instrumentation", cl::init(false), cl::Hidden, cl::desc("Enable order file instrumentation (default = off)"))
static cl::opt< bool > UseLoopVersioningLICM("enable-loop-versioning-licm", cl::init(false), cl::Hidden, cl::desc("Enable the experimental Loop Versioning LICM pass"))
static cl::opt< bool > EnableSyntheticCounts("enable-npm-synthetic-counts", cl::Hidden, cl::desc("Run synthetic function entry count generation " "pass"))
static bool isLTOPreLink(ThinOrFullLTOPhase Phase)
static cl::opt< bool > EnablePGOInlineDeferral("enable-npm-pgo-inline-deferral", cl::init(true), cl::Hidden, cl::desc("Enable inline deferral during PGO"))
Flag to enable inline deferral during PGO.
static cl::opt< bool > EnableJumpTableToSwitch("enable-jump-table-to-switch", cl::desc("Enable JumpTableToSwitch pass (default = off)"))
static cl::opt< InliningAdvisorMode > UseInlineAdvisor("enable-ml-inliner", cl::init(InliningAdvisorMode::Default), cl::Hidden, cl::desc("Enable ML policy for inliner. Currently trained for -Oz only"), cl::values(clEnumValN(InliningAdvisorMode::Default, "default", "Heuristics-based inliner version"), clEnumValN(InliningAdvisorMode::Development, "development", "Use development mode (runtime-loadable model)"), clEnumValN(InliningAdvisorMode::Release, "release", "Use release mode (AOT-compiled model)")))
static cl::opt< bool > FlattenedProfileUsed("flattened-profile-used", cl::init(false), cl::Hidden, cl::desc("Indicate the sample profile being used is flattened, i.e., " "no inline hierarchy exists in the profile"))
static cl::opt< bool > EnableConstraintElimination("enable-constraint-elimination", cl::init(true), cl::Hidden, cl::desc("Enable pass to eliminate conditions based on linear constraints"))
static cl::opt< bool > EnableLoopFlatten("enable-loop-flatten", cl::init(false), cl::Hidden, cl::desc("Enable the LoopFlatten Pass"))
This header defines various interfaces for pass management in LLVM.
This file implements relative lookup table converter that converts lookup tables to relative lookup t...
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file provides the interface for LLVM's Scalar Replacement of Aggregates pass.
This file provides the interface for the pseudo probe implementation for AutoFDO.
This file provides the interface for the sampled PGO loader pass.
This is the interface for a metadata-based scoped no-alias analysis.
This file provides the interface for the pass responsible for both simplifying and canonicalizing the...
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
This is the interface for a metadata-based TBAA.
Defines the virtual file system interface vfs::FileSystem.
A manager for alias analyses.
void registerFunctionAnalysis()
Register a specific AA result.
void registerModuleAnalysis()
Register a specific AA result.
Inlines functions marked as "always_inline".
Definition: AlwaysInliner.h:32
Argument promotion pass.
Analysis pass providing a never-invalidated alias analysis result.
Simple pass that canonicalizes aliases.
A pass that merges duplicate global constants into a single constant.
Definition: ConstantMerge.h:29
This class implements a trivial dead store elimination.
Eliminate dead arguments (and return values) from functions.
A pass that transforms external global definitions into declarations.
Pass embeds a copy of the module optimized with the provided pass pipeline into a global variable.
The core GVN pass object.
Definition: GVN.h:117
Pass to remove unused function declarations.
Definition: GlobalDCE.h:36
Optimize globals that never have their address taken.
Definition: GlobalOpt.h:25
Pass to perform split of global variables.
Definition: GlobalSplit.h:26
Analysis pass providing a never-invalidated alias analysis result.
Pass to outline cold regions.
Pass to perform interprocedural constant propagation.
Definition: SCCP.h:48
Pass to outline similar regions.
Definition: IROutliner.h:444
Run instruction simplification across each instruction in the function.
The instrumentation pass for recording function order.
Instrumentation based profiling lowering pass.
A smart pointer to a reference-counted object that inherits from RefCountedBase or ThreadSafeRefCount...
This pass performs 'jump threading', which looks at blocks that have multiple predecessors and multip...
Definition: JumpThreading.h:79
Performs Loop Invariant Code Motion Pass.
Definition: LICM.h:66
Loop unroll pass that only does full loop unrolling and peeling.
Performs Loop Idiom Recognize Pass.
Performs Loop Inst Simplify Pass.
A simple loop rotation transformation.
Definition: LoopRotation.h:24
Performs basic CFG simplifications to assist other loop passes.
A pass that does profile-guided sinking of instructions into loops.
Definition: LoopSink.h:33
A simple loop rotation transformation.
Loop unroll pass that will support both full and partial unrolling.
Merge identical functions.
The module inliner pass for the new pass manager.
Definition: ModuleInliner.h:27
Module pass, wrapping the inliner pass.
Definition: Inliner.h:62
void addModulePass(T Pass)
Add a module pass that runs before the CGSCC passes.
Definition: Inliner.h:78
Class to hold module path string table and global value map, and encapsulate methods for operating on...
Simple pass that provides a name to every anonymous globals.
OpenMP optimizations pass.
Definition: OpenMPOpt.h:42
static const OptimizationLevel O3
Optimize for fast execution as much as possible.
static const OptimizationLevel Oz
A very specialized mode that will optimize for code size at any and all costs.
static const OptimizationLevel O0
Disable as many optimizations as possible.
static const OptimizationLevel Os
Similar to O2 but tries to optimize for small code size instead of fast execution without triggering ...
static const OptimizationLevel O2
Optimize for fast execution as much as possible without triggering significant incremental compile ti...
static const OptimizationLevel O1
Optimize quickly without destroying debuggability.
The indirect function call promotion pass.
The instrumentation (profile-instr-gen) pass for IR based PGO.
The instrumentation (profile-instr-gen) pass for IR based PGO.
The profile annotation (profile-instr-use) pass for IR based PGO.
The profile size based optimization pass for memory intrinsics.
Pass to remove unused function declarations.
ModulePassManager buildO0DefaultPipeline(OptimizationLevel Level, bool LTOPreLink=false)
Build an O0 pipeline with the minimal semantically required passes.
void invokeFullLinkTimeOptimizationLastEPCallbacks(ModulePassManager &MPM, OptimizationLevel Level)
ModuleInlinerWrapperPass buildInlinerPipeline(OptimizationLevel Level, ThinOrFullLTOPhase Phase)
Construct the module pipeline that performs inlining as well as the inlining-driven cleanups.
void invokeOptimizerLastEPCallbacks(ModulePassManager &MPM, OptimizationLevel Level)
void invokeVectorizerStartEPCallbacks(FunctionPassManager &FPM, OptimizationLevel Level)
AAManager buildDefaultAAPipeline()
Build the default AAManager with the default alias analysis pipeline registered.
void invokeCGSCCOptimizerLateEPCallbacks(CGSCCPassManager &CGPM, OptimizationLevel Level)
ModulePassManager buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level)
Build a pre-link, ThinLTO-targeting default optimization pipeline to a pass manager.
void invokeScalarOptimizerLateEPCallbacks(FunctionPassManager &FPM, OptimizationLevel Level)
ModulePassManager buildPerModuleDefaultPipeline(OptimizationLevel Level, bool LTOPreLink=false)
Build a per-module default optimization pipeline.
void invokePipelineStartEPCallbacks(ModulePassManager &MPM, OptimizationLevel Level)
FunctionPassManager buildFunctionSimplificationPipeline(OptimizationLevel Level, ThinOrFullLTOPhase Phase)
Construct the core LLVM function canonicalization and simplification pipeline.
void invokePeepholeEPCallbacks(FunctionPassManager &FPM, OptimizationLevel Level)
void invokeLoopOptimizerEndEPCallbacks(LoopPassManager &LPM, OptimizationLevel Level)
ModulePassManager buildLTODefaultPipeline(OptimizationLevel Level, ModuleSummaryIndex *ExportSummary)
Build an LTO default optimization pipeline to a pass manager.
ModulePassManager buildModuleInlinerPipeline(OptimizationLevel Level, ThinOrFullLTOPhase Phase)
Construct the module pipeline that performs inlining with module inliner pass.
ModulePassManager buildThinLTODefaultPipeline(OptimizationLevel Level, const ModuleSummaryIndex *ImportSummary)
Build a ThinLTO default optimization pipeline to a pass manager.
void invokeLateLoopOptimizationsEPCallbacks(LoopPassManager &LPM, OptimizationLevel Level)
void invokeOptimizerEarlyEPCallbacks(ModulePassManager &MPM, OptimizationLevel Level)
void invokePipelineEarlySimplificationEPCallbacks(ModulePassManager &MPM, OptimizationLevel Level)
void invokeFullLinkTimeOptimizationEarlyEPCallbacks(ModulePassManager &MPM, OptimizationLevel Level)
ModulePassManager buildFatLTODefaultPipeline(OptimizationLevel Level, bool ThinLTO, bool EmitSummary)
Build a fat object default optimization pipeline.
ModulePassManager buildModuleSimplificationPipeline(OptimizationLevel Level, ThinOrFullLTOPhase Phase)
Construct the core LLVM module canonicalization and simplification pipeline.
ModulePassManager buildModuleOptimizationPipeline(OptimizationLevel Level, ThinOrFullLTOPhase LTOPhase)
Construct the core LLVM module optimization pipeline.
void addPGOInstrPassesForO0(ModulePassManager &MPM, bool RunProfileGen, bool IsCS, bool AtomicCounterUpdate, std::string ProfileFile, std::string ProfileRemappingFile, IntrusiveRefCntPtr< vfs::FileSystem > FS)
Add PGOInstrumenation passes for O0 only.
ModulePassManager buildLTOPreLinkDefaultPipeline(OptimizationLevel Level)
Build a pre-link, LTO-targeting default optimization pipeline to a pass manager.
LLVM_ATTRIBUTE_MINSIZE std::enable_if_t< is_detected< HasRunOnLoopT, PassT >::value > addPass(PassT &&Pass)
LLVM_ATTRIBUTE_MINSIZE std::enable_if_t<!std::is_same_v< PassT, PassManager > > addPass(PassT &&Pass)
Definition: PassManager.h:195
bool isEmpty() const
Returns if the pass manager contains any passes.
Definition: PassManager.h:217
unsigned LicmMssaNoAccForPromotionCap
Tuning option to disable promotion to scalars in LICM with MemorySSA, if the number of access is too ...
Definition: PassBuilder.h:74
bool SLPVectorization
Tuning option to enable/disable slp loop vectorization, set based on opt level.
Definition: PassBuilder.h:59
int InlinerThreshold
Tuning option to override the default inliner threshold.
Definition: PassBuilder.h:88
bool CallGraphProfile
Tuning option to enable/disable call graph profile.
Definition: PassBuilder.h:78
bool MergeFunctions
Tuning option to enable/disable function merging.
Definition: PassBuilder.h:85
bool ForgetAllSCEVInLoopUnroll
Tuning option to forget all SCEV loops in LoopUnroll.
Definition: PassBuilder.h:66
unsigned LicmMssaOptCap
Tuning option to cap the number of calls to retrieve clobbering accesses in MemorySSA,...
Definition: PassBuilder.h:70
bool LoopInterleaving
Tuning option to set loop interleaving on/off, set based on opt level.
Definition: PassBuilder.h:51
PipelineTuningOptions()
Constructor sets pipeline tuning defaults based on cl::opts.
bool LoopUnrolling
Tuning option to enable/disable loop unrolling. Its default value is true.
Definition: PassBuilder.h:62
bool LoopVectorization
Tuning option to enable/disable loop vectorization, set based on opt level.
Definition: PassBuilder.h:55
Reassociate commutative expressions.
Definition: Reassociate.h:85
A pass to do RPO deduction and propagation of function attributes.
Definition: FunctionAttrs.h:73
This pass performs function-level constant propagation and merging.
Definition: SCCP.h:29
The sample profiler data loader pass.
Definition: SampleProfile.h:39
Analysis pass providing a never-invalidated alias analysis result.
This pass transforms loops that contain branches or switches on loop- invariant conditions to have mu...
A pass to simplify and canonicalize the CFG of a function.
Definition: SimplifyCFG.h:29
virtual void registerDefaultAliasAnalyses(AAManager &)
Allow the target to register alias analyses with the AAManager for use with the new pass manager.
Analysis pass providing a never-invalidated alias analysis result.
Optimize scalar/vector interactions in IR using target cost models.
Definition: VectorCombine.h:23
Interfaces for registering analysis passes, producing common pass manager configurations,...
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
Definition: CommandLine.h:711
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
cl::opt< bool > EnableKnowledgeRetention
ModuleToFunctionPassAdaptor createModuleToFunctionPassAdaptor(FunctionPassT &&Pass, bool EagerlyInvalidate=false)
A function to deduce a function pass type and wrap it in the templated adaptor.
Definition: PassManager.h:848
@ MODULE
Definition: Attributor.h:6418
@ CGSCC
Definition: Attributor.h:6419
ThinOrFullLTOPhase
This enumerates the LLVM full LTO or ThinLTO optimization phases.
Definition: Pass.h:76
@ FullLTOPreLink
Full LTO prelink phase.
@ ThinLTOPostLink
ThinLTO postlink (backend compile) phase.
@ None
No LTO/ThinLTO behavior needed.
@ FullLTOPostLink
Full LTO postlink (backend compile) phase.
@ ThinLTOPreLink
ThinLTO prelink (summary) phase.
ModuleToPostOrderCGSCCPassAdaptor createModuleToPostOrderCGSCCPassAdaptor(CGSCCPassT &&Pass)
A function to deduce a CGSCC pass type and wrap it in the templated adaptor.
CGSCCToFunctionPassAdaptor createCGSCCToFunctionPassAdaptor(FunctionPassT &&Pass, bool EagerlyInvalidate=false, bool NoRerun=false)
A function to deduce a function pass type and wrap it in the templated adaptor.
cl::opt< bool > ForgetSCEVInLoopUnroll
bool AreStatisticsEnabled()
Check if statistics are enabled.
Definition: Statistic.cpp:139
cl::opt< bool > EnableInferAlignmentPass
cl::opt< bool > EnableMemProfContextDisambiguation
Enable MemProf context disambiguation for thin link.
InlineParams getInlineParams()
Generate the parameters to tune the inline cost analysis based only on the commandline options.
cl::opt< unsigned > SetLicmMssaNoAccForPromotionCap
std::enable_if_t< is_detected< HasRunOnLoopT, LoopPassT >::value, FunctionToLoopPassAdaptor > createFunctionToLoopPassAdaptor(LoopPassT &&Pass, bool UseMemorySSA=false, bool UseBlockFrequencyInfo=false, bool UseBranchProbabilityInfo=false)
A function to deduce a loop pass type and wrap it in the templated adaptor.
cl::opt< unsigned > MaxDevirtIterations("max-devirt-iterations", cl::ReallyHidden, cl::init(4))
cl::opt< unsigned > SetLicmMssaOptCap
A DCE pass that assumes instructions are dead until proven otherwise.
Definition: ADCE.h:31
Pass to convert @llvm.global.annotations to !annotation metadata.
This pass attempts to minimize the number of assume intrinsics without losing any information.
Hoist/decompose integer division and remainder instructions to enable CFG improvements and better cod...
Definition: DivRemPairs.h:23
A simple and fast domtree-based CSE pass.
Definition: EarlyCSE.h:30
A pass manager to run a set of extra function simplification passes after vectorization,...
Pass which forces specific function attributes into the IR, primarily as a debugging tool.
A simple and fast domtree-based GVN pass to hoist common expressions from sibling branches.
Definition: GVN.h:392
Uses an "inverted" value numbering to decide the similarity of expressions and sinks similar expressi...
Definition: GVN.h:399
A set of parameters to control various transforms performed by IPSCCP pass.
Definition: SCCP.h:35
A pass which infers function attributes from the names and signatures of function declarations in a m...
Provides context on when an inline advisor is constructed in the pipeline (e.g., link phase,...
Definition: InlineAdvisor.h:59
Thresholds to tune inline cost analysis.
Definition: InlineCost.h:206
std::optional< int > HotCallSiteThreshold
Threshold to use when the callsite is considered hot.
Definition: InlineCost.h:223
int DefaultThreshold
The default threshold to start with for a callee.
Definition: InlineCost.h:208
std::optional< bool > EnableDeferral
Indicate whether we should allow inline deferral.
Definition: InlineCost.h:236
std::optional< int > HintThreshold
Threshold to use for callees with inline hint.
Definition: InlineCost.h:211
Options for the frontend instrumentation based profiling pass.
A no-op pass template which simply forces a specific analysis result to be invalidated.
Definition: PassManager.h:901
Pass to forward loads in a loop around the backedge to subsequent iterations.
A set of parameters used to control various transforms performed by the LoopUnroll pass.
The LoopVectorize Pass.
Computes function attributes in post-order over the call graph.
Definition: FunctionAttrs.h:49
A utility pass template to force an analysis result to be available.
Definition: PassManager.h:874