1 //===--- SelectOptimize.cpp - Convert select to branches if profitable ---===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass converts selects to conditional jumps when profitable.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "llvm/ADT/Optional.h"
14 #include "llvm/ADT/SmallVector.h"
15 #include "llvm/ADT/Statistic.h"
18 #include "llvm/Analysis/LoopInfo.h"
22 #include "llvm/CodeGen/Passes.h"
27 #include "llvm/IR/BasicBlock.h"
28 #include "llvm/IR/Dominators.h"
29 #include "llvm/IR/Function.h"
30 #include "llvm/IR/IRBuilder.h"
31 #include "llvm/IR/Instruction.h"
32 #include "llvm/IR/ProfDataUtils.h"
33 #include "llvm/InitializePasses.h"
34 #include "llvm/Pass.h"
35 #include "llvm/Support/ScaledNumber.h"
36 #include "llvm/Target/TargetMachine.h"
37 #include "llvm/Transforms/Utils/SizeOpts.h"
38 #include <algorithm>
39 #include <memory>
40 #include <queue>
41 #include <stack>
42 #include <string>
43 
44 using namespace llvm;
45 
46 #define DEBUG_TYPE "select-optimize"
47 
48 STATISTIC(NumSelectOptAnalyzed,
49  "Number of select groups considered for conversion to branch");
50 STATISTIC(NumSelectConvertedExpColdOperand,
51  "Number of select groups converted due to expensive cold operand");
52 STATISTIC(NumSelectConvertedHighPred,
53  "Number of select groups converted due to high-predictability");
54 STATISTIC(NumSelectUnPred,
55  "Number of select groups not converted due to unpredictability");
56 STATISTIC(NumSelectColdBB,
57  "Number of select groups not converted due to cold basic block");
58 STATISTIC(NumSelectConvertedLoop,
59  "Number of select groups converted due to loop-level analysis");
60 STATISTIC(NumSelectsConverted, "Number of selects converted");
61 
63  "cold-operand-threshold",
64  cl::desc("Maximum frequency of path for an operand to be considered cold."),
65  cl::init(20), cl::Hidden);
66 
68  "cold-operand-max-cost-multiplier",
69  cl::desc("Maximum cost multiplier of TCC_expensive for the dependence "
70  "slice of a cold operand to be considered inexpensive."),
71  cl::init(1), cl::Hidden);
72 
73 static cl::opt<unsigned>
74  GainGradientThreshold("select-opti-loop-gradient-gain-threshold",
75  cl::desc("Gradient gain threshold (%)."),
76  cl::init(25), cl::Hidden);
77 
78 static cl::opt<unsigned>
79  GainCycleThreshold("select-opti-loop-cycle-gain-threshold",
80  cl::desc("Minimum gain per loop (in cycles) threshold."),
81  cl::init(4), cl::Hidden);
82 
84  "select-opti-loop-relative-gain-threshold",
85  cl::desc(
86  "Minimum relative gain per loop threshold (1/X). Defaults to 12.5%"),
87  cl::init(8), cl::Hidden);
88 
90  "mispredict-default-rate", cl::Hidden, cl::init(25),
91  cl::desc("Default mispredict rate (initialized to 25%)."));
92 
93 static cl::opt<bool>
94  DisableLoopLevelHeuristics("disable-loop-level-heuristics", cl::Hidden,
95  cl::init(false),
96  cl::desc("Disable loop-level heuristics."));
97 
98 namespace {
99 
100 class SelectOptimize : public FunctionPass {
101  const TargetMachine *TM = nullptr;
102  const TargetSubtargetInfo *TSI;
103  const TargetLowering *TLI = nullptr;
104  const TargetTransformInfo *TTI = nullptr;
105  const LoopInfo *LI;
106  DominatorTree *DT;
107  std::unique_ptr<BlockFrequencyInfo> BFI;
108  std::unique_ptr<BranchProbabilityInfo> BPI;
109  ProfileSummaryInfo *PSI;
110  OptimizationRemarkEmitter *ORE;
111  TargetSchedModel TSchedModel;
112 
113 public:
114  static char ID;
115 
116  SelectOptimize() : FunctionPass(ID) {
117  initializeSelectOptimizePass(*PassRegistry::getPassRegistry());
118  }
119 
120  bool runOnFunction(Function &F) override;
121 
122  void getAnalysisUsage(AnalysisUsage &AU) const override {
123  AU.addRequired<ProfileSummaryInfoWrapperPass>();
124  AU.addRequired<TargetPassConfig>();
125  AU.addRequired<TargetTransformInfoWrapperPass>();
126  AU.addRequired<DominatorTreeWrapperPass>();
127  AU.addRequired<LoopInfoWrapperPass>();
128  AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
129  }
130 
131 private:
132  // Select groups consist of consecutive select instructions with the same
133  // condition.
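  // For illustration (hypothetical IR), the two selects below share the
  // condition %cmp and would form one select group, whereas a select on a
  // different condition would start a new group:
  //   %cmp = icmp ult i32 %a, %b
  //   %s1 = select i1 %cmp, i32 %x, i32 %y
  //   %s2 = select i1 %cmp, i32 %u, i32 %v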
134  using SelectGroup = SmallVector<SelectInst *, 2>;
135  using SelectGroups = SmallVector<SelectGroup, 2>;
136 
137  using Scaled64 = ScaledNumber<uint64_t>;
138 
139  struct CostInfo {
140  /// Predicated cost (with selects as conditional moves).
141  Scaled64 PredCost;
142  /// Non-predicated cost (with selects converted to branches).
143  Scaled64 NonPredCost;
144  };
145 
146  // Converts select instructions of a function to conditional jumps when deemed
147  // profitable. Returns true if at least one select was converted.
148  bool optimizeSelects(Function &F);
149 
150  // Heuristics for determining which select instructions can be profitably
151  // converted to branches. Separate heuristics for selects in inner-most loops
152  // and the rest of code regions (base heuristics for non-inner-most loop
153  // regions).
154  void optimizeSelectsBase(Function &F, SelectGroups &ProfSIGroups);
155  void optimizeSelectsInnerLoops(Function &F, SelectGroups &ProfSIGroups);
156 
157  // Converts to branches the select groups that were deemed
158  // profitable-to-convert.
159  void convertProfitableSIGroups(SelectGroups &ProfSIGroups);
160 
161  // Splits selects of a given basic block into select groups.
162  void collectSelectGroups(BasicBlock &BB, SelectGroups &SIGroups);
163 
164  // Determines for which select groups it is profitable converting to branches
165  // (base and inner-most-loop heuristics).
166  void findProfitableSIGroupsBase(SelectGroups &SIGroups,
167  SelectGroups &ProfSIGroups);
168  void findProfitableSIGroupsInnerLoops(const Loop *L, SelectGroups &SIGroups,
169  SelectGroups &ProfSIGroups);
170 
171  // Determines if a select group should be converted to a branch (base
172  // heuristics).
173  bool isConvertToBranchProfitableBase(const SmallVector<SelectInst *, 2> &ASI);
174 
175  // Returns true if there are expensive instructions in the cold value
176  // operand's (if any) dependence slice of any of the selects of the given
177  // group.
178  bool hasExpensiveColdOperand(const SmallVector<SelectInst *, 2> &ASI);
179 
180  // For a given source instruction, collect its backwards dependence slice
181  // consisting of instructions exclusively computed for producing the operands
182  // of the source instruction.
183  void getExclBackwardsSlice(Instruction *I, std::stack<Instruction *> &Slice,
184  Instruction *SI, bool ForSinking = false);
185 
186  // Returns true if the condition of the select is highly predictable.
187  bool isSelectHighlyPredictable(const SelectInst *SI);
188 
189  // Loop-level checks to determine if a non-predicated version (with branches)
190  // of the given loop is more profitable than its predicated version.
191  bool checkLoopHeuristics(const Loop *L, const CostInfo LoopDepth[2]);
192 
193  // Computes instruction and loop-critical-path costs for both the predicated
194  // and non-predicated version of the given loop.
195  bool computeLoopCosts(const Loop *L, const SelectGroups &SIGroups,
196  DenseMap<const Instruction *, CostInfo> &InstCostMap,
197  CostInfo *LoopCost);
198 
199  // Returns a set of all the select instructions in the given select groups.
200  SmallPtrSet<const Instruction *, 2> getSIset(const SelectGroups &SIGroups);
201 
202  // Returns the latency cost of a given instruction.
203  Optional<uint64_t> computeInstCost(const Instruction *I);
204 
205  // Returns the misprediction cost of a given select when converted to branch.
206  Scaled64 getMispredictionCost(const SelectInst *SI, const Scaled64 CondCost);
207 
208  // Returns the cost of a branch when the prediction is correct.
209  Scaled64 getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
210  const SelectInst *SI);
211 
212  // Returns true if the target architecture supports lowering a given select.
213  bool isSelectKindSupported(SelectInst *SI);
214 };
215 } // namespace
216 
217 char SelectOptimize::ID = 0;
218 
219 INITIALIZE_PASS_BEGIN(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
220  false)
221 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
222 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
223 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
224 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
225 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
226 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
227 INITIALIZE_PASS_END(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
228  false)
229 
230 FunctionPass *llvm::createSelectOptimizePass() { return new SelectOptimize(); }
231 
232 bool SelectOptimize::runOnFunction(Function &F) {
233  TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
234  TSI = TM->getSubtargetImpl(F);
235  TLI = TSI->getTargetLowering();
236 
237  // If none of the select types is supported then skip this pass.
238  // This is an optimization pass. Legality issues will be handled by
239  // instruction selection.
240  if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) &&
241  !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) &&
242  !TLI->isSelectSupported(TargetLowering::VectorMaskSelect))
243  return false;
244 
245  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
246 
247  if (!TTI->enableSelectOptimize())
248  return false;
249 
250  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
251  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
252  BPI.reset(new BranchProbabilityInfo(F, *LI));
253  BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI));
254  PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
255  ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
256  TSchedModel.init(TSI);
257 
258  // When optimizing for size, selects are preferable over branches.
259  if (F.hasOptSize() || llvm::shouldOptimizeForSize(&F, PSI, BFI.get()))
260  return false;
261 
262  return optimizeSelects(F);
263 }
264 
265 bool SelectOptimize::optimizeSelects(Function &F) {
266  // Determine for which select groups it is profitable converting to branches.
267  SelectGroups ProfSIGroups;
268  // Base heuristics apply only to non-loops and outer loops.
269  optimizeSelectsBase(F, ProfSIGroups);
270  // Separate heuristics for inner-most loops.
271  optimizeSelectsInnerLoops(F, ProfSIGroups);
272 
273  // Convert to branches the select groups that were deemed
274  // profitable-to-convert.
275  convertProfitableSIGroups(ProfSIGroups);
276 
277  // Code modified if at least one select group was converted.
278  return !ProfSIGroups.empty();
279 }
280 
281 void SelectOptimize::optimizeSelectsBase(Function &F,
282  SelectGroups &ProfSIGroups) {
283  // Collect all the select groups.
284  SelectGroups SIGroups;
285  for (BasicBlock &BB : F) {
286  // Base heuristics apply only to non-loops and outer loops.
287  Loop *L = LI->getLoopFor(&BB);
288  if (L && L->isInnermost())
289  continue;
290  collectSelectGroups(BB, SIGroups);
291  }
292 
293  // Determine for which select groups it is profitable converting to branches.
294  findProfitableSIGroupsBase(SIGroups, ProfSIGroups);
295 }
296 
297 void SelectOptimize::optimizeSelectsInnerLoops(Function &F,
298  SelectGroups &ProfSIGroups) {
299  SmallVector<Loop *, 4> Loops(LI->begin(), LI->end());
300  // Need to check size on each iteration as we accumulate child loops.
301  for (unsigned long i = 0; i < Loops.size(); ++i)
302  for (Loop *ChildL : Loops[i]->getSubLoops())
303  Loops.push_back(ChildL);
304 
305  for (Loop *L : Loops) {
306  if (!L->isInnermost())
307  continue;
308 
309  SelectGroups SIGroups;
310  for (BasicBlock *BB : L->getBlocks())
311  collectSelectGroups(*BB, SIGroups);
312 
313  findProfitableSIGroupsInnerLoops(L, SIGroups, ProfSIGroups);
314  }
315 }
316 
317 /// If \p isTrue is true, return the true value of \p SI, otherwise return
318 /// false value of \p SI. If the true/false value of \p SI is defined by any
319 /// select instructions in \p Selects, look through the defining select
320 /// instruction until the true/false value is not defined in \p Selects.
321 static Value *
322 getTrueOrFalseValue(SelectInst *SI, bool isTrue,
323  const SmallPtrSet<const Instruction *, 2> &Selects) {
324  Value *V = nullptr;
325  for (SelectInst *DefSI = SI; DefSI != nullptr && Selects.count(DefSI);
326  DefSI = dyn_cast<SelectInst>(V)) {
327  assert(DefSI->getCondition() == SI->getCondition() &&
328  "The condition of DefSI does not match with SI");
329  V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue());
330  }
331  assert(V && "Failed to get select true/false value");
332  return V;
333 }
334 
335 void SelectOptimize::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
336  for (SelectGroup &ASI : ProfSIGroups) {
337  // The code transformation here is a modified version of the sinking
338  // transformation in CodeGenPrepare::optimizeSelectInst with a more
339  // aggressive strategy of which instructions to sink.
340  //
341  // TODO: eliminate the redundancy of logic transforming selects to branches
342  // by removing CodeGenPrepare::optimizeSelectInst and optimizing here
343  // selects for all cases (with and without profile information).
344 
345  // Transform a sequence like this:
346  // start:
347  // %cmp = cmp uge i32 %a, %b
348  // %sel = select i1 %cmp, i32 %c, i32 %d
349  //
350  // Into:
351  // start:
352  // %cmp = cmp uge i32 %a, %b
353  // %cmp.frozen = freeze %cmp
354  // br i1 %cmp.frozen, label %select.true, label %select.false
355  // select.true:
356  // br label %select.end
357  // select.false:
358  // br label %select.end
359  // select.end:
360  // %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
361  //
362  // %cmp should be frozen, otherwise it may introduce undefined behavior.
363  // In addition, we may sink instructions that produce %c or %d into the
364  // destination(s) of the new branch.
365  // If the true or false blocks do not contain a sunken instruction, that
366  // block and its branch may be optimized away. In that case, one side of the
367  // first branch will point directly to select.end, and the corresponding PHI
368  // predecessor block will be the start block.
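  // For illustration (hypothetical IR), a variant where %c is produced by a
  // single-use instruction that gets sunk into the true block, while the
  // false side has nothing to sink and goes straight to select.end:
  // start:
  // %cmp = cmp uge i32 %a, %b
  // %cmp.frozen = freeze %cmp
  // br i1 %cmp.frozen, label %select.true.sink, label %select.end
  // select.true.sink:
  // %c = add i32 %x, 1
  // br label %select.end
  // select.end:
  // %sel = phi i32 [ %c, %select.true.sink ], [ %d, %start ]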
369 
370  // Find all the instructions that can be soundly sunk to the true/false
371  // blocks. These are instructions that are computed solely for producing the
372  // operands of the select instructions in the group and can be sunk without
373  // breaking the semantics of the LLVM IR (e.g., cannot sink instructions
374  // with side effects).
375  SmallVector<std::stack<Instruction *>, 2> TrueSlices, FalseSlices;
376  typedef std::stack<Instruction *>::size_type StackSizeType;
377  StackSizeType maxTrueSliceLen = 0, maxFalseSliceLen = 0;
378  for (SelectInst *SI : ASI) {
379  // For each select, compute the sinkable dependence chains of the true and
380  // false operands.
381  if (auto *TI = dyn_cast<Instruction>(SI->getTrueValue())) {
382  std::stack<Instruction *> TrueSlice;
383  getExclBackwardsSlice(TI, TrueSlice, SI, true);
384  maxTrueSliceLen = std::max(maxTrueSliceLen, TrueSlice.size());
385  TrueSlices.push_back(TrueSlice);
386  }
387  if (auto *FI = dyn_cast<Instruction>(SI->getFalseValue())) {
388  std::stack<Instruction *> FalseSlice;
389  getExclBackwardsSlice(FI, FalseSlice, SI, true);
390  maxFalseSliceLen = std::max(maxFalseSliceLen, FalseSlice.size());
391  FalseSlices.push_back(FalseSlice);
392  }
393  }
394  // In the case of multiple select instructions in the same group, the order
395  // of non-dependent instructions (instructions of different dependence
396  // slices) in the true/false blocks appears to affect performance.
397  // Interleaving the slices seems to experimentally be the optimal approach.
398  // This interleaving scheduling allows for more ILP (with a natural downside
399  // of slightly increasing register pressure) compared to a simple ordering of
400  // one whole chain after another. One would expect that this ordering would
401  // not matter since the scheduling in the backend of the compiler would
402  // take care of it, but apparently the scheduler fails to deliver optimal
403  // ILP with a naive ordering here.
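  // For illustration (hypothetical slices): with two selects whose sinkable
  // true-value slices are [a1, a2] and [b1, b2], the sunk instructions are
  // emitted in the interleaved order a1, b1, a2, b2 rather than
  // a1, a2, b1, b2, so the two independent chains can issue in parallel.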
404  SmallVector<Instruction *, 2> TrueSlicesInterleaved, FalseSlicesInterleaved;
405  for (StackSizeType IS = 0; IS < maxTrueSliceLen; ++IS) {
406  for (auto &S : TrueSlices) {
407  if (!S.empty()) {
408  TrueSlicesInterleaved.push_back(S.top());
409  S.pop();
410  }
411  }
412  }
413  for (StackSizeType IS = 0; IS < maxFalseSliceLen; ++IS) {
414  for (auto &S : FalseSlices) {
415  if (!S.empty()) {
416  FalseSlicesInterleaved.push_back(S.top());
417  S.pop();
418  }
419  }
420  }
421 
422  // We split the block containing the select(s) into two blocks.
423  SelectInst *SI = ASI.front();
424  SelectInst *LastSI = ASI.back();
425  BasicBlock *StartBlock = SI->getParent();
426  BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI));
427  BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
428  BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock).getFrequency());
429  // Delete the unconditional branch that was just created by the split.
430  StartBlock->getTerminator()->eraseFromParent();
431 
432  // Move any debug/pseudo instructions that were in-between the select
433  // group to the newly-created end block.
434  SmallVector<Instruction *, 2> DebugPseudoINS;
435  auto DIt = SI->getIterator();
436  while (&*DIt != LastSI) {
437  if (DIt->isDebugOrPseudoInst())
438  DebugPseudoINS.push_back(&*DIt);
439  DIt++;
440  }
441  for (auto *DI : DebugPseudoINS) {
442  DI->moveBefore(&*EndBlock->getFirstInsertionPt());
443  }
444 
445  // These are the new basic blocks for the conditional branch.
446  // At least one will become an actual new basic block.
447  BasicBlock *TrueBlock = nullptr, *FalseBlock = nullptr;
448  BranchInst *TrueBranch = nullptr, *FalseBranch = nullptr;
449  if (!TrueSlicesInterleaved.empty()) {
450  TrueBlock = BasicBlock::Create(LastSI->getContext(), "select.true.sink",
451  EndBlock->getParent(), EndBlock);
452  TrueBranch = BranchInst::Create(EndBlock, TrueBlock);
453  TrueBranch->setDebugLoc(LastSI->getDebugLoc());
454  for (Instruction *TrueInst : TrueSlicesInterleaved)
455  TrueInst->moveBefore(TrueBranch);
456  }
457  if (!FalseSlicesInterleaved.empty()) {
458  FalseBlock = BasicBlock::Create(LastSI->getContext(), "select.false.sink",
459  EndBlock->getParent(), EndBlock);
460  FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
461  FalseBranch->setDebugLoc(LastSI->getDebugLoc());
462  for (Instruction *FalseInst : FalseSlicesInterleaved)
463  FalseInst->moveBefore(FalseBranch);
464  }
465  // If there was nothing to sink, then arbitrarily choose the 'false' side
466  // for a new input value to the PHI.
467  if (TrueBlock == FalseBlock) {
468  assert(TrueBlock == nullptr &&
469  "Unexpected basic block transform while optimizing select");
470 
471  FalseBlock = BasicBlock::Create(SI->getContext(), "select.false",
472  EndBlock->getParent(), EndBlock);
473  auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
474  FalseBranch->setDebugLoc(SI->getDebugLoc());
475  }
476 
477  // Insert the real conditional branch based on the original condition.
478  // If we did not create a new block for one of the 'true' or 'false' paths
479  // of the condition, it means that side of the branch goes to the end block
480  // directly and the path originates from the start block from the point of
481  // view of the new PHI.
482  BasicBlock *TT, *FT;
483  if (TrueBlock == nullptr) {
484  TT = EndBlock;
485  FT = FalseBlock;
486  TrueBlock = StartBlock;
487  } else if (FalseBlock == nullptr) {
488  TT = TrueBlock;
489  FT = EndBlock;
490  FalseBlock = StartBlock;
491  } else {
492  TT = TrueBlock;
493  FT = FalseBlock;
494  }
495  IRBuilder<> IB(SI);
496  auto *CondFr =
497  IB.CreateFreeze(SI->getCondition(), SI->getName() + ".frozen");
498  IB.CreateCondBr(CondFr, TT, FT, SI);
499 
500  SmallPtrSet<const Instruction *, 2> INS;
501  INS.insert(ASI.begin(), ASI.end());
502  // Use reverse iterator because later select may use the value of the
503  // earlier select, and we need to propagate value through earlier select
504  // to get the PHI operand.
505  for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) {
506  SelectInst *SI = *It;
507  // The select itself is replaced with a PHI Node.
508  PHINode *PN = PHINode::Create(SI->getType(), 2, "", &EndBlock->front());
509  PN->takeName(SI);
510  PN->addIncoming(getTrueOrFalseValue(SI, true, INS), TrueBlock);
511  PN->addIncoming(getTrueOrFalseValue(SI, false, INS), FalseBlock);
512  PN->setDebugLoc(SI->getDebugLoc());
513 
514  SI->replaceAllUsesWith(PN);
515  SI->eraseFromParent();
516  INS.erase(SI);
517  ++NumSelectsConverted;
518  }
519  }
520 }
521 
522 static bool isSpecialSelect(SelectInst *SI) {
523  using namespace llvm::PatternMatch;
524 
525  // If the select is a logical-and/logical-or then it is better treated as an
526  // and/or by the backend.
527  if (match(SI, m_CombineOr(m_LogicalAnd(m_Value(), m_Value()),
528  m_LogicalOr(m_Value(), m_Value()))))
529  return true;
530 
531  return false;
532 }
533 
534 void SelectOptimize::collectSelectGroups(BasicBlock &BB,
535  SelectGroups &SIGroups) {
536  BasicBlock::iterator BBIt = BB.begin();
537  while (BBIt != BB.end()) {
538  Instruction *I = &*BBIt++;
539  if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
540  if (isSpecialSelect(SI))
541  continue;
542 
543  SelectGroup SIGroup;
544  SIGroup.push_back(SI);
545  while (BBIt != BB.end()) {
546  Instruction *NI = &*BBIt;
547  SelectInst *NSI = dyn_cast<SelectInst>(NI);
548  if (NSI && SI->getCondition() == NSI->getCondition()) {
549  SIGroup.push_back(NSI);
550  } else if (!NI->isDebugOrPseudoInst()) {
551  // Debug/pseudo instructions should be skipped and not prevent the
552  // formation of a select group.
553  break;
554  }
555  ++BBIt;
556  }
557 
558  // If the select type is not supported, no point optimizing it.
559  // Instruction selection will take care of it.
560  if (!isSelectKindSupported(SI))
561  continue;
562 
563  SIGroups.push_back(SIGroup);
564  }
565  }
566 }
567 
568 void SelectOptimize::findProfitableSIGroupsBase(SelectGroups &SIGroups,
569  SelectGroups &ProfSIGroups) {
570  for (SelectGroup &ASI : SIGroups) {
571  ++NumSelectOptAnalyzed;
572  if (isConvertToBranchProfitableBase(ASI))
573  ProfSIGroups.push_back(ASI);
574  }
575 }
576 
577 static void EmitAndPrintRemark(OptimizationRemarkEmitter *ORE,
578  DiagnosticInfoOptimizationBase &Rem) {
579  LLVM_DEBUG(dbgs() << Rem.getMsg() << "\n");
580  ORE->emit(Rem);
581 }
582 
583 void SelectOptimize::findProfitableSIGroupsInnerLoops(
584  const Loop *L, SelectGroups &SIGroups, SelectGroups &ProfSIGroups) {
585  NumSelectOptAnalyzed += SIGroups.size();
586  // For each select group in an inner-most loop,
587  // a branch is more preferable than a select/conditional-move if:
588  // i) conversion to branches for all the select groups of the loop satisfies
589  // loop-level heuristics including reducing the loop's critical path by
590  // some threshold (see SelectOptimize::checkLoopHeuristics); and
591  // ii) the total cost of the select group is cheaper with a branch compared
592  // to its predicated version. The cost is in terms of latency and the cost
593  // of a select group is the cost of its most expensive select instruction
594  // (assuming infinite resources and thus fully leveraging available ILP).
595 
596  DenseMap<const Instruction *, CostInfo> InstCostMap;
597  CostInfo LoopCost[2] = {{Scaled64::getZero(), Scaled64::getZero()},
598  {Scaled64::getZero(), Scaled64::getZero()}};
599  if (!computeLoopCosts(L, SIGroups, InstCostMap, LoopCost) ||
600  !checkLoopHeuristics(L, LoopCost)) {
601  return;
602  }
603 
604  for (SelectGroup &ASI : SIGroups) {
605  // Assuming infinite resources, the cost of a group of instructions is the
606  // cost of the most expensive instruction of the group.
607  Scaled64 SelectCost = Scaled64::getZero(), BranchCost = Scaled64::getZero();
608  for (SelectInst *SI : ASI) {
609  SelectCost = std::max(SelectCost, InstCostMap[SI].PredCost);
610  BranchCost = std::max(BranchCost, InstCostMap[SI].NonPredCost);
611  }
612  if (BranchCost < SelectCost) {
613  OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", ASI.front());
614  OR << "Profitable to convert to branch (loop analysis). BranchCost="
615  << BranchCost.toString() << ", SelectCost=" << SelectCost.toString()
616  << ". ";
617  EmitAndPrintRemark(ORE, OR);
618  ++NumSelectConvertedLoop;
619  ProfSIGroups.push_back(ASI);
620  } else {
621  OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", ASI.front());
622  ORmiss << "Select is more profitable (loop analysis). BranchCost="
623  << BranchCost.toString()
624  << ", SelectCost=" << SelectCost.toString() << ". ";
625  EmitAndPrintRemark(ORE, ORmiss);
626  }
627  }
628 }
629 
630 bool SelectOptimize::isConvertToBranchProfitableBase(
631  const SmallVector<SelectInst *, 2> &ASI) {
632  SelectInst *SI = ASI.front();
633  LLVM_DEBUG(dbgs() << "Analyzing select group containing " << *SI << "\n");
634  OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", SI);
635  OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", SI);
636 
637  // Skip cold basic blocks. Better to optimize for size for cold blocks.
638  if (PSI->isColdBlock(SI->getParent(), BFI.get())) {
639  ++NumSelectColdBB;
640  ORmiss << "Not converted to branch because of cold basic block. ";
641  EmitAndPrintRemark(ORE, ORmiss);
642  return false;
643  }
644 
645  // If unpredictable, branch form is less profitable.
646  if (SI->getMetadata(LLVMContext::MD_unpredictable)) {
647  ++NumSelectUnPred;
648  ORmiss << "Not converted to branch because of unpredictable branch. ";
649  EmitAndPrintRemark(ORE, ORmiss);
650  return false;
651  }
652 
653  // If highly predictable, branch form is more profitable, unless a
654  // predictable select is inexpensive in the target architecture.
655  if (isSelectHighlyPredictable(SI) && TLI->isPredictableSelectExpensive()) {
656  ++NumSelectConvertedHighPred;
657  OR << "Converted to branch because of highly predictable branch. ";
658  EmitAndPrintRemark(ORE, OR);
659  return true;
660  }
661 
662  // Look for expensive instructions in the cold operand's (if any) dependence
663  // slice of any of the selects in the group.
664  if (hasExpensiveColdOperand(ASI)) {
665  ++NumSelectConvertedExpColdOperand;
666  OR << "Converted to branch because of expensive cold operand.";
667  EmitAndPrintRemark(ORE, OR);
668  return true;
669  }
670 
671  ORmiss << "Not profitable to convert to branch (base heuristic).";
672  EmitAndPrintRemark(ORE, ORmiss);
673  return false;
674 }
675 
676 static InstructionCost divideNearest(const InstructionCost &Numerator,
677  uint64_t Denominator) {
678  return (Numerator + (Denominator / 2)) / Denominator;
679 }
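// For illustration of the rounding above (hypothetical values):
// divideNearest(7, 2) = (7 + 1) / 2 = 4, where plain integer division
// would truncate to 3.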
680 
681 bool SelectOptimize::hasExpensiveColdOperand(
682  const SmallVector<SelectInst *, 2> &ASI) {
683  bool ColdOperand = false;
684  uint64_t TrueWeight, FalseWeight, TotalWeight;
685  if (extractBranchWeights(*ASI.front(), TrueWeight, FalseWeight)) {
686  uint64_t MinWeight = std::min(TrueWeight, FalseWeight);
687  TotalWeight = TrueWeight + FalseWeight;
688  // Is there a path with frequency <ColdOperandThreshold% (default:20%) ?
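  // For illustration (hypothetical weights): TrueWeight=90, FalseWeight=10
  // gives MinWeight=10 and TotalWeight=100; with the default threshold of 20,
  // 100 * 20 > 100 * 10 holds, so the false path is treated as cold.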
689  ColdOperand = TotalWeight * ColdOperandThreshold > 100 * MinWeight;
690  } else if (PSI->hasProfileSummary()) {
691  OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", ASI.front());
692  ORmiss << "Profile data available but missing branch-weights metadata for "
693  "select instruction. ";
694  EmitAndPrintRemark(ORE, ORmiss);
695  }
696  if (!ColdOperand)
697  return false;
698  // Check if the cold path's dependence slice is expensive for any of the
699  // selects of the group.
700  for (SelectInst *SI : ASI) {
701  Instruction *ColdI = nullptr;
702  uint64_t HotWeight;
703  if (TrueWeight < FalseWeight) {
704  ColdI = dyn_cast<Instruction>(SI->getTrueValue());
705  HotWeight = FalseWeight;
706  } else {
707  ColdI = dyn_cast<Instruction>(SI->getFalseValue());
708  HotWeight = TrueWeight;
709  }
710  if (ColdI) {
711  std::stack<Instruction *> ColdSlice;
712  getExclBackwardsSlice(ColdI, ColdSlice, SI);
713  InstructionCost SliceCost = 0;
714  while (!ColdSlice.empty()) {
715  SliceCost += TTI->getInstructionCost(ColdSlice.top(),
716  TargetTransformInfo::TCK_Latency);
717  ColdSlice.pop();
718  }
719  // The colder the cold value operand of the select is, the more expensive
720  // the cmov becomes, since the cold value operand must be computed on every
721  // execution. Thus, the colder the cold operand is, the more its cost counts.
722  // Get nearest integer cost adjusted for coldness.
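  // For illustration (hypothetical values): SliceCost=5, HotWeight=90 and
  // TotalWeight=100 give AdjSliceCost = round(450 / 100) = 5, which with the
  // default multiplier of 1 meets the TCC_Expensive (4) bar and triggers
  // conversion.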
723  InstructionCost AdjSliceCost =
724  divideNearest(SliceCost * HotWeight, TotalWeight);
725  if (AdjSliceCost >=
726  ColdOperandMaxCostMultiplier * TargetTransformInfo::TCC_Expensive)
727  return true;
728  }
729  }
730  return false;
731 }
732 
733 // Check if it is safe to move LoadI next to the SI.
734 // Conservatively assume it is safe only if there is no instruction
735 // modifying memory in-between the load and the select instruction.
736 static bool isSafeToSinkLoad(Instruction *LoadI, Instruction *SI) {
737  // Assume loads from different basic blocks are unsafe to move.
738  if (LoadI->getParent() != SI->getParent())
739  return false;
740  auto It = LoadI->getIterator();
741  while (&*It != SI) {
742  if (It->mayWriteToMemory())
743  return false;
744  It++;
745  }
746  return true;
747 }
748 
749 // For a given source instruction, collect its backwards dependence slice
750 // consisting of instructions exclusively computed for the purpose of producing
751 // the operands of the source instruction. As an approximation
752 // (sufficiently-accurate in practice), we populate this set with the
753 // instructions of the backwards dependence slice that only have one-use and
754  // form a one-use chain that leads to the source instruction.
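// For illustration (hypothetical IR): if the select's true value
// %op = mul i32 %t, %u has the select as its only user and %t is an add
// used only by %op, the slice of %op is {%op, %t}; %u, which has other
// users, is shared work and is not sunk.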
755 void SelectOptimize::getExclBackwardsSlice(Instruction *I,
756  std::stack<Instruction *> &Slice,
757  Instruction *SI, bool ForSinking) {
758  SmallPtrSet<Instruction *, 2> Visited;
759  std::queue<Instruction *> Worklist;
760  Worklist.push(I);
761  while (!Worklist.empty()) {
762  Instruction *II = Worklist.front();
763  Worklist.pop();
764 
765  // Avoid cycles.
766  if (!Visited.insert(II).second)
767  continue;
768 
769  if (!II->hasOneUse())
770  continue;
771 
772  // Cannot soundly sink instructions with side-effects.
773  // Terminator or phi instructions cannot be sunk.
774  // Avoid sinking other select instructions (should be handled separately).
775  if (ForSinking && (II->isTerminator() || II->mayHaveSideEffects() ||
776  isa<SelectInst>(II) || isa<PHINode>(II)))
777  continue;
778 
779  // Avoid sinking loads in order not to skip state-modifying instructions,
780  // that may alias with the loaded address.
781  // Only allow sinking of loads within the same basic block that are
782  // conservatively proven to be safe.
783  if (ForSinking && II->mayReadFromMemory() && !isSafeToSinkLoad(II, SI))
784  continue;
785 
786  // Avoid considering instructions with less frequency than the source
787  // instruction (i.e., avoid colder code regions of the dependence slice).
788  if (BFI->getBlockFreq(II->getParent()) < BFI->getBlockFreq(I->getParent()))
789  continue;
790 
791  // Eligible one-use instruction added to the dependence slice.
792  Slice.push(II);
793 
794  // Explore all the operands of the current instruction to expand the slice.
795  for (unsigned k = 0; k < II->getNumOperands(); ++k)
796  if (auto *OpI = dyn_cast<Instruction>(II->getOperand(k)))
797  Worklist.push(OpI);
798  }
799 }
800 
801 bool SelectOptimize::isSelectHighlyPredictable(const SelectInst *SI) {
802  uint64_t TrueWeight, FalseWeight;
803  if (extractBranchWeights(*SI, TrueWeight, FalseWeight)) {
804  uint64_t Max = std::max(TrueWeight, FalseWeight);
805  uint64_t Sum = TrueWeight + FalseWeight;
806  if (Sum != 0) {
807  auto Probability = BranchProbability::getBranchProbability(Max, Sum);
808  if (Probability > TTI->getPredictableBranchThreshold())
809  return true;
810  }
811  }
812  return false;
813 }
814 
815 bool SelectOptimize::checkLoopHeuristics(const Loop *L,
816  const CostInfo LoopCost[2]) {
817  // Loop-level checks to determine if a non-predicated version (with branches)
818  // of the loop is more profitable than its predicated version.
819 
820  if (DisableLoopLevelHeuristics)
821  return true;
822 
823  OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti",
824  L->getHeader()->getFirstNonPHI());
825 
826  if (LoopCost[0].NonPredCost > LoopCost[0].PredCost ||
827  LoopCost[1].NonPredCost >= LoopCost[1].PredCost) {
828  ORmissL << "No select conversion in the loop due to no reduction of loop's "
829  "critical path. ";
830  EmitAndPrintRemark(ORE, ORmissL);
831  return false;
832  }
833 
834  Scaled64 Gain[2] = {LoopCost[0].PredCost - LoopCost[0].NonPredCost,
835  LoopCost[1].PredCost - LoopCost[1].NonPredCost};
836 
837  // Profitably converting to branches needs to reduce the loop's critical path
838  // by at least some threshold (absolute gain of GainCycleThreshold cycles and
839  // relative gain of 12.5%).
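  // For illustration (hypothetical costs): with the default thresholds
  // (GainCycleThreshold=4, GainRelativeThreshold=8), a second-iteration
  // critical path of PredCost=40 cycles requires Gain[1] >= 4 cycles and
  // Gain[1] * 8 >= 40, i.e. a reduction of at least 5 cycles (12.5%).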
840  if (Gain[1] < Scaled64::get(GainCycleThreshold) ||
841  Gain[1] * Scaled64::get(GainRelativeThreshold) < LoopCost[1].PredCost) {
842  Scaled64 RelativeGain = Scaled64::get(100) * Gain[1] / LoopCost[1].PredCost;
843  ORmissL << "No select conversion in the loop due to small reduction of "
844  "loop's critical path. Gain="
845  << Gain[1].toString()
846  << ", RelativeGain=" << RelativeGain.toString() << "%. ";
847  EmitAndPrintRemark(ORE, ORmissL);
848  return false;
849  }
850 
851  // If the loop's critical path involves loop-carried dependences, the gradient
852  // of the gain needs to be at least GainGradientThreshold% (defaults to 25%).
853  // This check ensures that the latency reduction for the loop's critical path
854  // keeps decreasing with sufficient rate beyond the two analyzed loop
855  // iterations.
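  // For illustration (hypothetical costs): Gain[0]=6 and Gain[1]=10 with
  // PredCost[0]=30 and PredCost[1]=58 give a gradient of
  // 100 * (10 - 6) / (58 - 30) = ~14%, below the default 25% threshold, so
  // the conversion is rejected even though the absolute gain grew.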
856  if (Gain[1] > Gain[0]) {
857  Scaled64 GradientGain = Scaled64::get(100) * (Gain[1] - Gain[0]) /
858  (LoopCost[1].PredCost - LoopCost[0].PredCost);
859  if (GradientGain < Scaled64::get(GainGradientThreshold)) {
860  ORmissL << "No select conversion in the loop due to small gradient gain. "
861  "GradientGain="
862  << GradientGain.toString() << "%. ";
863  EmitAndPrintRemark(ORE, ORmissL);
864  return false;
865  }
866  }
867  // If the gain decreases it is not profitable to convert.
868  else if (Gain[1] < Gain[0]) {
869  ORmissL
870  << "No select conversion in the loop due to negative gradient gain. ";
871  EmitAndPrintRemark(ORE, ORmissL);
872  return false;
873  }
874 
875  // Non-predicated version of the loop is more profitable than its
876  // predicated version.
877  return true;
878 }
879 
880 // Computes instruction and loop-critical-path costs for both the predicated
881 // and non-predicated version of the given loop.
882 // Returns false if unable to compute these costs due to invalid cost of loop
883 // instruction(s).
884 bool SelectOptimize::computeLoopCosts(
885  const Loop *L, const SelectGroups &SIGroups,
886  DenseMap<const Instruction *, CostInfo> &InstCostMap, CostInfo *LoopCost) {
887  LLVM_DEBUG(dbgs() << "Calculating Latency / IPredCost / INonPredCost of loop "
888  << L->getHeader()->getName() << "\n");
889  const auto &SIset = getSIset(SIGroups);
890  // Compute instruction and loop-critical-path costs across two iterations for
891  // both predicated and non-predicated version.
892  const unsigned Iterations = 2;
893  for (unsigned Iter = 0; Iter < Iterations; ++Iter) {
894  // Cost of the loop's critical path.
895  CostInfo &MaxCost = LoopCost[Iter];
896  for (BasicBlock *BB : L->getBlocks()) {
897  for (const Instruction &I : *BB) {
898  if (I.isDebugOrPseudoInst())
899  continue;
900  // Compute the predicated and non-predicated cost of the instruction.
901  Scaled64 IPredCost = Scaled64::getZero(),
902  INonPredCost = Scaled64::getZero();
903 
904  // Assume infinite resources that allow to fully exploit the available
905  // instruction-level parallelism.
906  // InstCost = InstLatency + max(Op1Cost, Op2Cost, … OpNCost)
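  // For illustration (hypothetical latencies): an add with latency 1 whose
  // operands were computed at costs 3 and 5 gets cost 1 + max(3, 5) = 6,
  // modeling the length of the dependence chain rather than the sum of all
  // work.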
907  for (const Use &U : I.operands()) {
908  auto UI = dyn_cast<Instruction>(U.get());
909  if (!UI)
910  continue;
911  if (InstCostMap.count(UI)) {
912  IPredCost = std::max(IPredCost, InstCostMap[UI].PredCost);
913  INonPredCost = std::max(INonPredCost, InstCostMap[UI].NonPredCost);
914  }
915  }
916  auto ILatency = computeInstCost(&I);
917  if (!ILatency) {
918  OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti", &I);
919  ORmissL << "Invalid instruction cost preventing analysis and "
920  "optimization of the inner-most loop containing this "
921  "instruction. ";
922  EmitAndPrintRemark(ORE, ORmissL);
923  return false;
924  }
925  IPredCost += Scaled64::get(ILatency.value());
926  INonPredCost += Scaled64::get(ILatency.value());
927 
928  // For a select that can be converted to branch,
929  // compute its cost as a branch (non-predicated cost).
930  //
931  // BranchCost = PredictedPathCost + MispredictCost
932  // PredictedPathCost = TrueOpCost * TrueProb + FalseOpCost * FalseProb
933  // MispredictCost = max(MispredictPenalty, CondCost) * MispredictRate
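  // For illustration (hypothetical numbers): TrueOpCost=2, FalseOpCost=6 with
  // 3:1 branch weights gives PredictedPathCost = (2*3 + 6*1) / 4 = 3; with a
  // 14-cycle mispredict penalty, CondCost=0 and the default 25% rate,
  // MispredictCost = 14 * 25 / 100 = 3.5, so BranchCost = 6.5.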
934  if (SIset.contains(&I)) {
935  auto SI = dyn_cast<SelectInst>(&I);
936 
937  Scaled64 TrueOpCost = Scaled64::getZero(),
938  FalseOpCost = Scaled64::getZero();
939  if (auto *TI = dyn_cast<Instruction>(SI->getTrueValue()))
940  if (InstCostMap.count(TI))
941  TrueOpCost = InstCostMap[TI].NonPredCost;
942  if (auto *FI = dyn_cast<Instruction>(SI->getFalseValue()))
943  if (InstCostMap.count(FI))
944  FalseOpCost = InstCostMap[FI].NonPredCost;
945  Scaled64 PredictedPathCost =
946  getPredictedPathCost(TrueOpCost, FalseOpCost, SI);
947 
948  Scaled64 CondCost = Scaled64::getZero();
949  if (auto *CI = dyn_cast<Instruction>(SI->getCondition()))
950  if (InstCostMap.count(CI))
951  CondCost = InstCostMap[CI].NonPredCost;
952  Scaled64 MispredictCost = getMispredictionCost(SI, CondCost);
953 
954  INonPredCost = PredictedPathCost + MispredictCost;
955  }
956  LLVM_DEBUG(dbgs() << " " << ILatency << "/" << IPredCost << "/"
957  << INonPredCost << " for " << I << "\n");
958 
959  InstCostMap[&I] = {IPredCost, INonPredCost};
960  MaxCost.PredCost = std::max(MaxCost.PredCost, IPredCost);
961  MaxCost.NonPredCost = std::max(MaxCost.NonPredCost, INonPredCost);
962  }
963  }
964  LLVM_DEBUG(dbgs() << "Iteration " << Iter + 1
965  << " MaxCost = " << MaxCost.PredCost << " "
966  << MaxCost.NonPredCost << "\n");
967  }
968  return true;
969 }
970 
971 SmallPtrSet<const Instruction *, 2>
972 SelectOptimize::getSIset(const SelectGroups &SIGroups) {
973  SmallPtrSet<const Instruction *, 2> SIset;
974  for (const SelectGroup &ASI : SIGroups)
975  for (const SelectInst *SI : ASI)
976  SIset.insert(SI);
977  return SIset;
978 }
979 
980 Optional<uint64_t> SelectOptimize::computeInstCost(const Instruction *I) {
981  InstructionCost ICost =
982  TTI->getInstructionCost(I, TargetTransformInfo::TCK_Latency);
983  if (auto OC = ICost.getValue())
984  return Optional<uint64_t>(*OC);
985  return std::nullopt;
986 }
987 
988 SelectOptimize::Scaled64
989 SelectOptimize::getMispredictionCost(const SelectInst *SI,
990  const Scaled64 CondCost) {
991  uint64_t MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
992 
993  // Account for the default misprediction rate when using a branch
994  // (conservatively set to 25% by default).
995  uint64_t MispredictRate = MispredictDefaultRate;
996  // If the select condition is obviously predictable, then the misprediction
997  // rate is zero.
998  if (isSelectHighlyPredictable(SI))
999  MispredictRate = 0;
1000 
1001  // CondCost is included to account for cases where the computation of the
1002  // condition is part of a long dependence chain (potentially loop-carried)
1003  // that would delay detection of a misprediction and increase its cost.
1004  Scaled64 MispredictCost =
1005  std::max(Scaled64::get(MispredictPenalty), CondCost) *
1006  Scaled64::get(MispredictRate);
1007  MispredictCost /= Scaled64::get(100);
1008 
1009  return MispredictCost;
1010 }
1011 
1012 // Returns the cost of a branch when the prediction is correct.
1013 // TrueCost * TrueProbability + FalseCost * FalseProbability.
1014 SelectOptimize::Scaled64
1015 SelectOptimize::getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
1016  const SelectInst *SI) {
1017  Scaled64 PredPathCost;
1018  uint64_t TrueWeight, FalseWeight;
1019  if (extractBranchWeights(*SI, TrueWeight, FalseWeight)) {
1020  uint64_t SumWeight = TrueWeight + FalseWeight;
1021  if (SumWeight != 0) {
1022  PredPathCost = TrueCost * Scaled64::get(TrueWeight) +
1023  FalseCost * Scaled64::get(FalseWeight);
1024  PredPathCost /= Scaled64::get(SumWeight);
1025  return PredPathCost;
1026  }
1027  }
1028  // Without branch weight metadata, we assume 75% for the one path and 25% for
1029  // the other, and pick the result with the biggest cost.
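  // For illustration (hypothetical costs): TrueCost=4 and FalseCost=2 give
  // max(4*3 + 2, 2*3 + 4) / 4 = 14 / 4 = 3.5, i.e. the heavier path is
  // assumed to be taken 75% of the time.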
1030  PredPathCost = std::max(TrueCost * Scaled64::get(3) + FalseCost,
1031  FalseCost * Scaled64::get(3) + TrueCost);
1032  PredPathCost /= Scaled64::get(4);
1033  return PredPathCost;
1034 }
1035 
1036 bool SelectOptimize::isSelectKindSupported(SelectInst *SI) {
1037  bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1);
1038  if (VectorCond)
1039  return false;
1040  TargetLowering::SelectSupportKind SelectKind;
1041  if (SI->getType()->isVectorTy())
1042  SelectKind = TargetLowering::ScalarCondVectorVal;
1043  else
1044  SelectKind = TargetLowering::ScalarValSelect;
1045  return TLI->isSelectSupported(SelectKind);
1046 }