LLVM  14.0.0git
HexagonLoopIdiomRecognition.cpp
Go to the documentation of this file.
1 //===- HexagonLoopIdiomRecognition.cpp ------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
10 #include "llvm/ADT/APInt.h"
11 #include "llvm/ADT/DenseMap.h"
12 #include "llvm/ADT/SetVector.h"
13 #include "llvm/ADT/SmallPtrSet.h"
14 #include "llvm/ADT/SmallSet.h"
15 #include "llvm/ADT/SmallVector.h"
16 #include "llvm/ADT/StringRef.h"
17 #include "llvm/ADT/Triple.h"
21 #include "llvm/Analysis/LoopInfo.h"
22 #include "llvm/Analysis/LoopPass.h"
28 #include "llvm/IR/Attributes.h"
29 #include "llvm/IR/BasicBlock.h"
30 #include "llvm/IR/Constant.h"
31 #include "llvm/IR/Constants.h"
32 #include "llvm/IR/DataLayout.h"
33 #include "llvm/IR/DebugLoc.h"
34 #include "llvm/IR/DerivedTypes.h"
35 #include "llvm/IR/Dominators.h"
36 #include "llvm/IR/Function.h"
37 #include "llvm/IR/IRBuilder.h"
38 #include "llvm/IR/InstrTypes.h"
39 #include "llvm/IR/Instruction.h"
40 #include "llvm/IR/Instructions.h"
41 #include "llvm/IR/IntrinsicInst.h"
42 #include "llvm/IR/Intrinsics.h"
43 #include "llvm/IR/IntrinsicsHexagon.h"
44 #include "llvm/IR/Module.h"
45 #include "llvm/IR/PassManager.h"
46 #include "llvm/IR/PatternMatch.h"
47 #include "llvm/IR/Type.h"
48 #include "llvm/IR/User.h"
49 #include "llvm/IR/Value.h"
50 #include "llvm/InitializePasses.h"
51 #include "llvm/Pass.h"
52 #include "llvm/Support/Casting.h"
54 #include "llvm/Support/Compiler.h"
55 #include "llvm/Support/Debug.h"
57 #include "llvm/Support/KnownBits.h"
59 #include "llvm/Transforms/Scalar.h"
60 #include "llvm/Transforms/Utils.h"
63 #include <algorithm>
64 #include <array>
65 #include <cassert>
66 #include <cstdint>
67 #include <cstdlib>
68 #include <deque>
69 #include <functional>
70 #include <iterator>
71 #include <map>
72 #include <set>
73 #include <utility>
74 #include <vector>
75 
76 #define DEBUG_TYPE "hexagon-lir"
77 
78 using namespace llvm;
79 
80 static cl::opt<bool> DisableMemcpyIdiom("disable-memcpy-idiom",
81  cl::Hidden, cl::init(false),
82  cl::desc("Disable generation of memcpy in loop idiom recognition"));
83 
84 static cl::opt<bool> DisableMemmoveIdiom("disable-memmove-idiom",
85  cl::Hidden, cl::init(false),
86  cl::desc("Disable generation of memmove in loop idiom recognition"));
87 
88 static cl::opt<unsigned> RuntimeMemSizeThreshold("runtime-mem-idiom-threshold",
89  cl::Hidden, cl::init(0), cl::desc("Threshold (in bytes) for the runtime "
90  "check guarding the memmove."));
91 
93  "compile-time-mem-idiom-threshold", cl::Hidden, cl::init(64),
94  cl::desc("Threshold (in bytes) to perform the transformation, if the "
95  "runtime loop count (mem transfer size) is known at compile-time."));
96 
97 static cl::opt<bool> OnlyNonNestedMemmove("only-nonnested-memmove-idiom",
98  cl::Hidden, cl::init(true),
99  cl::desc("Only enable generating memmove in non-nested loops"));
100 
102  "disable-hexagon-volatile-memcpy", cl::Hidden, cl::init(false),
103  cl::desc("Enable Hexagon-specific memcpy for volatile destination."));
104 
105 static cl::opt<unsigned> SimplifyLimit("hlir-simplify-limit", cl::init(10000),
106  cl::Hidden, cl::desc("Maximum number of simplification steps in HLIR"));
107 
108 static const char *HexagonVolatileMemcpyName
109  = "hexagon_memcpy_forward_vp4cp4n2";
110 
111 
112 namespace llvm {
113 
116 
117 } // end namespace llvm
118 
119 namespace {
120 
121 class HexagonLoopIdiomRecognize {
122 public:
123  explicit HexagonLoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
124  LoopInfo *LF, const TargetLibraryInfo *TLI,
125  ScalarEvolution *SE)
126  : AA(AA), DT(DT), LF(LF), TLI(TLI), SE(SE) {}
127 
128  bool run(Loop *L);
129 
130 private:
131  int getSCEVStride(const SCEVAddRecExpr *StoreEv);
132  bool isLegalStore(Loop *CurLoop, StoreInst *SI);
133  void collectStores(Loop *CurLoop, BasicBlock *BB,
135  bool processCopyingStore(Loop *CurLoop, StoreInst *SI, const SCEV *BECount);
136  bool coverLoop(Loop *L, SmallVectorImpl<Instruction *> &Insts) const;
137  bool runOnLoopBlock(Loop *CurLoop, BasicBlock *BB, const SCEV *BECount,
138  SmallVectorImpl<BasicBlock *> &ExitBlocks);
139  bool runOnCountableLoop(Loop *L);
140 
141  AliasAnalysis *AA;
142  const DataLayout *DL;
143  DominatorTree *DT;
144  LoopInfo *LF;
145  const TargetLibraryInfo *TLI;
146  ScalarEvolution *SE;
147  bool HasMemcpy, HasMemmove;
148 };
149 
150 class HexagonLoopIdiomRecognizeLegacyPass : public LoopPass {
151 public:
152  static char ID;
153 
154  explicit HexagonLoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
157  }
158 
159  StringRef getPassName() const override {
160  return "Recognize Hexagon-specific loop idioms";
161  }
162 
163  void getAnalysisUsage(AnalysisUsage &AU) const override {
172  }
173 
174  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
175 };
176 
177 struct Simplifier {
178  struct Rule {
179  using FuncType = std::function<Value *(Instruction *, LLVMContext &)>;
180  Rule(StringRef N, FuncType F) : Name(N), Fn(F) {}
181  StringRef Name; // For debugging.
182  FuncType Fn;
183  };
184 
185  void addRule(StringRef N, const Rule::FuncType &F) {
186  Rules.push_back(Rule(N, F));
187  }
188 
189 private:
190  struct WorkListType {
191  WorkListType() = default;
192 
193  void push_back(Value *V) {
194  // Do not push back duplicates.
195  if (!S.count(V)) {
196  Q.push_back(V);
197  S.insert(V);
198  }
199  }
200 
201  Value *pop_front_val() {
202  Value *V = Q.front();
203  Q.pop_front();
204  S.erase(V);
205  return V;
206  }
207 
208  bool empty() const { return Q.empty(); }
209 
210  private:
211  std::deque<Value *> Q;
212  std::set<Value *> S;
213  };
214 
215  using ValueSetType = std::set<Value *>;
216 
217  std::vector<Rule> Rules;
218 
219 public:
220  struct Context {
221  using ValueMapType = DenseMap<Value *, Value *>;
222 
223  Value *Root;
224  ValueSetType Used; // The set of all cloned values used by Root.
225  ValueSetType Clones; // The set of all cloned values.
226  LLVMContext &Ctx;
227 
228  Context(Instruction *Exp)
229  : Ctx(Exp->getParent()->getParent()->getContext()) {
230  initialize(Exp);
231  }
232 
233  ~Context() { cleanup(); }
234 
235  void print(raw_ostream &OS, const Value *V) const;
236  Value *materialize(BasicBlock *B, BasicBlock::iterator At);
237 
238  private:
239  friend struct Simplifier;
240 
241  void initialize(Instruction *Exp);
242  void cleanup();
243 
244  template <typename FuncT> void traverse(Value *V, FuncT F);
245  void record(Value *V);
246  void use(Value *V);
247  void unuse(Value *V);
248 
249  bool equal(const Instruction *I, const Instruction *J) const;
250  Value *find(Value *Tree, Value *Sub) const;
251  Value *subst(Value *Tree, Value *OldV, Value *NewV);
252  void replace(Value *OldV, Value *NewV);
254  };
255 
256  Value *simplify(Context &C);
257 };
258 
259  struct PE {
260  PE(const Simplifier::Context &c, Value *v = nullptr) : C(c), V(v) {}
261 
262  const Simplifier::Context &C;
263  const Value *V;
264  };
265 
267  raw_ostream &operator<<(raw_ostream &OS, const PE &P) {
268  P.C.print(OS, P.V ? P.V : P.C.Root);
269  return OS;
270  }
271 
272 } // end anonymous namespace
273 
275 
276 INITIALIZE_PASS_BEGIN(HexagonLoopIdiomRecognizeLegacyPass, "hexagon-loop-idiom",
277  "Recognize Hexagon-specific loop idioms", false, false)
279 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
280 INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
285 INITIALIZE_PASS_END(HexagonLoopIdiomRecognizeLegacyPass, "hexagon-loop-idiom",
286  "Recognize Hexagon-specific loop idioms", false, false)
287 
288 template <typename FuncT>
289 void Simplifier::Context::traverse(Value *V, FuncT F) {
290  WorkListType Q;
291  Q.push_back(V);
292 
293  while (!Q.empty()) {
294  Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
295  if (!U || U->getParent())
296  continue;
297  if (!F(U))
298  continue;
299  for (Value *Op : U->operands())
300  Q.push_back(Op);
301  }
302 }
303 
304 void Simplifier::Context::print(raw_ostream &OS, const Value *V) const {
305  const auto *U = dyn_cast<const Instruction>(V);
306  if (!U) {
307  OS << V << '(' << *V << ')';
308  return;
309  }
310 
311  if (U->getParent()) {
312  OS << U << '(';
313  U->printAsOperand(OS, true);
314  OS << ')';
315  return;
316  }
317 
318  unsigned N = U->getNumOperands();
319  if (N != 0)
320  OS << U << '(';
321  OS << U->getOpcodeName();
322  for (const Value *Op : U->operands()) {
323  OS << ' ';
324  print(OS, Op);
325  }
326  if (N != 0)
327  OS << ')';
328 }
329 
331  // Perform a deep clone of the expression, set Root to the root
332  // of the clone, and build a map from the cloned values to the
333  // original ones.
334  ValueMapType M;
335  BasicBlock *Block = Exp->getParent();
336  WorkListType Q;
337  Q.push_back(Exp);
338 
339  while (!Q.empty()) {
340  Value *V = Q.pop_front_val();
341  if (M.find(V) != M.end())
342  continue;
343  if (Instruction *U = dyn_cast<Instruction>(V)) {
344  if (isa<PHINode>(U) || U->getParent() != Block)
345  continue;
346  for (Value *Op : U->operands())
347  Q.push_back(Op);
348  M.insert({U, U->clone()});
349  }
350  }
351 
352  for (std::pair<Value*,Value*> P : M) {
353  Instruction *U = cast<Instruction>(P.second);
354  for (unsigned i = 0, n = U->getNumOperands(); i != n; ++i) {
355  auto F = M.find(U->getOperand(i));
356  if (F != M.end())
357  U->setOperand(i, F->second);
358  }
359  }
360 
361  auto R = M.find(Exp);
362  assert(R != M.end());
363  Root = R->second;
364 
365  record(Root);
366  use(Root);
367 }
368 
369 void Simplifier::Context::record(Value *V) {
370  auto Record = [this](Instruction *U) -> bool {
371  Clones.insert(U);
372  return true;
373  };
374  traverse(V, Record);
375 }
376 
378  auto Use = [this](Instruction *U) -> bool {
379  Used.insert(U);
380  return true;
381  };
382  traverse(V, Use);
383 }
384 
385 void Simplifier::Context::unuse(Value *V) {
386  if (!isa<Instruction>(V) || cast<Instruction>(V)->getParent() != nullptr)
387  return;
388 
389  auto Unuse = [this](Instruction *U) -> bool {
390  if (!U->use_empty())
391  return false;
392  Used.erase(U);
393  return true;
394  };
395  traverse(V, Unuse);
396 }
397 
398 Value *Simplifier::Context::subst(Value *Tree, Value *OldV, Value *NewV) {
399  if (Tree == OldV)
400  return NewV;
401  if (OldV == NewV)
402  return Tree;
403 
404  WorkListType Q;
405  Q.push_back(Tree);
406  while (!Q.empty()) {
407  Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
408  // If U is not an instruction, or it's not a clone, skip it.
409  if (!U || U->getParent())
410  continue;
411  for (unsigned i = 0, n = U->getNumOperands(); i != n; ++i) {
412  Value *Op = U->getOperand(i);
413  if (Op == OldV) {
414  U->setOperand(i, NewV);
415  unuse(OldV);
416  } else {
417  Q.push_back(Op);
418  }
419  }
420  }
421  return Tree;
422 }
423 
424 void Simplifier::Context::replace(Value *OldV, Value *NewV) {
425  if (Root == OldV) {
426  Root = NewV;
427  use(Root);
428  return;
429  }
430 
431  // NewV may be a complex tree that has just been created by one of the
432  // transformation rules. We need to make sure that it is commoned with
433  // the existing Root to the maximum extent possible.
434  // Identify all subtrees of NewV (including NewV itself) that have
435  // equivalent counterparts in Root, and replace those subtrees with
436  // these counterparts.
437  WorkListType Q;
438  Q.push_back(NewV);
439  while (!Q.empty()) {
440  Value *V = Q.pop_front_val();
441  Instruction *U = dyn_cast<Instruction>(V);
442  if (!U || U->getParent())
443  continue;
444  if (Value *DupV = find(Root, V)) {
445  if (DupV != V)
446  NewV = subst(NewV, V, DupV);
447  } else {
448  for (Value *Op : U->operands())
449  Q.push_back(Op);
450  }
451  }
452 
453  // Now, simply replace OldV with NewV in Root.
454  Root = subst(Root, OldV, NewV);
455  use(Root);
456 }
457 
459  for (Value *V : Clones) {
460  Instruction *U = cast<Instruction>(V);
461  if (!U->getParent())
462  U->dropAllReferences();
463  }
464 
465  for (Value *V : Clones) {
466  Instruction *U = cast<Instruction>(V);
467  if (!U->getParent())
468  U->deleteValue();
469  }
470 }
471 
473  const Instruction *J) const {
474  if (I == J)
475  return true;
476  if (!I->isSameOperationAs(J))
477  return false;
478  if (isa<PHINode>(I))
479  return I->isIdenticalTo(J);
480 
481  for (unsigned i = 0, n = I->getNumOperands(); i != n; ++i) {
482  Value *OpI = I->getOperand(i), *OpJ = J->getOperand(i);
483  if (OpI == OpJ)
484  continue;
485  auto *InI = dyn_cast<const Instruction>(OpI);
486  auto *InJ = dyn_cast<const Instruction>(OpJ);
487  if (InI && InJ) {
488  if (!equal(InI, InJ))
489  return false;
490  } else if (InI != InJ || !InI)
491  return false;
492  }
493  return true;
494 }
495 
496 Value *Simplifier::Context::find(Value *Tree, Value *Sub) const {
497  Instruction *SubI = dyn_cast<Instruction>(Sub);
498  WorkListType Q;
499  Q.push_back(Tree);
500 
501  while (!Q.empty()) {
502  Value *V = Q.pop_front_val();
503  if (V == Sub)
504  return V;
505  Instruction *U = dyn_cast<Instruction>(V);
506  if (!U || U->getParent())
507  continue;
508  if (SubI && equal(SubI, U))
509  return U;
510  assert(!isa<PHINode>(U));
511  for (Value *Op : U->operands())
512  Q.push_back(Op);
513  }
514  return nullptr;
515 }
516 
519  if (I->getParent())
520  return;
521 
522  for (Value *Op : I->operands()) {
523  if (Instruction *OpI = dyn_cast<Instruction>(Op))
524  link(OpI, B, At);
525  }
526 
527  B->getInstList().insert(At, I);
528 }
529 
530 Value *Simplifier::Context::materialize(BasicBlock *B,
532  if (Instruction *RootI = dyn_cast<Instruction>(Root))
533  link(RootI, B, At);
534  return Root;
535 }
536 
538  WorkListType Q;
539  Q.push_back(C.Root);
540  unsigned Count = 0;
541  const unsigned Limit = SimplifyLimit;
542 
543  while (!Q.empty()) {
544  if (Count++ >= Limit)
545  break;
546  Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
547  if (!U || U->getParent() || !C.Used.count(U))
548  continue;
549  bool Changed = false;
550  for (Rule &R : Rules) {
551  Value *W = R.Fn(U, C.Ctx);
552  if (!W)
553  continue;
554  Changed = true;
555  C.record(W);
556  C.replace(U, W);
557  Q.push_back(C.Root);
558  break;
559  }
560  if (!Changed) {
561  for (Value *Op : U->operands())
562  Q.push_back(Op);
563  }
564  }
565  return Count < Limit ? C.Root : nullptr;
566 }
567 
568 //===----------------------------------------------------------------------===//
569 //
570 // Implementation of PolynomialMultiplyRecognize
571 //
572 //===----------------------------------------------------------------------===//
573 
574 namespace {
575 
576  class PolynomialMultiplyRecognize {
577  public:
578  explicit PolynomialMultiplyRecognize(Loop *loop, const DataLayout &dl,
579  const DominatorTree &dt, const TargetLibraryInfo &tli,
580  ScalarEvolution &se)
581  : CurLoop(loop), DL(dl), DT(dt), TLI(tli), SE(se) {}
582 
583  bool recognize();
584 
585  private:
586  using ValueSeq = SetVector<Value *>;
587 
588  IntegerType *getPmpyType() const {
589  LLVMContext &Ctx = CurLoop->getHeader()->getParent()->getContext();
590  return IntegerType::get(Ctx, 32);
591  }
592 
593  bool isPromotableTo(Value *V, IntegerType *Ty);
594  void promoteTo(Instruction *In, IntegerType *DestTy, BasicBlock *LoopB);
595  bool promoteTypes(BasicBlock *LoopB, BasicBlock *ExitB);
596 
597  Value *getCountIV(BasicBlock *BB);
598  bool findCycle(Value *Out, Value *In, ValueSeq &Cycle);
599  void classifyCycle(Instruction *DivI, ValueSeq &Cycle, ValueSeq &Early,
600  ValueSeq &Late);
601  bool classifyInst(Instruction *UseI, ValueSeq &Early, ValueSeq &Late);
602  bool commutesWithShift(Instruction *I);
603  bool highBitsAreZero(Value *V, unsigned IterCount);
604  bool keepsHighBitsZero(Value *V, unsigned IterCount);
605  bool isOperandShifted(Instruction *I, Value *Op);
606  bool convertShiftsToLeft(BasicBlock *LoopB, BasicBlock *ExitB,
607  unsigned IterCount);
608  void cleanupLoopBody(BasicBlock *LoopB);
609 
610  struct ParsedValues {
611  ParsedValues() = default;
612 
613  Value *M = nullptr;
614  Value *P = nullptr;
615  Value *Q = nullptr;
616  Value *R = nullptr;
617  Value *X = nullptr;
618  Instruction *Res = nullptr;
619  unsigned IterCount = 0;
620  bool Left = false;
621  bool Inv = false;
622  };
623 
624  bool matchLeftShift(SelectInst *SelI, Value *CIV, ParsedValues &PV);
625  bool matchRightShift(SelectInst *SelI, ParsedValues &PV);
626  bool scanSelect(SelectInst *SI, BasicBlock *LoopB, BasicBlock *PrehB,
627  Value *CIV, ParsedValues &PV, bool PreScan);
628  unsigned getInverseMxN(unsigned QP);
629  Value *generate(BasicBlock::iterator At, ParsedValues &PV);
630 
631  void setupPreSimplifier(Simplifier &S);
632  void setupPostSimplifier(Simplifier &S);
633 
634  Loop *CurLoop;
635  const DataLayout &DL;
636  const DominatorTree &DT;
637  const TargetLibraryInfo &TLI;
638  ScalarEvolution &SE;
639  };
640 
641 } // end anonymous namespace
642 
643 Value *PolynomialMultiplyRecognize::getCountIV(BasicBlock *BB) {
644  pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
645  if (std::distance(PI, PE) != 2)
646  return nullptr;
647  BasicBlock *PB = (*PI == BB) ? *std::next(PI) : *PI;
648 
649  for (auto I = BB->begin(), E = BB->end(); I != E && isa<PHINode>(I); ++I) {
650  auto *PN = cast<PHINode>(I);
651  Value *InitV = PN->getIncomingValueForBlock(PB);
652  if (!isa<ConstantInt>(InitV) || !cast<ConstantInt>(InitV)->isZero())
653  continue;
654  Value *IterV = PN->getIncomingValueForBlock(BB);
655  auto *BO = dyn_cast<BinaryOperator>(IterV);
656  if (!BO)
657  continue;
658  if (BO->getOpcode() != Instruction::Add)
659  continue;
660  Value *IncV = nullptr;
661  if (BO->getOperand(0) == PN)
662  IncV = BO->getOperand(1);
663  else if (BO->getOperand(1) == PN)
664  IncV = BO->getOperand(0);
665  if (IncV == nullptr)
666  continue;
667 
668  if (auto *T = dyn_cast<ConstantInt>(IncV))
669  if (T->getZExtValue() == 1)
670  return PN;
671  }
672  return nullptr;
673 }
674 
676  for (auto UI = I->user_begin(), UE = I->user_end(); UI != UE;) {
677  Use &TheUse = UI.getUse();
678  ++UI;
679  if (auto *II = dyn_cast<Instruction>(TheUse.getUser()))
680  if (BB == II->getParent())
681  II->replaceUsesOfWith(I, J);
682  }
683 }
684 
685 bool PolynomialMultiplyRecognize::matchLeftShift(SelectInst *SelI,
686  Value *CIV, ParsedValues &PV) {
687  // Match the following:
688  // select (X & (1 << i)) != 0 ? R ^ (Q << i) : R
689  // select (X & (1 << i)) == 0 ? R : R ^ (Q << i)
690  // The condition may also check for equality with the masked value, i.e
691  // select (X & (1 << i)) == (1 << i) ? R ^ (Q << i) : R
692  // select (X & (1 << i)) != (1 << i) ? R : R ^ (Q << i);
693 
694  Value *CondV = SelI->getCondition();
695  Value *TrueV = SelI->getTrueValue();
696  Value *FalseV = SelI->getFalseValue();
697 
698  using namespace PatternMatch;
699 
701  Value *A = nullptr, *B = nullptr, *C = nullptr;
702 
703  if (!match(CondV, m_ICmp(P, m_And(m_Value(A), m_Value(B)), m_Value(C))) &&
704  !match(CondV, m_ICmp(P, m_Value(C), m_And(m_Value(A), m_Value(B)))))
705  return false;
706  if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE)
707  return false;
708  // Matched: select (A & B) == C ? ... : ...
709  // select (A & B) != C ? ... : ...
710 
711  Value *X = nullptr, *Sh1 = nullptr;
712  // Check (A & B) for (X & (1 << i)):
713  if (match(A, m_Shl(m_One(), m_Specific(CIV)))) {
714  Sh1 = A;
715  X = B;
716  } else if (match(B, m_Shl(m_One(), m_Specific(CIV)))) {
717  Sh1 = B;
718  X = A;
719  } else {
720  // TODO: Could also check for an induction variable containing single
721  // bit shifted left by 1 in each iteration.
722  return false;
723  }
724 
725  bool TrueIfZero;
726 
727  // Check C against the possible values for comparison: 0 and (1 << i):
728  if (match(C, m_Zero()))
729  TrueIfZero = (P == CmpInst::ICMP_EQ);
730  else if (C == Sh1)
731  TrueIfZero = (P == CmpInst::ICMP_NE);
732  else
733  return false;
734 
735  // So far, matched:
736  // select (X & (1 << i)) ? ... : ...
737  // including variations of the check against zero/non-zero value.
738 
739  Value *ShouldSameV = nullptr, *ShouldXoredV = nullptr;
740  if (TrueIfZero) {
741  ShouldSameV = TrueV;
742  ShouldXoredV = FalseV;
743  } else {
744  ShouldSameV = FalseV;
745  ShouldXoredV = TrueV;
746  }
747 
748  Value *Q = nullptr, *R = nullptr, *Y = nullptr, *Z = nullptr;
749  Value *T = nullptr;
750  if (match(ShouldXoredV, m_Xor(m_Value(Y), m_Value(Z)))) {
751  // Matched: select +++ ? ... : Y ^ Z
752  // select +++ ? Y ^ Z : ...
753  // where +++ denotes previously checked matches.
754  if (ShouldSameV == Y)
755  T = Z;
756  else if (ShouldSameV == Z)
757  T = Y;
758  else
759  return false;
760  R = ShouldSameV;
761  // Matched: select +++ ? R : R ^ T
762  // select +++ ? R ^ T : R
763  // depending on TrueIfZero.
764 
765  } else if (match(ShouldSameV, m_Zero())) {
766  // Matched: select +++ ? 0 : ...
767  // select +++ ? ... : 0
768  if (!SelI->hasOneUse())
769  return false;
770  T = ShouldXoredV;
771  // Matched: select +++ ? 0 : T
772  // select +++ ? T : 0
773 
774  Value *U = *SelI->user_begin();
775  if (!match(U, m_Xor(m_Specific(SelI), m_Value(R))) &&
776  !match(U, m_Xor(m_Value(R), m_Specific(SelI))))
777  return false;
778  // Matched: xor (select +++ ? 0 : T), R
779  // xor (select +++ ? T : 0), R
780  } else
781  return false;
782 
783  // The xor input value T is isolated into its own match so that it could
784  // be checked against an induction variable containing a shifted bit
785  // (todo).
786  // For now, check against (Q << i).
787  if (!match(T, m_Shl(m_Value(Q), m_Specific(CIV))) &&
788  !match(T, m_Shl(m_ZExt(m_Value(Q)), m_ZExt(m_Specific(CIV)))))
789  return false;
790  // Matched: select +++ ? R : R ^ (Q << i)
791  // select +++ ? R ^ (Q << i) : R
792 
793  PV.X = X;
794  PV.Q = Q;
795  PV.R = R;
796  PV.Left = true;
797  return true;
798 }
799 
800 bool PolynomialMultiplyRecognize::matchRightShift(SelectInst *SelI,
801  ParsedValues &PV) {
802  // Match the following:
803  // select (X & 1) != 0 ? (R >> 1) ^ Q : (R >> 1)
804  // select (X & 1) == 0 ? (R >> 1) : (R >> 1) ^ Q
805  // The condition may also check for equality with the masked value, i.e
806  // select (X & 1) == 1 ? (R >> 1) ^ Q : (R >> 1)
807  // select (X & 1) != 1 ? (R >> 1) : (R >> 1) ^ Q
808 
809  Value *CondV = SelI->getCondition();
810  Value *TrueV = SelI->getTrueValue();
811  Value *FalseV = SelI->getFalseValue();
812 
813  using namespace PatternMatch;
814 
815  Value *C = nullptr;
817  bool TrueIfZero;
818 
819  if (match(CondV, m_ICmp(P, m_Value(C), m_Zero())) ||
820  match(CondV, m_ICmp(P, m_Zero(), m_Value(C)))) {
821  if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE)
822  return false;
823  // Matched: select C == 0 ? ... : ...
824  // select C != 0 ? ... : ...
825  TrueIfZero = (P == CmpInst::ICMP_EQ);
826  } else if (match(CondV, m_ICmp(P, m_Value(C), m_One())) ||
827  match(CondV, m_ICmp(P, m_One(), m_Value(C)))) {
828  if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE)
829  return false;
830  // Matched: select C == 1 ? ... : ...
831  // select C != 1 ? ... : ...
832  TrueIfZero = (P == CmpInst::ICMP_NE);
833  } else
834  return false;
835 
836  Value *X = nullptr;
837  if (!match(C, m_And(m_Value(X), m_One())) &&
838  !match(C, m_And(m_One(), m_Value(X))))
839  return false;
840  // Matched: select (X & 1) == +++ ? ... : ...
841  // select (X & 1) != +++ ? ... : ...
842 
843  Value *R = nullptr, *Q = nullptr;
844  if (TrueIfZero) {
845  // The select's condition is true if the tested bit is 0.
846  // TrueV must be the shift, FalseV must be the xor.
847  if (!match(TrueV, m_LShr(m_Value(R), m_One())))
848  return false;
849  // Matched: select +++ ? (R >> 1) : ...
850  if (!match(FalseV, m_Xor(m_Specific(TrueV), m_Value(Q))) &&
851  !match(FalseV, m_Xor(m_Value(Q), m_Specific(TrueV))))
852  return false;
853  // Matched: select +++ ? (R >> 1) : (R >> 1) ^ Q
854  // with commuting ^.
855  } else {
856  // The select's condition is true if the tested bit is 1.
857  // TrueV must be the xor, FalseV must be the shift.
858  if (!match(FalseV, m_LShr(m_Value(R), m_One())))
859  return false;
860  // Matched: select +++ ? ... : (R >> 1)
861  if (!match(TrueV, m_Xor(m_Specific(FalseV), m_Value(Q))) &&
862  !match(TrueV, m_Xor(m_Value(Q), m_Specific(FalseV))))
863  return false;
864  // Matched: select +++ ? (R >> 1) ^ Q : (R >> 1)
865  // with commuting ^.
866  }
867 
868  PV.X = X;
869  PV.Q = Q;
870  PV.R = R;
871  PV.Left = false;
872  return true;
873 }
874 
875 bool PolynomialMultiplyRecognize::scanSelect(SelectInst *SelI,
876  BasicBlock *LoopB, BasicBlock *PrehB, Value *CIV, ParsedValues &PV,
877  bool PreScan) {
878  using namespace PatternMatch;
879 
880  // The basic pattern for R = P.Q is:
881  // for i = 0..31
882  // R = phi (0, R')
883  // if (P & (1 << i)) ; test-bit(P, i)
884  // R' = R ^ (Q << i)
885  //
886  // Similarly, the basic pattern for R = (P/Q).Q - P
887  // for i = 0..31
888  // R = phi(P, R')
889  // if (R & (1 << i))
890  // R' = R ^ (Q << i)
891 
892  // There exist idioms, where instead of Q being shifted left, P is shifted
893  // right. This produces a result that is shifted right by 32 bits (the
894  // non-shifted result is 64-bit).
895  //
896  // For R = P.Q, this would be:
897  // for i = 0..31
898  // R = phi (0, R')
899  // if ((P >> i) & 1)
900  // R' = (R >> 1) ^ Q ; R is cycled through the loop, so it must
901  // else ; be shifted by 1, not i.
902  // R' = R >> 1
903  //
904  // And for the inverse:
905  // for i = 0..31
906  // R = phi (P, R')
907  // if (R & 1)
908  // R' = (R >> 1) ^ Q
909  // else
910  // R' = R >> 1
911 
912  // The left-shifting idioms share the same pattern:
913  // select (X & (1 << i)) ? R ^ (Q << i) : R
914  // Similarly for right-shifting idioms:
915  // select (X & 1) ? (R >> 1) ^ Q
916 
917  if (matchLeftShift(SelI, CIV, PV)) {
918  // If this is a pre-scan, getting this far is sufficient.
919  if (PreScan)
920  return true;
921 
922  // Need to make sure that the SelI goes back into R.
923  auto *RPhi = dyn_cast<PHINode>(PV.R);
924  if (!RPhi)
925  return false;
926  if (SelI != RPhi->getIncomingValueForBlock(LoopB))
927  return false;
928  PV.Res = SelI;
929 
930  // If X is loop invariant, it must be the input polynomial, and the
931  // idiom is the basic polynomial multiply.
932  if (CurLoop->isLoopInvariant(PV.X)) {
933  PV.P = PV.X;
934  PV.Inv = false;
935  } else {
936  // X is not loop invariant. If X == R, this is the inverse pmpy.
937  // Otherwise, check for an xor with an invariant value. If the
938  // variable argument to the xor is R, then this is still a valid
939  // inverse pmpy.
940  PV.Inv = true;
941  if (PV.X != PV.R) {
942  Value *Var = nullptr, *Inv = nullptr, *X1 = nullptr, *X2 = nullptr;
943  if (!match(PV.X, m_Xor(m_Value(X1), m_Value(X2))))
944  return false;
945  auto *I1 = dyn_cast<Instruction>(X1);
946  auto *I2 = dyn_cast<Instruction>(X2);
947  if (!I1 || I1->getParent() != LoopB) {
948  Var = X2;
949  Inv = X1;
950  } else if (!I2 || I2->getParent() != LoopB) {
951  Var = X1;
952  Inv = X2;
953  } else
954  return false;
955  if (Var != PV.R)
956  return false;
957  PV.M = Inv;
958  }
959  // The input polynomial P still needs to be determined. It will be
960  // the entry value of R.
961  Value *EntryP = RPhi->getIncomingValueForBlock(PrehB);
962  PV.P = EntryP;
963  }
964 
965  return true;
966  }
967 
968  if (matchRightShift(SelI, PV)) {
969  // If this is an inverse pattern, the Q polynomial must be known at
970  // compile time.
971  if (PV.Inv && !isa<ConstantInt>(PV.Q))
972  return false;
973  if (PreScan)
974  return true;
975  // There is no exact matching of right-shift pmpy.
976  return false;
977  }
978 
979  return false;
980 }
981 
982 bool PolynomialMultiplyRecognize::isPromotableTo(Value *Val,
983  IntegerType *DestTy) {
984  IntegerType *T = dyn_cast<IntegerType>(Val->getType());
985  if (!T || T->getBitWidth() > DestTy->getBitWidth())
986  return false;
987  if (T->getBitWidth() == DestTy->getBitWidth())
988  return true;
989  // Non-instructions are promotable. The reason why an instruction may not
990  // be promotable is that it may produce a different result if its operands
991  // and the result are promoted, for example, it may produce more non-zero
992  // bits. While it would still be possible to represent the proper result
993  // in a wider type, it may require adding additional instructions (which
994  // we don't want to do).
995  Instruction *In = dyn_cast<Instruction>(Val);
996  if (!In)
997  return true;
998  // The bitwidth of the source type is smaller than the destination.
999  // Check if the individual operation can be promoted.
1000  switch (In->getOpcode()) {
1001  case Instruction::PHI:
1002  case Instruction::ZExt:
1003  case Instruction::And:
1004  case Instruction::Or:
1005  case Instruction::Xor:
1006  case Instruction::LShr: // Shift right is ok.
1007  case Instruction::Select:
1008  case Instruction::Trunc:
1009  return true;
1010  case Instruction::ICmp:
1011  if (CmpInst *CI = cast<CmpInst>(In))
1012  return CI->isEquality() || CI->isUnsigned();
1013  llvm_unreachable("Cast failed unexpectedly");
1014  case Instruction::Add:
1015  return In->hasNoSignedWrap() && In->hasNoUnsignedWrap();
1016  }
1017  return false;
1018 }
1019 
1020 void PolynomialMultiplyRecognize::promoteTo(Instruction *In,
1021  IntegerType *DestTy, BasicBlock *LoopB) {
1022  Type *OrigTy = In->getType();
1023  assert(!OrigTy->isVoidTy() && "Invalid instruction to promote");
1024 
1025  // Leave boolean values alone.
1026  if (!In->getType()->isIntegerTy(1))
1027  In->mutateType(DestTy);
1028  unsigned DestBW = DestTy->getBitWidth();
1029 
1030  // Handle PHIs.
1031  if (PHINode *P = dyn_cast<PHINode>(In)) {
1032  unsigned N = P->getNumIncomingValues();
1033  for (unsigned i = 0; i != N; ++i) {
1034  BasicBlock *InB = P->getIncomingBlock(i);
1035  if (InB == LoopB)
1036  continue;
1037  Value *InV = P->getIncomingValue(i);
1038  IntegerType *Ty = cast<IntegerType>(InV->getType());
1039  // Do not promote values in PHI nodes of type i1.
1040  if (Ty != P->getType()) {
1041  // If the value type does not match the PHI type, the PHI type
1042  // must have been promoted.
1043  assert(Ty->getBitWidth() < DestBW);
1044  InV = IRBuilder<>(InB->getTerminator()).CreateZExt(InV, DestTy);
1045  P->setIncomingValue(i, InV);
1046  }
1047  }
1048  } else if (ZExtInst *Z = dyn_cast<ZExtInst>(In)) {
1049  Value *Op = Z->getOperand(0);
1050  if (Op->getType() == Z->getType())
1051  Z->replaceAllUsesWith(Op);
1052  Z->eraseFromParent();
1053  return;
1054  }
1055  if (TruncInst *T = dyn_cast<TruncInst>(In)) {
1056  IntegerType *TruncTy = cast<IntegerType>(OrigTy);
1057  Value *Mask = ConstantInt::get(DestTy, (1u << TruncTy->getBitWidth()) - 1);
1058  Value *And = IRBuilder<>(In).CreateAnd(T->getOperand(0), Mask);
1059  T->replaceAllUsesWith(And);
1060  T->eraseFromParent();
1061  return;
1062  }
1063 
1064  // Promote immediates.
1065  for (unsigned i = 0, n = In->getNumOperands(); i != n; ++i) {
1066  if (ConstantInt *CI = dyn_cast<ConstantInt>(In->getOperand(i)))
1067  if (CI->getType()->getBitWidth() < DestBW)
1068  In->setOperand(i, ConstantInt::get(DestTy, CI->getZExtValue()));
1069  }
1070 }
1071 
1072 bool PolynomialMultiplyRecognize::promoteTypes(BasicBlock *LoopB,
1073  BasicBlock *ExitB) {
1074  assert(LoopB);
1075  // Skip loops where the exit block has more than one predecessor. The values
1076  // coming from the loop block will be promoted to another type, and so the
1077  // values coming into the exit block from other predecessors would also have
1078  // to be promoted.
1079  if (!ExitB || (ExitB->getSinglePredecessor() != LoopB))
1080  return false;
1081  IntegerType *DestTy = getPmpyType();
1082  // Check if the exit values have types that are no wider than the type
1083  // that we want to promote to.
1084  unsigned DestBW = DestTy->getBitWidth();
1085  for (PHINode &P : ExitB->phis()) {
1086  if (P.getNumIncomingValues() != 1)
1087  return false;
1088  assert(P.getIncomingBlock(0) == LoopB);
1089  IntegerType *T = dyn_cast<IntegerType>(P.getType());
1090  if (!T || T->getBitWidth() > DestBW)
1091  return false;
1092  }
1093 
1094  // Check all instructions in the loop.
1095  for (Instruction &In : *LoopB)
1096  if (!In.isTerminator() && !isPromotableTo(&In, DestTy))
1097  return false;
1098 
1099  // Perform the promotion.
1100  std::vector<Instruction*> LoopIns;
1101  std::transform(LoopB->begin(), LoopB->end(), std::back_inserter(LoopIns),
1102  [](Instruction &In) { return &In; });
1103  for (Instruction *In : LoopIns)
1104  if (!In->isTerminator())
1105  promoteTo(In, DestTy, LoopB);
1106 
1107  // Fix up the PHI nodes in the exit block.
1108  Instruction *EndI = ExitB->getFirstNonPHI();
1109  BasicBlock::iterator End = EndI ? EndI->getIterator() : ExitB->end();
1110  for (auto I = ExitB->begin(); I != End; ++I) {
1111  PHINode *P = dyn_cast<PHINode>(I);
1112  if (!P)
1113  break;
1114  Type *Ty0 = P->getIncomingValue(0)->getType();
1115  Type *PTy = P->getType();
1116  if (PTy != Ty0) {
1117  assert(Ty0 == DestTy);
1118  // In order to create the trunc, P must have the promoted type.
1119  P->mutateType(Ty0);
1120  Value *T = IRBuilder<>(ExitB, End).CreateTrunc(P, PTy);
1121  // In order for the RAUW to work, the types of P and T must match.
1122  P->mutateType(PTy);
1123  P->replaceAllUsesWith(T);
1124  // Final update of the P's type.
1125  P->mutateType(Ty0);
1126  cast<Instruction>(T)->setOperand(0, P);
1127  }
1128  }
1129 
1130  return true;
1131 }
1132 
1133 bool PolynomialMultiplyRecognize::findCycle(Value *Out, Value *In,
1134  ValueSeq &Cycle) {
1135  // Out = ..., In, ...
1136  if (Out == In)
1137  return true;
1138 
1139  auto *BB = cast<Instruction>(Out)->getParent();
1140  bool HadPhi = false;
1141 
1142  for (auto U : Out->users()) {
1143  auto *I = dyn_cast<Instruction>(&*U);
1144  if (I == nullptr || I->getParent() != BB)
1145  continue;
1146  // Make sure that there are no multi-iteration cycles, e.g.
1147  // p1 = phi(p2)
1148  // p2 = phi(p1)
1149  // The cycle p1->p2->p1 would span two loop iterations.
1150  // Check that there is only one phi in the cycle.
1151  bool IsPhi = isa<PHINode>(I);
1152  if (IsPhi && HadPhi)
1153  return false;
1154  HadPhi |= IsPhi;
1155  if (Cycle.count(I))
1156  return false;
1157  Cycle.insert(I);
1158  if (findCycle(I, In, Cycle))
1159  break;
1160  Cycle.remove(I);
1161  }
1162  return !Cycle.empty();
1163 }
1164 
1165 void PolynomialMultiplyRecognize::classifyCycle(Instruction *DivI,
1166  ValueSeq &Cycle, ValueSeq &Early, ValueSeq &Late) {
1167  // All the values in the cycle that are between the phi node and the
1168  // divider instruction will be classified as "early", all other values
1169  // will be "late".
1170 
1171  bool IsE = true;
1172  unsigned I, N = Cycle.size();
1173  for (I = 0; I < N; ++I) {
1174  Value *V = Cycle[I];
1175  if (DivI == V)
1176  IsE = false;
1177  else if (!isa<PHINode>(V))
1178  continue;
1179  // Stop if found either.
1180  break;
1181  }
1182  // "I" is the index of either DivI or the phi node, whichever was first.
1183  // "E" is "false" or "true" respectively.
1184  ValueSeq &First = !IsE ? Early : Late;
1185  for (unsigned J = 0; J < I; ++J)
1186  First.insert(Cycle[J]);
1187 
1188  ValueSeq &Second = IsE ? Early : Late;
1189  Second.insert(Cycle[I]);
1190  for (++I; I < N; ++I) {
1191  Value *V = Cycle[I];
1192  if (DivI == V || isa<PHINode>(V))
1193  break;
1194  Second.insert(V);
1195  }
1196 
1197  for (; I < N; ++I)
1198  First.insert(Cycle[I]);
1199 }
1200 
1201 bool PolynomialMultiplyRecognize::classifyInst(Instruction *UseI,
1202  ValueSeq &Early, ValueSeq &Late) {
1203  // Select is an exception, since the condition value does not have to be
1204  // classified in the same way as the true/false values. The true/false
1205  // values do have to be both early or both late.
1206  if (UseI->getOpcode() == Instruction::Select) {
1207  Value *TV = UseI->getOperand(1), *FV = UseI->getOperand(2);
1208  if (Early.count(TV) || Early.count(FV)) {
1209  if (Late.count(TV) || Late.count(FV))
1210  return false;
1211  Early.insert(UseI);
1212  } else if (Late.count(TV) || Late.count(FV)) {
1213  if (Early.count(TV) || Early.count(FV))
1214  return false;
1215  Late.insert(UseI);
1216  }
1217  return true;
1218  }
1219 
1220  // Not sure what would be the example of this, but the code below relies
1221  // on having at least one operand.
1222  if (UseI->getNumOperands() == 0)
1223  return true;
1224 
1225  bool AE = true, AL = true;
1226  for (auto &I : UseI->operands()) {
1227  if (Early.count(&*I))
1228  AL = false;
1229  else if (Late.count(&*I))
1230  AE = false;
1231  }
1232  // If the operands appear "all early" and "all late" at the same time,
1233  // then it means that none of them are actually classified as either.
1234  // This is harmless.
1235  if (AE && AL)
1236  return true;
1237  // Conversely, if they are neither "all early" nor "all late", then
1238  // we have a mixture of early and late operands that is not a known
1239  // exception.
1240  if (!AE && !AL)
1241  return false;
1242 
1243  // Check that we have covered the two special cases.
1244  assert(AE != AL);
1245 
1246  if (AE)
1247  Early.insert(UseI);
1248  else
1249  Late.insert(UseI);
1250  return true;
1251 }
1252 
1253 bool PolynomialMultiplyRecognize::commutesWithShift(Instruction *I) {
1254  switch (I->getOpcode()) {
1255  case Instruction::And:
1256  case Instruction::Or:
1257  case Instruction::Xor:
1258  case Instruction::LShr:
1259  case Instruction::Shl:
1260  case Instruction::Select:
1261  case Instruction::ICmp:
1262  case Instruction::PHI:
1263  break;
1264  default:
1265  return false;
1266  }
1267  return true;
1268 }
1269 
1270 bool PolynomialMultiplyRecognize::highBitsAreZero(Value *V,
1271  unsigned IterCount) {
1272  auto *T = dyn_cast<IntegerType>(V->getType());
1273  if (!T)
1274  return false;
1275 
1276  KnownBits Known(T->getBitWidth());
1277  computeKnownBits(V, Known, DL);
1278  return Known.countMinLeadingZeros() >= IterCount;
1279 }
1280 
1281 bool PolynomialMultiplyRecognize::keepsHighBitsZero(Value *V,
1282  unsigned IterCount) {
1283  // Assume that all inputs to the value have the high bits zero.
1284  // Check if the value itself preserves the zeros in the high bits.
1285  if (auto *C = dyn_cast<ConstantInt>(V))
1286  return C->getValue().countLeadingZeros() >= IterCount;
1287 
1288  if (auto *I = dyn_cast<Instruction>(V)) {
1289  switch (I->getOpcode()) {
1290  case Instruction::And:
1291  case Instruction::Or:
1292  case Instruction::Xor:
1293  case Instruction::LShr:
1294  case Instruction::Select:
1295  case Instruction::ICmp:
1296  case Instruction::PHI:
1297  case Instruction::ZExt:
1298  return true;
1299  }
1300  }
1301 
1302  return false;
1303 }
1304 
1305 bool PolynomialMultiplyRecognize::isOperandShifted(Instruction *I, Value *Op) {
1306  unsigned Opc = I->getOpcode();
1307  if (Opc == Instruction::Shl || Opc == Instruction::LShr)
1308  return Op != I->getOperand(1);
1309  return true;
1310 }
1311 
1312 bool PolynomialMultiplyRecognize::convertShiftsToLeft(BasicBlock *LoopB,
1313  BasicBlock *ExitB, unsigned IterCount) {
1314  Value *CIV = getCountIV(LoopB);
1315  if (CIV == nullptr)
1316  return false;
1317  auto *CIVTy = dyn_cast<IntegerType>(CIV->getType());
1318  if (CIVTy == nullptr)
1319  return false;
1320 
1321  ValueSeq RShifts;
1322  ValueSeq Early, Late, Cycled;
1323 
1324  // Find all value cycles that contain logical right shifts by 1.
1325  for (Instruction &I : *LoopB) {
1326  using namespace PatternMatch;
1327 
1328  Value *V = nullptr;
1329  if (!match(&I, m_LShr(m_Value(V), m_One())))
1330  continue;
1331  ValueSeq C;
1332  if (!findCycle(&I, V, C))
1333  continue;
1334 
1335  // Found a cycle.
1336  C.insert(&I);
1337  classifyCycle(&I, C, Early, Late);
1338  Cycled.insert(C.begin(), C.end());
1339  RShifts.insert(&I);
1340  }
1341 
1342  // Find the set of all values affected by the shift cycles, i.e. all
1343  // cycled values, and (recursively) all their users.
1344  ValueSeq Users(Cycled.begin(), Cycled.end());
1345  for (unsigned i = 0; i < Users.size(); ++i) {
1346  Value *V = Users[i];
1347  if (!isa<IntegerType>(V->getType()))
1348  return false;
1349  auto *R = cast<Instruction>(V);
1350  // If the instruction does not commute with shifts, the loop cannot
1351  // be unshifted.
1352  if (!commutesWithShift(R))
1353  return false;
1354  for (auto I = R->user_begin(), E = R->user_end(); I != E; ++I) {
1355  auto *T = cast<Instruction>(*I);
1356  // Skip users from outside of the loop. They will be handled later.
1357  // Also, skip the right-shifts and phi nodes, since they mix early
1358  // and late values.
1359  if (T->getParent() != LoopB || RShifts.count(T) || isa<PHINode>(T))
1360  continue;
1361 
1362  Users.insert(T);
1363  if (!classifyInst(T, Early, Late))
1364  return false;
1365  }
1366  }
1367 
1368  if (Users.empty())
1369  return false;
1370 
1371  // Verify that high bits remain zero.
1372  ValueSeq Internal(Users.begin(), Users.end());
1373  ValueSeq Inputs;
1374  for (unsigned i = 0; i < Internal.size(); ++i) {
1375  auto *R = dyn_cast<Instruction>(Internal[i]);
1376  if (!R)
1377  continue;
1378  for (Value *Op : R->operands()) {
1379  auto *T = dyn_cast<Instruction>(Op);
1380  if (T && T->getParent() != LoopB)
1381  Inputs.insert(Op);
1382  else
1383  Internal.insert(Op);
1384  }
1385  }
1386  for (Value *V : Inputs)
1387  if (!highBitsAreZero(V, IterCount))
1388  return false;
1389  for (Value *V : Internal)
1390  if (!keepsHighBitsZero(V, IterCount))
1391  return false;
1392 
1393  // Finally, the work can be done. Unshift each user.
1394  IRBuilder<> IRB(LoopB);
1395  std::map<Value*,Value*> ShiftMap;
1396 
1397  using CastMapType = std::map<std::pair<Value *, Type *>, Value *>;
1398 
1399  CastMapType CastMap;
1400 
1401  auto upcast = [] (CastMapType &CM, IRBuilder<> &IRB, Value *V,
1402  IntegerType *Ty) -> Value* {
1403  auto H = CM.find(std::make_pair(V, Ty));
1404  if (H != CM.end())
1405  return H->second;
1406  Value *CV = IRB.CreateIntCast(V, Ty, false);
1407  CM.insert(std::make_pair(std::make_pair(V, Ty), CV));
1408  return CV;
1409  };
1410 
1411  for (auto I = LoopB->begin(), E = LoopB->end(); I != E; ++I) {
1412  using namespace PatternMatch;
1413 
1414  if (isa<PHINode>(I) || !Users.count(&*I))
1415  continue;
1416 
1417  // Match lshr x, 1.
1418  Value *V = nullptr;
1419  if (match(&*I, m_LShr(m_Value(V), m_One()))) {
1420  replaceAllUsesOfWithIn(&*I, V, LoopB);
1421  continue;
1422  }
1423  // For each non-cycled operand, replace it with the corresponding
1424  // value shifted left.
1425  for (auto &J : I->operands()) {
1426  Value *Op = J.get();
1427  if (!isOperandShifted(&*I, Op))
1428  continue;
1429  if (Users.count(Op))
1430  continue;
1431  // Skip shifting zeros.
1432  if (isa<ConstantInt>(Op) && cast<ConstantInt>(Op)->isZero())
1433  continue;
1434  // Check if we have already generated a shift for this value.
1435  auto F = ShiftMap.find(Op);
1436  Value *W = (F != ShiftMap.end()) ? F->second : nullptr;
1437  if (W == nullptr) {
1438  IRB.SetInsertPoint(&*I);
1439  // First, the shift amount will be CIV or CIV+1, depending on
1440  // whether the value is early or late. Instead of creating CIV+1,
1441  // do a single shift of the value.
1442  Value *ShAmt = CIV, *ShVal = Op;
1443  auto *VTy = cast<IntegerType>(ShVal->getType());
1444  auto *ATy = cast<IntegerType>(ShAmt->getType());
1445  if (Late.count(&*I))
1446  ShVal = IRB.CreateShl(Op, ConstantInt::get(VTy, 1));
1447  // Second, the types of the shifted value and the shift amount
1448  // must match.
1449  if (VTy != ATy) {
1450  if (VTy->getBitWidth() < ATy->getBitWidth())
1451  ShVal = upcast(CastMap, IRB, ShVal, ATy);
1452  else
1453  ShAmt = upcast(CastMap, IRB, ShAmt, VTy);
1454  }
1455  // Ready to generate the shift and memoize it.
1456  W = IRB.CreateShl(ShVal, ShAmt);
1457  ShiftMap.insert(std::make_pair(Op, W));
1458  }
1459  I->replaceUsesOfWith(Op, W);
1460  }
1461  }
1462 
1463  // Update the users outside of the loop to account for having left
1464  // shifts. They would normally be shifted right in the loop, so shift
1465  // them right after the loop exit.
1466  // Take advantage of the loop-closed SSA form, which has all the post-
1467  // loop values in phi nodes.
1468  IRB.SetInsertPoint(ExitB, ExitB->getFirstInsertionPt());
1469  for (auto P = ExitB->begin(), Q = ExitB->end(); P != Q; ++P) {
1470  if (!isa<PHINode>(P))
1471  break;
1472  auto *PN = cast<PHINode>(P);
1473  Value *U = PN->getIncomingValueForBlock(LoopB);
1474  if (!Users.count(U))
1475  continue;
1476  Value *S = IRB.CreateLShr(PN, ConstantInt::get(PN->getType(), IterCount));
1477  PN->replaceAllUsesWith(S);
1478  // The above RAUW will create
1479  // S = lshr S, IterCount
1480  // so we need to fix it back into
1481  // S = lshr PN, IterCount
1482  cast<User>(S)->replaceUsesOfWith(S, PN);
1483  }
1484 
1485  return true;
1486 }
1487 
1488 void PolynomialMultiplyRecognize::cleanupLoopBody(BasicBlock *LoopB) {
1489  for (auto &I : *LoopB)
1490  if (Value *SV = SimplifyInstruction(&I, {DL, &TLI, &DT}))
1491  I.replaceAllUsesWith(SV);
1492 
1493  for (auto I = LoopB->begin(), N = I; I != LoopB->end(); I = N) {
1494  N = std::next(I);
1496  }
1497 }
1498 
1499 unsigned PolynomialMultiplyRecognize::getInverseMxN(unsigned QP) {
1500  // Arrays of coefficients of Q and the inverse, C.
1501  // Q[i] = coefficient at x^i.
1502  std::array<char,32> Q, C;
1503 
1504  for (unsigned i = 0; i < 32; ++i) {
1505  Q[i] = QP & 1;
1506  QP >>= 1;
1507  }
1508  assert(Q[0] == 1);
1509 
1510  // Find C, such that
1511  // (Q[n]*x^n + ... + Q[1]*x + Q[0]) * (C[n]*x^n + ... + C[1]*x + C[0]) = 1
1512  //
1513  // For it to have a solution, Q[0] must be 1. Since this is Z2[x], the
1514  // operations * and + are & and ^ respectively.
1515  //
1516  // Find C[i] recursively, by comparing i-th coefficient in the product
1517  // with 0 (or 1 for i=0).
1518  //
1519  // C[0] = 1, since C[0] = Q[0], and Q[0] = 1.
1520  C[0] = 1;
1521  for (unsigned i = 1; i < 32; ++i) {
1522  // Solve for C[i] in:
1523  // C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] ^ C[i]Q[0] = 0
1524  // This is equivalent to
1525  // C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] ^ C[i] = 0
1526  // which is
1527  // C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] = C[i]
1528  unsigned T = 0;
1529  for (unsigned j = 0; j < i; ++j)
1530  T = T ^ (C[j] & Q[i-j]);
1531  C[i] = T;
1532  }
1533 
1534  unsigned QV = 0;
1535  for (unsigned i = 0; i < 32; ++i)
1536  if (C[i])
1537  QV |= (1 << i);
1538 
1539  return QV;
1540 }
1541 
1543  ParsedValues &PV) {
1544  IRBuilder<> B(&*At);
1545  Module *M = At->getParent()->getParent()->getParent();
1546  Function *PMF = Intrinsic::getDeclaration(M, Intrinsic::hexagon_M4_pmpyw);
1547 
1548  Value *P = PV.P, *Q = PV.Q, *P0 = P;
1549  unsigned IC = PV.IterCount;
1550 
1551  if (PV.M != nullptr)
1552  P0 = P = B.CreateXor(P, PV.M);
1553 
1554  // Create a bit mask to clear the high bits beyond IterCount.
1555  auto *BMI = ConstantInt::get(P->getType(), APInt::getLowBitsSet(32, IC));
1556 
1557  if (PV.IterCount != 32)
1558  P = B.CreateAnd(P, BMI);
1559 
1560  if (PV.Inv) {
1561  auto *QI = dyn_cast<ConstantInt>(PV.Q);
1562  assert(QI && QI->getBitWidth() <= 32);
1563 
1564  // Again, clearing bits beyond IterCount.
1565  unsigned M = (1 << PV.IterCount) - 1;
1566  unsigned Tmp = (QI->getZExtValue() | 1) & M;
1567  unsigned QV = getInverseMxN(Tmp) & M;
1568  auto *QVI = ConstantInt::get(QI->getType(), QV);
1569  P = B.CreateCall(PMF, {P, QVI});
1570  P = B.CreateTrunc(P, QI->getType());
1571  if (IC != 32)
1572  P = B.CreateAnd(P, BMI);
1573  }
1574 
1575  Value *R = B.CreateCall(PMF, {P, Q});
1576 
1577  if (PV.M != nullptr)
1578  R = B.CreateXor(R, B.CreateIntCast(P0, R->getType(), false));
1579 
1580  return R;
1581 }
1582 
1583 static bool hasZeroSignBit(const Value *V) {
1584  if (const auto *CI = dyn_cast<const ConstantInt>(V))
1585  return (CI->getType()->getSignBit() & CI->getSExtValue()) == 0;
1586  const Instruction *I = dyn_cast<const Instruction>(V);
1587  if (!I)
1588  return false;
1589  switch (I->getOpcode()) {
1590  case Instruction::LShr:
1591  if (const auto SI = dyn_cast<const ConstantInt>(I->getOperand(1)))
1592  return SI->getZExtValue() > 0;
1593  return false;
1594  case Instruction::Or:
1595  case Instruction::Xor:
1596  return hasZeroSignBit(I->getOperand(0)) &&
1597  hasZeroSignBit(I->getOperand(1));
1598  case Instruction::And:
1599  return hasZeroSignBit(I->getOperand(0)) ||
1600  hasZeroSignBit(I->getOperand(1));
1601  }
1602  return false;
1603 }
1604 
1605 void PolynomialMultiplyRecognize::setupPreSimplifier(Simplifier &S) {
1606  S.addRule("sink-zext",
1607  // Sink zext past bitwise operations.
1608  [](Instruction *I, LLVMContext &Ctx) -> Value* {
1609  if (I->getOpcode() != Instruction::ZExt)
1610  return nullptr;
1611  Instruction *T = dyn_cast<Instruction>(I->getOperand(0));
1612  if (!T)
1613  return nullptr;
1614  switch (T->getOpcode()) {
1615  case Instruction::And:
1616  case Instruction::Or:
1617  case Instruction::Xor:
1618  break;
1619  default:
1620  return nullptr;
1621  }
1622  IRBuilder<> B(Ctx);
1623  return B.CreateBinOp(cast<BinaryOperator>(T)->getOpcode(),
1624  B.CreateZExt(T->getOperand(0), I->getType()),
1625  B.CreateZExt(T->getOperand(1), I->getType()));
1626  });
1627  S.addRule("xor/and -> and/xor",
1628  // (xor (and x a) (and y a)) -> (and (xor x y) a)
1629  [](Instruction *I, LLVMContext &Ctx) -> Value* {
1630  if (I->getOpcode() != Instruction::Xor)
1631  return nullptr;
1632  Instruction *And0 = dyn_cast<Instruction>(I->getOperand(0));
1633  Instruction *And1 = dyn_cast<Instruction>(I->getOperand(1));
1634  if (!And0 || !And1)
1635  return nullptr;
1636  if (And0->getOpcode() != Instruction::And ||
1637  And1->getOpcode() != Instruction::And)
1638  return nullptr;
1639  if (And0->getOperand(1) != And1->getOperand(1))
1640  return nullptr;
1641  IRBuilder<> B(Ctx);
1642  return B.CreateAnd(B.CreateXor(And0->getOperand(0), And1->getOperand(0)),
1643  And0->getOperand(1));
1644  });
1645  S.addRule("sink binop into select",
1646  // (Op (select c x y) z) -> (select c (Op x z) (Op y z))
1647  // (Op x (select c y z)) -> (select c (Op x y) (Op x z))
1648  [](Instruction *I, LLVMContext &Ctx) -> Value* {
1649  BinaryOperator *BO = dyn_cast<BinaryOperator>(I);
1650  if (!BO)
1651  return nullptr;
1653  if (SelectInst *Sel = dyn_cast<SelectInst>(BO->getOperand(0))) {
1654  IRBuilder<> B(Ctx);
1655  Value *X = Sel->getTrueValue(), *Y = Sel->getFalseValue();
1656  Value *Z = BO->getOperand(1);
1657  return B.CreateSelect(Sel->getCondition(),
1658  B.CreateBinOp(Op, X, Z),
1659  B.CreateBinOp(Op, Y, Z));
1660  }
1661  if (SelectInst *Sel = dyn_cast<SelectInst>(BO->getOperand(1))) {
1662  IRBuilder<> B(Ctx);
1663  Value *X = BO->getOperand(0);
1664  Value *Y = Sel->getTrueValue(), *Z = Sel->getFalseValue();
1665  return B.CreateSelect(Sel->getCondition(),
1666  B.CreateBinOp(Op, X, Y),
1667  B.CreateBinOp(Op, X, Z));
1668  }
1669  return nullptr;
1670  });
1671  S.addRule("fold select-select",
1672  // (select c (select c x y) z) -> (select c x z)
1673  // (select c x (select c y z)) -> (select c x z)
1674  [](Instruction *I, LLVMContext &Ctx) -> Value* {
1675  SelectInst *Sel = dyn_cast<SelectInst>(I);
1676  if (!Sel)
1677  return nullptr;
1678  IRBuilder<> B(Ctx);
1679  Value *C = Sel->getCondition();
1680  if (SelectInst *Sel0 = dyn_cast<SelectInst>(Sel->getTrueValue())) {
1681  if (Sel0->getCondition() == C)
1682  return B.CreateSelect(C, Sel0->getTrueValue(), Sel->getFalseValue());
1683  }
1684  if (SelectInst *Sel1 = dyn_cast<SelectInst>(Sel->getFalseValue())) {
1685  if (Sel1->getCondition() == C)
1686  return B.CreateSelect(C, Sel->getTrueValue(), Sel1->getFalseValue());
1687  }
1688  return nullptr;
1689  });
1690  S.addRule("or-signbit -> xor-signbit",
1691  // (or (lshr x 1) 0x800.0) -> (xor (lshr x 1) 0x800.0)
1692  [](Instruction *I, LLVMContext &Ctx) -> Value* {
1693  if (I->getOpcode() != Instruction::Or)
1694  return nullptr;
1695  ConstantInt *Msb = dyn_cast<ConstantInt>(I->getOperand(1));
1696  if (!Msb || Msb->getZExtValue() != Msb->getType()->getSignBit())
1697  return nullptr;
1698  if (!hasZeroSignBit(I->getOperand(0)))
1699  return nullptr;
1700  return IRBuilder<>(Ctx).CreateXor(I->getOperand(0), Msb);
1701  });
1702  S.addRule("sink lshr into binop",
1703  // (lshr (BitOp x y) c) -> (BitOp (lshr x c) (lshr y c))
1704  [](Instruction *I, LLVMContext &Ctx) -> Value* {
1705  if (I->getOpcode() != Instruction::LShr)
1706  return nullptr;
1707  BinaryOperator *BitOp = dyn_cast<BinaryOperator>(I->getOperand(0));
1708  if (!BitOp)
1709  return nullptr;
1710  switch (BitOp->getOpcode()) {
1711  case Instruction::And:
1712  case Instruction::Or:
1713  case Instruction::Xor:
1714  break;
1715  default:
1716  return nullptr;
1717  }
1718  IRBuilder<> B(Ctx);
1719  Value *S = I->getOperand(1);
1720  return B.CreateBinOp(BitOp->getOpcode(),
1721  B.CreateLShr(BitOp->getOperand(0), S),
1722  B.CreateLShr(BitOp->getOperand(1), S));
1723  });
1724  S.addRule("expose bitop-const",
1725  // (BitOp1 (BitOp2 x a) b) -> (BitOp2 x (BitOp1 a b))
1726  [](Instruction *I, LLVMContext &Ctx) -> Value* {
1727  auto IsBitOp = [](unsigned Op) -> bool {
1728  switch (Op) {
1729  case Instruction::And:
1730  case Instruction::Or:
1731  case Instruction::Xor:
1732  return true;
1733  }
1734  return false;
1735  };
1736  BinaryOperator *BitOp1 = dyn_cast<BinaryOperator>(I);
1737  if (!BitOp1 || !IsBitOp(BitOp1->getOpcode()))
1738  return nullptr;
1739  BinaryOperator *BitOp2 = dyn_cast<BinaryOperator>(BitOp1->getOperand(0));
1740  if (!BitOp2 || !IsBitOp(BitOp2->getOpcode()))
1741  return nullptr;
1742  ConstantInt *CA = dyn_cast<ConstantInt>(BitOp2->getOperand(1));
1743  ConstantInt *CB = dyn_cast<ConstantInt>(BitOp1->getOperand(1));
1744  if (!CA || !CB)
1745  return nullptr;
1746  IRBuilder<> B(Ctx);
1747  Value *X = BitOp2->getOperand(0);
1748  return B.CreateBinOp(BitOp2->getOpcode(), X,
1749  B.CreateBinOp(BitOp1->getOpcode(), CA, CB));
1750  });
1751 }
1752 
1753 void PolynomialMultiplyRecognize::setupPostSimplifier(Simplifier &S) {
1754  S.addRule("(and (xor (and x a) y) b) -> (and (xor x y) b), if b == b&a",
1755  [](Instruction *I, LLVMContext &Ctx) -> Value* {
1756  if (I->getOpcode() != Instruction::And)
1757  return nullptr;
1758  Instruction *Xor = dyn_cast<Instruction>(I->getOperand(0));
1759  ConstantInt *C0 = dyn_cast<ConstantInt>(I->getOperand(1));
1760  if (!Xor || !C0)
1761  return nullptr;
1762  if (Xor->getOpcode() != Instruction::Xor)
1763  return nullptr;
1764  Instruction *And0 = dyn_cast<Instruction>(Xor->getOperand(0));
1765  Instruction *And1 = dyn_cast<Instruction>(Xor->getOperand(1));
1766  // Pick the first non-null and.
1767  if (!And0 || And0->getOpcode() != Instruction::And)
1768  std::swap(And0, And1);
1769  ConstantInt *C1 = dyn_cast<ConstantInt>(And0->getOperand(1));
1770  if (!C1)
1771  return nullptr;
1772  uint32_t V0 = C0->getZExtValue();
1773  uint32_t V1 = C1->getZExtValue();
1774  if (V0 != (V0 & V1))
1775  return nullptr;
1776  IRBuilder<> B(Ctx);
1777  return B.CreateAnd(B.CreateXor(And0->getOperand(0), And1), C0);
1778  });
1779 }
1780 
1781 bool PolynomialMultiplyRecognize::recognize() {
1782  LLVM_DEBUG(dbgs() << "Starting PolynomialMultiplyRecognize on loop\n"
1783  << *CurLoop << '\n');
1784  // Restrictions:
1785  // - The loop must consist of a single block.
1786  // - The iteration count must be known at compile-time.
1787  // - The loop must have an induction variable starting from 0, and
1788  // incremented in each iteration of the loop.
1789  BasicBlock *LoopB = CurLoop->getHeader();
1790  LLVM_DEBUG(dbgs() << "Loop header:\n" << *LoopB);
1791 
1792  if (LoopB != CurLoop->getLoopLatch())
1793  return false;
1794  BasicBlock *ExitB = CurLoop->getExitBlock();
1795  if (ExitB == nullptr)
1796  return false;
1797  BasicBlock *EntryB = CurLoop->getLoopPreheader();
1798  if (EntryB == nullptr)
1799  return false;
1800 
1801  unsigned IterCount = 0;
1802  const SCEV *CT = SE.getBackedgeTakenCount(CurLoop);
1803  if (isa<SCEVCouldNotCompute>(CT))
1804  return false;
1805  if (auto *CV = dyn_cast<SCEVConstant>(CT))
1806  IterCount = CV->getValue()->getZExtValue() + 1;
1807 
1808  Value *CIV = getCountIV(LoopB);
1809  ParsedValues PV;
1810  Simplifier PreSimp;
1811  PV.IterCount = IterCount;
1812  LLVM_DEBUG(dbgs() << "Loop IV: " << *CIV << "\nIterCount: " << IterCount
1813  << '\n');
1814 
1815  setupPreSimplifier(PreSimp);
1816 
1817  // Perform a preliminary scan of select instructions to see if any of them
1818  // looks like a generator of the polynomial multiply steps. Assume that a
1819  // loop can only contain a single transformable operation, so stop the
1820  // traversal after the first reasonable candidate was found.
1821  // XXX: Currently this approach can modify the loop before being 100% sure
1822  // that the transformation can be carried out.
1823  bool FoundPreScan = false;
1824  auto FeedsPHI = [LoopB](const Value *V) -> bool {
1825  for (const Value *U : V->users()) {
1826  if (const auto *P = dyn_cast<const PHINode>(U))
1827  if (P->getParent() == LoopB)
1828  return true;
1829  }
1830  return false;
1831  };
1832  for (Instruction &In : *LoopB) {
1833  SelectInst *SI = dyn_cast<SelectInst>(&In);
1834  if (!SI || !FeedsPHI(SI))
1835  continue;
1836 
1838  Value *T = PreSimp.simplify(C);
1839  SelectInst *SelI = (T && isa<SelectInst>(T)) ? cast<SelectInst>(T) : SI;
1840  LLVM_DEBUG(dbgs() << "scanSelect(pre-scan): " << PE(C, SelI) << '\n');
1841  if (scanSelect(SelI, LoopB, EntryB, CIV, PV, true)) {
1842  FoundPreScan = true;
1843  if (SelI != SI) {
1844  Value *NewSel = C.materialize(LoopB, SI->getIterator());
1845  SI->replaceAllUsesWith(NewSel);
1847  }
1848  break;
1849  }
1850  }
1851 
1852  if (!FoundPreScan) {
1853  LLVM_DEBUG(dbgs() << "Have not found candidates for pmpy\n");
1854  return false;
1855  }
1856 
1857  if (!PV.Left) {
1858  // The right shift version actually only returns the higher bits of
1859  // the result (each iteration discards the LSB). If we want to convert it
1860  // to a left-shifting loop, the working data type must be at least as
1861  // wide as the target's pmpy instruction.
1862  if (!promoteTypes(LoopB, ExitB))
1863  return false;
1864  // Run post-promotion simplifications.
1865  Simplifier PostSimp;
1866  setupPostSimplifier(PostSimp);
1867  for (Instruction &In : *LoopB) {
1868  SelectInst *SI = dyn_cast<SelectInst>(&In);
1869  if (!SI || !FeedsPHI(SI))
1870  continue;
1872  Value *T = PostSimp.simplify(C);
1873  SelectInst *SelI = dyn_cast_or_null<SelectInst>(T);
1874  if (SelI != SI) {
1875  Value *NewSel = C.materialize(LoopB, SI->getIterator());
1876  SI->replaceAllUsesWith(NewSel);
1878  }
1879  break;
1880  }
1881 
1882  if (!convertShiftsToLeft(LoopB, ExitB, IterCount))
1883  return false;
1884  cleanupLoopBody(LoopB);
1885  }
1886 
1887  // Scan the loop again, find the generating select instruction.
1888  bool FoundScan = false;
1889  for (Instruction &In : *LoopB) {
1890  SelectInst *SelI = dyn_cast<SelectInst>(&In);
1891  if (!SelI)
1892  continue;
1893  LLVM_DEBUG(dbgs() << "scanSelect: " << *SelI << '\n');
1894  FoundScan = scanSelect(SelI, LoopB, EntryB, CIV, PV, false);
1895  if (FoundScan)
1896  break;
1897  }
1898  assert(FoundScan);
1899 
1900  LLVM_DEBUG({
1901  StringRef PP = (PV.M ? "(P+M)" : "P");
1902  if (!PV.Inv)
1903  dbgs() << "Found pmpy idiom: R = " << PP << ".Q\n";
1904  else
1905  dbgs() << "Found inverse pmpy idiom: R = (" << PP << "/Q).Q) + "
1906  << PP << "\n";
1907  dbgs() << " Res:" << *PV.Res << "\n P:" << *PV.P << "\n";
1908  if (PV.M)
1909  dbgs() << " M:" << *PV.M << "\n";
1910  dbgs() << " Q:" << *PV.Q << "\n";
1911  dbgs() << " Iteration count:" << PV.IterCount << "\n";
1912  });
1913 
1914  BasicBlock::iterator At(EntryB->getTerminator());
1915  Value *PM = generate(At, PV);
1916  if (PM == nullptr)
1917  return false;
1918 
1919  if (PM->getType() != PV.Res->getType())
1920  PM = IRBuilder<>(&*At).CreateIntCast(PM, PV.Res->getType(), false);
1921 
1922  PV.Res->replaceAllUsesWith(PM);
1923  PV.Res->eraseFromParent();
1924  return true;
1925 }
1926 
1927 int HexagonLoopIdiomRecognize::getSCEVStride(const SCEVAddRecExpr *S) {
1928  if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(S->getOperand(1)))
1929  return SC->getAPInt().getSExtValue();
1930  return 0;
1931 }
1932 
1933 bool HexagonLoopIdiomRecognize::isLegalStore(Loop *CurLoop, StoreInst *SI) {
1934  // Allow volatile stores if HexagonVolatileMemcpy is enabled.
1935  if (!(SI->isVolatile() && HexagonVolatileMemcpy) && !SI->isSimple())
1936  return false;
1937 
1938  Value *StoredVal = SI->getValueOperand();
1939  Value *StorePtr = SI->getPointerOperand();
1940 
1941  // Reject stores that are so large that they overflow an unsigned.
1942  uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
1943  if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
1944  return false;
1945 
1946  // See if the pointer expression is an AddRec like {base,+,1} on the current
1947  // loop, which indicates a strided store. If we have something else, it's a
1948  // random store we can't handle.
1949  auto *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
1950  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
1951  return false;
1952 
1953  // Check to see if the stride matches the size of the store. If so, then we
1954  // know that every byte is touched in the loop.
1955  int Stride = getSCEVStride(StoreEv);
1956  if (Stride == 0)
1957  return false;
1958  unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
1959  if (StoreSize != unsigned(std::abs(Stride)))
1960  return false;
1961 
1962  // The store must be feeding a non-volatile load.
1963  LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
1964  if (!LI || !LI->isSimple())
1965  return false;
1966 
1967  // See if the pointer expression is an AddRec like {base,+,1} on the current
1968  // loop, which indicates a strided load. If we have something else, it's a
1969  // random load we can't handle.
1970  Value *LoadPtr = LI->getPointerOperand();
1971  auto *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));
1972  if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
1973  return false;
1974 
1975  // The store and load must share the same stride.
1976  if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
1977  return false;
1978 
1979  // Success. This store can be converted into a memcpy.
1980  return true;
1981 }
1982 
1983 /// mayLoopAccessLocation - Return true if the specified loop might access the
1984 /// specified pointer location, which is a loop-strided access. The 'Access'
1985 /// argument specifies what the verboten forms of access are (read or write).
1986 static bool
1988  const SCEV *BECount, unsigned StoreSize,
1989  AliasAnalysis &AA,
1990  SmallPtrSetImpl<Instruction *> &Ignored) {
1991  // Get the location that may be stored across the loop. Since the access
1992  // is strided positively through memory, we say that the modified location
1993  // starts at the pointer and has infinite size.
1995 
1996  // If the loop iterates a fixed number of times, we can refine the access
1997  // size to be exactly the size of the memset, which is (BECount+1)*StoreSize
1998  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
1999  AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
2000  StoreSize);
2001 
2002  // TODO: For this to be really effective, we have to dive into the pointer
2003  // operand in the store. Store to &A[i] of 100 will always return may alias
2004  // with store of &A[100], we need to StoreLoc to be "A" with size of 100,
2005  // which will then no-alias a store to &A[100].
2006  MemoryLocation StoreLoc(Ptr, AccessSize);
2007 
2008  for (auto *B : L->blocks())
2009  for (auto &I : *B)
2010  if (Ignored.count(&I) == 0 &&
2011  isModOrRefSet(
2012  intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
2013  return true;
2014 
2015  return false;
2016 }
2017 
2018 void HexagonLoopIdiomRecognize::collectStores(Loop *CurLoop, BasicBlock *BB,
2019  SmallVectorImpl<StoreInst*> &Stores) {
2020  Stores.clear();
2021  for (Instruction &I : *BB)
2022  if (StoreInst *SI = dyn_cast<StoreInst>(&I))
2023  if (isLegalStore(CurLoop, SI))
2024  Stores.push_back(SI);
2025 }
2026 
2027 bool HexagonLoopIdiomRecognize::processCopyingStore(Loop *CurLoop,
2028  StoreInst *SI, const SCEV *BECount) {
2029  assert((SI->isSimple() || (SI->isVolatile() && HexagonVolatileMemcpy)) &&
2030  "Expected only non-volatile stores, or Hexagon-specific memcpy"
2031  "to volatile destination.");
2032 
2033  Value *StorePtr = SI->getPointerOperand();
2034  auto *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
2035  unsigned Stride = getSCEVStride(StoreEv);
2036  unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
2037  if (Stride != StoreSize)
2038  return false;
2039 
2040  // See if the pointer expression is an AddRec like {base,+,1} on the current
2041  // loop, which indicates a strided load. If we have something else, it's a
2042  // random load we can't handle.
2043  auto *LI = cast<LoadInst>(SI->getValueOperand());
2044  auto *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
2045 
2046  // The trip count of the loop and the base pointer of the addrec SCEV is
2047  // guaranteed to be loop invariant, which means that it should dominate the
2048  // header. This allows us to insert code for it in the preheader.
2049  BasicBlock *Preheader = CurLoop->getLoopPreheader();
2050  Instruction *ExpPt = Preheader->getTerminator();
2051  IRBuilder<> Builder(ExpPt);
2052  SCEVExpander Expander(*SE, *DL, "hexagon-loop-idiom");
2053 
2054  Type *IntPtrTy = Builder.getIntPtrTy(*DL, SI->getPointerAddressSpace());
2055 
2056  // Okay, we have a strided store "p[i]" of a loaded value. We can turn
2057  // this into a memcpy/memmove in the loop preheader now if we want. However,
2058  // this would be unsafe to do if there is anything else in the loop that may
2059  // read or write the memory region we're storing to. For memcpy, this
2060  // includes the load that feeds the stores. Check for an alias by generating
2061  // the base address and checking everything.
2062  Value *StoreBasePtr = Expander.expandCodeFor(StoreEv->getStart(),
2063  Builder.getInt8PtrTy(SI->getPointerAddressSpace()), ExpPt);
2064  Value *LoadBasePtr = nullptr;
2065 
2066  bool Overlap = false;
2067  bool DestVolatile = SI->isVolatile();
2068  Type *BECountTy = BECount->getType();
2069 
2070  if (DestVolatile) {
2071  // The trip count must fit in i32, since it is the type of the "num_words"
2072  // argument to hexagon_memcpy_forward_vp4cp4n2.
2073  if (StoreSize != 4 || DL->getTypeSizeInBits(BECountTy) > 32) {
2074 CleanupAndExit:
2075  // If we generated new code for the base pointer, clean up.
2076  Expander.clear();
2077  if (StoreBasePtr && (LoadBasePtr != StoreBasePtr)) {
2078  RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
2079  StoreBasePtr = nullptr;
2080  }
2081  if (LoadBasePtr) {
2083  LoadBasePtr = nullptr;
2084  }
2085  return false;
2086  }
2087  }
2088 
2090  Ignore1.insert(SI);
2091  if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
2092  StoreSize, *AA, Ignore1)) {
2093  // Check if the load is the offending instruction.
2094  Ignore1.insert(LI);
2095  if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop,
2096  BECount, StoreSize, *AA, Ignore1)) {
2097  // Still bad. Nothing we can do.
2098  goto CleanupAndExit;
2099  }
2100  // It worked with the load ignored.
2101  Overlap = true;
2102  }
2103 
2104  if (!Overlap) {
2105  if (DisableMemcpyIdiom || !HasMemcpy)
2106  goto CleanupAndExit;
2107  } else {
2108  // Don't generate memmove if this function will be inlined. This is
2109  // because the caller will undergo this transformation after inlining.
2110  Function *Func = CurLoop->getHeader()->getParent();
2111  if (Func->hasFnAttribute(Attribute::AlwaysInline))
2112  goto CleanupAndExit;
2113 
2114  // In case of a memmove, the call to memmove will be executed instead
2115  // of the loop, so we need to make sure that there is nothing else in
2116  // the loop than the load, store and instructions that these two depend
2117  // on.
2119  Insts.push_back(SI);
2120  Insts.push_back(LI);
2121  if (!coverLoop(CurLoop, Insts))
2122  goto CleanupAndExit;
2123 
2124  if (DisableMemmoveIdiom || !HasMemmove)
2125  goto CleanupAndExit;
2126  bool IsNested = CurLoop->getParentLoop() != nullptr;
2127  if (IsNested && OnlyNonNestedMemmove)
2128  goto CleanupAndExit;
2129  }
2130 
2131  // For a memcpy, we have to make sure that the input array is not being
2132  // mutated by the loop.
2133  LoadBasePtr = Expander.expandCodeFor(LoadEv->getStart(),
2134  Builder.getInt8PtrTy(LI->getPointerAddressSpace()), ExpPt);
2135 
2137  Ignore2.insert(SI);
2138  if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
2139  StoreSize, *AA, Ignore2))
2140  goto CleanupAndExit;
2141 
2142  // Check the stride.
2143  bool StridePos = getSCEVStride(LoadEv) >= 0;
2144 
2145  // Currently, the volatile memcpy only emulates traversing memory forward.
2146  if (!StridePos && DestVolatile)
2147  goto CleanupAndExit;
2148 
2149  bool RuntimeCheck = (Overlap || DestVolatile);
2150 
2151  BasicBlock *ExitB;
2152  if (RuntimeCheck) {
2153  // The runtime check needs a single exit block.
2154  SmallVector<BasicBlock*, 8> ExitBlocks;
2155  CurLoop->getUniqueExitBlocks(ExitBlocks);
2156  if (ExitBlocks.size() != 1)
2157  goto CleanupAndExit;
2158  ExitB = ExitBlocks[0];
2159  }
2160 
2161  // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
2162  // pointer size if it isn't already.
2163  LLVMContext &Ctx = SI->getContext();
2164  BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy);
2165  DebugLoc DLoc = SI->getDebugLoc();
2166 
2167  const SCEV *NumBytesS =
2168  SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW);
2169  if (StoreSize != 1)
2170  NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize),
2171  SCEV::FlagNUW);
2172  Value *NumBytes = Expander.expandCodeFor(NumBytesS, IntPtrTy, ExpPt);
2173  if (Instruction *In = dyn_cast<Instruction>(NumBytes))
2174  if (Value *Simp = SimplifyInstruction(In, {*DL, TLI, DT}))
2175  NumBytes = Simp;
2176 
2177  CallInst *NewCall;
2178 
2179  if (RuntimeCheck) {
2181  if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes)) {
2182  uint64_t C = CI->getZExtValue();
2183  if (Threshold != 0 && C < Threshold)
2184  goto CleanupAndExit;
2186  goto CleanupAndExit;
2187  }
2188 
2189  BasicBlock *Header = CurLoop->getHeader();
2190  Function *Func = Header->getParent();
2191  Loop *ParentL = LF->getLoopFor(Preheader);
2192  StringRef HeaderName = Header->getName();
2193 
2194  // Create a new (empty) preheader, and update the PHI nodes in the
2195  // header to use the new preheader.
2196  BasicBlock *NewPreheader = BasicBlock::Create(Ctx, HeaderName+".rtli.ph",
2197  Func, Header);
2198  if (ParentL)
2199  ParentL->addBasicBlockToLoop(NewPreheader, *LF);
2200  IRBuilder<>(NewPreheader).CreateBr(Header);
2201  for (auto &In : *Header) {
2202  PHINode *PN = dyn_cast<PHINode>(&In);
2203  if (!PN)
2204  break;
2205  int bx = PN->getBasicBlockIndex(Preheader);
2206  if (bx >= 0)
2207  PN->setIncomingBlock(bx, NewPreheader);
2208  }
2209  DT->addNewBlock(NewPreheader, Preheader);
2210  DT->changeImmediateDominator(Header, NewPreheader);
2211 
2212  // Check for safe conditions to execute memmove.
2213  // If stride is positive, copying things from higher to lower addresses
2214  // is equivalent to memmove. For negative stride, it's the other way
2215  // around. Copying forward in memory with positive stride may not be
2216  // same as memmove since we may be copying values that we just stored
2217  // in some previous iteration.
2218  Value *LA = Builder.CreatePtrToInt(LoadBasePtr, IntPtrTy);
2219  Value *SA = Builder.CreatePtrToInt(StoreBasePtr, IntPtrTy);
2220  Value *LowA = StridePos ? SA : LA;
2221  Value *HighA = StridePos ? LA : SA;
2222  Value *CmpA = Builder.CreateICmpULT(LowA, HighA);
2223  Value *Cond = CmpA;
2224 
2225  // Check for distance between pointers. Since the case LowA < HighA
2226  // is checked for above, assume LowA >= HighA.
2227  Value *Dist = Builder.CreateSub(LowA, HighA);
2228  Value *CmpD = Builder.CreateICmpSLE(NumBytes, Dist);
2229  Value *CmpEither = Builder.CreateOr(Cond, CmpD);
2230  Cond = CmpEither;
2231 
2232  if (Threshold != 0) {
2233  Type *Ty = NumBytes->getType();
2234  Value *Thr = ConstantInt::get(Ty, Threshold);
2235  Value *CmpB = Builder.CreateICmpULT(Thr, NumBytes);
2236  Value *CmpBoth = Builder.CreateAnd(Cond, CmpB);
2237  Cond = CmpBoth;
2238  }
2239  BasicBlock *MemmoveB = BasicBlock::Create(Ctx, Header->getName()+".rtli",
2240  Func, NewPreheader);
2241  if (ParentL)
2242  ParentL->addBasicBlockToLoop(MemmoveB, *LF);
2243  Instruction *OldT = Preheader->getTerminator();
2244  Builder.CreateCondBr(Cond, MemmoveB, NewPreheader);
2245  OldT->eraseFromParent();
2246  Preheader->setName(Preheader->getName()+".old");
2247  DT->addNewBlock(MemmoveB, Preheader);
2248  // Find the new immediate dominator of the exit block.
2249  BasicBlock *ExitD = Preheader;
2250  for (auto PI = pred_begin(ExitB), PE = pred_end(ExitB); PI != PE; ++PI) {
2251  BasicBlock *PB = *PI;
2252  ExitD = DT->findNearestCommonDominator(ExitD, PB);
2253  if (!ExitD)
2254  break;
2255  }
2256  // If the prior immediate dominator of ExitB was dominated by the
2257  // old preheader, then the old preheader becomes the new immediate
2258  // dominator. Otherwise don't change anything (because the newly
2259  // added blocks are dominated by the old preheader).
2260  if (ExitD && DT->dominates(Preheader, ExitD)) {
2261  DomTreeNode *BN = DT->getNode(ExitB);
2262  DomTreeNode *DN = DT->getNode(ExitD);
2263  BN->setIDom(DN);
2264  }
2265 
2266  // Add a call to memmove to the conditional block.
2267  IRBuilder<> CondBuilder(MemmoveB);
2268  CondBuilder.CreateBr(ExitB);
2269  CondBuilder.SetInsertPoint(MemmoveB->getTerminator());
2270 
2271  if (DestVolatile) {
2272  Type *Int32Ty = Type::getInt32Ty(Ctx);
2273  Type *Int32PtrTy = Type::getInt32PtrTy(Ctx);
2274  Type *VoidTy = Type::getVoidTy(Ctx);
2275  Module *M = Func->getParent();
2276  FunctionCallee Fn = M->getOrInsertFunction(
2277  HexagonVolatileMemcpyName, VoidTy, Int32PtrTy, Int32PtrTy, Int32Ty);
2278 
2279  const SCEV *OneS = SE->getConstant(Int32Ty, 1);
2280  const SCEV *BECount32 = SE->getTruncateOrZeroExtend(BECount, Int32Ty);
2281  const SCEV *NumWordsS = SE->getAddExpr(BECount32, OneS, SCEV::FlagNUW);
2282  Value *NumWords = Expander.expandCodeFor(NumWordsS, Int32Ty,
2283  MemmoveB->getTerminator());
2284  if (Instruction *In = dyn_cast<Instruction>(NumWords))
2285  if (Value *Simp = SimplifyInstruction(In, {*DL, TLI, DT}))
2286  NumWords = Simp;
2287 
2288  Value *Op0 = (StoreBasePtr->getType() == Int32PtrTy)
2289  ? StoreBasePtr
2290  : CondBuilder.CreateBitCast(StoreBasePtr, Int32PtrTy);
2291  Value *Op1 = (LoadBasePtr->getType() == Int32PtrTy)
2292  ? LoadBasePtr
2293  : CondBuilder.CreateBitCast(LoadBasePtr, Int32PtrTy);
2294  NewCall = CondBuilder.CreateCall(Fn, {Op0, Op1, NumWords});
2295  } else {
2296  NewCall = CondBuilder.CreateMemMove(
2297  StoreBasePtr, SI->getAlign(), LoadBasePtr, LI->getAlign(), NumBytes);
2298  }
2299  } else {
2300  NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlign(), LoadBasePtr,
2301  LI->getAlign(), NumBytes);
2302  // Okay, the memcpy has been formed. Zap the original store and
2303  // anything that feeds into it.
2305  }
2306 
2307  NewCall->setDebugLoc(DLoc);
2308 
2309  LLVM_DEBUG(dbgs() << " Formed " << (Overlap ? "memmove: " : "memcpy: ")
2310  << *NewCall << "\n"
2311  << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
2312  << " from store ptr=" << *StoreEv << " at: " << *SI
2313  << "\n");
2314 
2315  return true;
2316 }
2317 
2318 // Check if the instructions in Insts, together with their dependencies
2319 // cover the loop in the sense that the loop could be safely eliminated once
2320 // the instructions in Insts are removed.
2321 bool HexagonLoopIdiomRecognize::coverLoop(Loop *L,
2322  SmallVectorImpl<Instruction*> &Insts) const {
2323  SmallSet<BasicBlock*,8> LoopBlocks;
2324  for (auto *B : L->blocks())
2325  LoopBlocks.insert(B);
2326 
2327  SetVector<Instruction*> Worklist(Insts.begin(), Insts.end());
2328 
2329  // Collect all instructions from the loop that the instructions in Insts
2330  // depend on (plus their dependencies, etc.). These instructions will
2331  // constitute the expression trees that feed those in Insts, but the trees
2332  // will be limited only to instructions contained in the loop.
2333  for (unsigned i = 0; i < Worklist.size(); ++i) {
2334  Instruction *In = Worklist[i];
2335  for (auto I = In->op_begin(), E = In->op_end(); I != E; ++I) {
2336  Instruction *OpI = dyn_cast<Instruction>(I);
2337  if (!OpI)
2338  continue;
2339  BasicBlock *PB = OpI->getParent();
2340  if (!LoopBlocks.count(PB))
2341  continue;
2342  Worklist.insert(OpI);
2343  }
2344  }
2345 
2346  // Scan all instructions in the loop, if any of them have a user outside
2347  // of the loop, or outside of the expressions collected above, then either
2348  // the loop has a side-effect visible outside of it, or there are
2349  // instructions in it that are not involved in the original set Insts.
2350  for (auto *B : L->blocks()) {
2351  for (auto &In : *B) {
2352  if (isa<BranchInst>(In) || isa<DbgInfoIntrinsic>(In))
2353  continue;
2354  if (!Worklist.count(&In) && In.mayHaveSideEffects())
2355  return false;
2356  for (auto K : In.users()) {
2357  Instruction *UseI = dyn_cast<Instruction>(K);
2358  if (!UseI)
2359  continue;
2360  BasicBlock *UseB = UseI->getParent();
2361  if (LF->getLoopFor(UseB) != L)
2362  return false;
2363  }
2364  }
2365  }
2366 
2367  return true;
2368 }
2369 
2370 /// runOnLoopBlock - Process the specified block, which lives in a counted loop
2371 /// with the specified backedge count. This block is known to be in the current
2372 /// loop and not in any subloops.
2373 bool HexagonLoopIdiomRecognize::runOnLoopBlock(Loop *CurLoop, BasicBlock *BB,
2374  const SCEV *BECount, SmallVectorImpl<BasicBlock*> &ExitBlocks) {
2375  // We can only promote stores in this block if they are unconditionally
2376  // executed in the loop. For a block to be unconditionally executed, it has
2377  // to dominate all the exit blocks of the loop. Verify this now.
2378  auto DominatedByBB = [this,BB] (BasicBlock *EB) -> bool {
2379  return DT->dominates(BB, EB);
2380  };
2381  if (!all_of(ExitBlocks, DominatedByBB))
2382  return false;
2383 
2384  bool MadeChange = false;
2385  // Look for store instructions, which may be optimized to memset/memcpy.
2387  collectStores(CurLoop, BB, Stores);
2388 
2389  // Optimize the store into a memcpy, if it feeds an similarly strided load.
2390  for (auto &SI : Stores)
2391  MadeChange |= processCopyingStore(CurLoop, SI, BECount);
2392 
2393  return MadeChange;
2394 }
2395 
2396 bool HexagonLoopIdiomRecognize::runOnCountableLoop(Loop *L) {
2397  PolynomialMultiplyRecognize PMR(L, *DL, *DT, *TLI, *SE);
2398  if (PMR.recognize())
2399  return true;
2400 
2401  if (!HasMemcpy && !HasMemmove)
2402  return false;
2403 
2404  const SCEV *BECount = SE->getBackedgeTakenCount(L);
2405  assert(!isa<SCEVCouldNotCompute>(BECount) &&
2406  "runOnCountableLoop() called on a loop without a predictable"
2407  "backedge-taken count");
2408 
2409  SmallVector<BasicBlock *, 8> ExitBlocks;
2410  L->getUniqueExitBlocks(ExitBlocks);
2411 
2412  bool Changed = false;
2413 
2414  // Scan all the blocks in the loop that are not in subloops.
2415  for (auto *BB : L->getBlocks()) {
2416  // Ignore blocks in subloops.
2417  if (LF->getLoopFor(BB) != L)
2418  continue;
2419  Changed |= runOnLoopBlock(L, BB, BECount, ExitBlocks);
2420  }
2421 
2422  return Changed;
2423 }
2424 
2425 bool HexagonLoopIdiomRecognize::run(Loop *L) {
2426  const Module &M = *L->getHeader()->getParent()->getParent();
2427  if (Triple(M.getTargetTriple()).getArch() != Triple::hexagon)
2428  return false;
2429 
2430  // If the loop could not be converted to canonical form, it must have an
2431  // indirectbr in it, just give up.
2432  if (!L->getLoopPreheader())
2433  return false;
2434 
2435  // Disable loop idiom recognition if the function's name is a common idiom.
2436  StringRef Name = L->getHeader()->getParent()->getName();
2437  if (Name == "memset" || Name == "memcpy" || Name == "memmove")
2438  return false;
2439 
2440  DL = &L->getHeader()->getModule()->getDataLayout();
2441 
2442  HasMemcpy = TLI->has(LibFunc_memcpy);
2443  HasMemmove = TLI->has(LibFunc_memmove);
2444 
2445  if (SE->hasLoopInvariantBackedgeTakenCount(L))
2446  return runOnCountableLoop(L);
2447  return false;
2448 }
2449 
2450 bool HexagonLoopIdiomRecognizeLegacyPass::runOnLoop(Loop *L,
2451  LPPassManager &LPM) {
2452  if (skipLoop(L))
2453  return false;
2454 
2455  auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2456  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2457  auto *LF = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2458  auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
2459  *L->getHeader()->getParent());
2460  auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2461  return HexagonLoopIdiomRecognize(AA, DT, LF, TLI, SE).run(L);
2462 }
2463 
2465  return new HexagonLoopIdiomRecognizeLegacyPass();
2466 }
2467 
2471  LPMUpdater &U) {
2472  return HexagonLoopIdiomRecognize(&AR.AA, &AR.DT, &AR.LI, &AR.TLI, &AR.SE)
2473  .run(&L)
2476 }
i
i
Definition: README.txt:29
use
Move duplicate certain instructions close to their use
Definition: Localizer.cpp:31
llvm::PreservedAnalyses
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:155
Int32Ty
IntegerType * Int32Ty
Definition: NVVMIntrRange.cpp:67
llvm::RecursivelyDeleteTriviallyDeadInstructions
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:511
llvm::IRBuilderBase::CreateIntCast
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2186
LLVM_ATTRIBUTE_USED
#define LLVM_ATTRIBUTE_USED
Definition: Compiler.h:144
llvm::BasicBlock::end
iterator end()
Definition: BasicBlock.h:298
llvm
This file implements support for optimizing divisions by a constant.
Definition: AllocatorList.h:23
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::AArch64CC::AL
@ AL
Definition: AArch64BaseInfo.h:269
llvm::createHexagonLoopIdiomPass
Pass * createHexagonLoopIdiomPass()
Definition: HexagonLoopIdiomRecognition.cpp:2464
llvm::CmpInst::ICMP_EQ
@ ICMP_EQ
equal
Definition: InstrTypes.h:741
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:113
llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
llvm::LocationSize::afterPointer
constexpr static LocationSize afterPointer()
Any location after the base pointer (but still within the underlying object).
Definition: MemoryLocation.h:123
print
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
Definition: ArchiveWriter.cpp:147
llvm::Intrinsic::getDeclaration
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1379
llvm::CmpInst::Predicate
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:720
llvm::User::operands
op_range operands()
Definition: User.h:242
llvm::BasicBlock::iterator
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:90
llvm::ConstantInt::getType
IntegerType * getType() const
getType - Specialize the getType() method to always return an IntegerType, which reduces the amount o...
Definition: Constants.h:173
llvm::isModOrRefSet
LLVM_NODISCARD bool isModOrRefSet(const ModRefInfo MRI)
Definition: AliasAnalysis.h:190
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:107
IntrinsicInst.h
ScalarEvolutionExpander.h
simplify
hexagon bit simplify
Definition: HexagonBitSimplify.cpp:261
Scalar.h
T
llvm::Function
Definition: Function.h:62
llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:530
StringRef.h
P
This currently compiles esp xmm0 movsd esp eax eax esp ret We should use not the dag combiner This is because dagcombine2 needs to be able to see through the X86ISD::Wrapper which DAGCombine can t really do The code for turning x load into a single vector load is target independent and should be moved to the dag combiner The code for turning x load into a vector load can only handle a direct load from a global or a direct load from the stack It should be generalized to handle any load from P
Definition: README-SSE.txt:411
Pass.h
llvm::PatternMatch::m_LShr
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1147
llvm::SCEVExpander
This class uses information about analyze scalars to rewrite expressions in canonical form.
Definition: ScalarEvolutionExpander.h:65
C1
instcombine should handle this C2 when C1
Definition: README.txt:263
llvm::IRBuilderBase::CreateXor
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1414
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1168
llvm::RISCVFenceField::W
@ W
Definition: RISCVBaseInfo.h:199
llvm::User::dropAllReferences
void dropAllReferences()
Drop all references to operands.
Definition: User.h:299
RuntimeMemSizeThreshold
static cl::opt< unsigned > RuntimeMemSizeThreshold("runtime-mem-idiom-threshold", cl::Hidden, cl::init(0), cl::desc("Threshold (in bytes) for the runtime " "check guarding the memmove."))
ErrorHandling.h
llvm::IRBuilder<>
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:460
llvm::CmpInst::ICMP_NE
@ ICMP_NE
not equal
Definition: InstrTypes.h:742
ValueTracking.h
Local.h
llvm::IRBuilderBase::CreateBr
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:989
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:151
llvm::Triple
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:45
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:143
cleanup
static void cleanup(BlockFrequencyInfoImplBase &BFI)
Clear all memory not needed downstream.
Definition: BlockFrequencyInfoImpl.cpp:304
APInt.h
ScalarEvolution.h
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
DenseMap.h
Module.h
HexagonVolatileMemcpy
static cl::opt< bool > HexagonVolatileMemcpy("disable-hexagon-volatile-memcpy", cl::Hidden, cl::init(false), cl::desc("Enable Hexagon-specific memcpy for volatile destination."))
llvm::LoopInfoWrapperPass
The legacy pass manager's analysis pass to compute loop information.
Definition: LoopInfo.h:1268
llvm::LoopStandardAnalysisResults
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
Definition: LoopAnalysisManager.h:52
llvm::SmallSet
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:134
llvm::Triple::hexagon
@ hexagon
Definition: Triple.h:60
llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:449
llvm::SelectInst::getFalseValue
const Value * getFalseValue() const
Definition: Instructions.h:1787
llvm::Value::user_begin
user_iterator user_begin()
Definition: Value.h:397
idiom
hexagon loop idiom
Definition: HexagonLoopIdiomRecognition.cpp:285
llvm::BasicBlock::getSinglePredecessor
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:268
initialize
static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, ArrayRef< StringLiteral > StandardNames)
Initialize the set of available library functions based on the specified target triple.
Definition: TargetLibraryInfo.cpp:116
llvm::LoadInst::getPointerOperand
Value * getPointerOperand()
Definition: Instructions.h:267
llvm::LoopStandardAnalysisResults::DT
DominatorTree & DT
Definition: LoopAnalysisManager.h:55
replace
static void replace(Module &M, GlobalVariable *Old, GlobalVariable *New)
Definition: ConstantMerge.cpp:116
llvm::LoadInst::getAlign
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:223
llvm::Type::getInt32PtrTy
static PointerType * getInt32PtrTy(LLVMContext &C, unsigned AS=0)
Definition: Type.cpp:301
llvm::BitmaskEnumDetail::Mask
std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
llvm::Type::getInt32Ty
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:241
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
F
#define F(x, y, z)
Definition: MD5.cpp:56
llvm::RISCVFenceField::R
@ R
Definition: RISCVBaseInfo.h:198
KnownBits.h
loop
Analysis the ScalarEvolution expression for r is< loop > Outside the loop
Definition: README.txt:8
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
llvm::lltok::equal
@ equal
Definition: LLToken.h:25
LoopAnalysisManager.h
AliasAnalysis.h
Context
LLVMContext & Context
Definition: NVVMIntrRange.cpp:66
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
llvm::AlignStyle::Left
@ Left
Instruction.h
CommandLine.h
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
idioms
hexagon loop Recognize Hexagon specific loop idioms
Definition: HexagonLoopIdiomRecognition.cpp:286
llvm::LoopBase::getParentLoop
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
Definition: LoopInfo.h:113
hasZeroSignBit
static bool hasZeroSignBit(const Value *V)
Definition: HexagonLoopIdiomRecognition.cpp:1583
llvm::Instruction::getOpcode
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:160
llvm::all_of
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1551
llvm::SelectInst::getCondition
const Value * getCondition() const
Definition: Instructions.h:1785
llvm::RecurKind::And
@ And
Bitwise or logical AND of integers.
llvm::PassRegistry::getPassRegistry
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Definition: PassRegistry.cpp:31
llvm::PPCISD::SC
@ SC
CHAIN = SC CHAIN, Imm128 - System call.
Definition: PPCISelLowering.h:410
CompileTimeMemSizeThreshold
static cl::opt< unsigned > CompileTimeMemSizeThreshold("compile-time-mem-idiom-threshold", cl::Hidden, cl::init(64), cl::desc("Threshold (in bytes) to perform the transformation, if the " "runtime loop count (mem transfer size) is known at compile-time."))
Constants.h
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
isZero
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition: Lint.cpp:519
llvm::AAResults
Definition: AliasAnalysis.h:508
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
llvm::HexagonLoopIdiomRecognitionPass::run
PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
Definition: HexagonLoopIdiomRecognition.cpp:2469
llvm::LocationSize
Definition: MemoryLocation.h:65
Intrinsics.h
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
llvm::ARM_PROC::A
@ A
Definition: ARMBaseInfo.h:34
InstrTypes.h
llvm::BasicBlock::begin
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:296
Y
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
getOpcode
static Optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:199
llvm::replace
void replace(Container &Cont, typename Container::iterator ContIt, typename Container::iterator ContEnd, RandomAccessIterator ValIt, RandomAccessIterator ValEnd)
Given a sequence container Cont, replace the range [ContIt, ContEnd) with the range [ValIt,...
Definition: STLExtras.h:1755
llvm::LoopBase::blocks
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:178
TargetLibraryInfo.h
llvm::BasicBlock::getFirstInsertionPt
const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
Definition: BasicBlock.cpp:253
false
Definition: StackSlotColoring.cpp:142
llvm::PatternMatch::m_ZExt
CastClass_match< OpTy, Instruction::ZExt > m_ZExt(const OpTy &Op)
Matches ZExt.
Definition: PatternMatch.h:1639
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
First
into llvm powi allowing the code generator to produce balanced multiplication trees First
Definition: README.txt:54
llvm::IntegerType
Class to represent integer types.
Definition: DerivedTypes.h:40
llvm::LoopBase::getBlocks
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:171
llvm::BinaryOperator::getOpcode
BinaryOps getOpcode() const
Definition: InstrTypes.h:393
llvm::Instruction
Definition: Instruction.h:45
llvm::SimplifyInstruction
Value * SimplifyInstruction(Instruction *I, const SimplifyQuery &Q, OptimizationRemarkEmitter *ORE=nullptr)
See if we can compute a simplified version of this instruction.
Definition: InstructionSimplify.cpp:6328
llvm::DominatorTreeWrapperPass
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:287
llvm::PassRegistry
PassRegistry - This class manages the registration and intitialization of the pass subsystem as appli...
Definition: PassRegistry.h:38
llvm::BasicBlock::phis
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:354
llvm::raw_ostream
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:53
llvm::Value::setName
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:376
llvm::operator<<
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
Definition: APFixedPoint.h:230
llvm::LocationSize::precise
static LocationSize precise(uint64_t Value)
Definition: MemoryLocation.h:100
llvm::ConstantInt::get
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:925
llvm::Use::getUser
User * getUser() const
Returns the User that contains this Use.
Definition: Use.h:73
llvm::ScalarEvolutionWrapperPass
Definition: ScalarEvolution.h:2084
DebugLoc.h
llvm::LPPassManager
Definition: LoopPass.h:75
SmallPtrSet.h
llvm::BasicBlock::getModule
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
Definition: BasicBlock.cpp:148
DisableMemcpyIdiom
static cl::opt< bool > DisableMemcpyIdiom("disable-memcpy-idiom", cl::Hidden, cl::init(false), cl::desc("Disable generation of memcpy in loop idiom recognition"))
PatternMatch.h
llvm::BasicBlock::getFirstNonPHI
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:216
llvm::IntegerType::getSignBit
uint64_t getSignBit() const
Return a uint64_t with just the most significant bit set (the sign bit, if the value is treated as a ...
Definition: DerivedTypes.h:82
llvm::Triple::getArch
ArchType getArch() const
getArch - Get the parsed architecture type of this triple.
Definition: Triple.h:310
Utils.h
c
the resulting code requires compare and branches when and if the revised code is with conditional branches instead of More there is a byte word extend before each where there should be only and the condition codes are not remembered when the same two values are compared twice More LSR enhancements i8 and i32 load store addressing modes are identical int int c
Definition: README.txt:418
OnlyNonNestedMemmove
static cl::opt< bool > OnlyNonNestedMemmove("only-nonnested-memmove-idiom", cl::Hidden, cl::init(true), cl::desc("Only enable generating memmove in non-nested loops"))
llvm::Value::use_empty
bool use_empty() const
Definition: Value.h:344
Type.h
llvm::IRBuilderBase::CreateAnd
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1362
X
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
llvm::PatternMatch::m_One
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
Definition: PatternMatch.h:513
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
LoopInfo.h
llvm::IRBuilderBase::CreateZExt
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2025
llvm::CmpInst
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:710
llvm::PatternMatch::m_Xor
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1135
PB
PassBuilder PB(Machine, PassOpts->PTO, None, &PIC)
llvm::tgtok::In
@ In
Definition: TGLexer.h:51
llvm::getLoopPassPreservedAnalyses
PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
Definition: LoopAnalysisManager.cpp:140
BasicBlock.h
llvm::cl::opt< bool >
llvm::LoadInst::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:273
llvm::SCEV
This class represents an analyzed expression in the program.
Definition: ScalarEvolution.h:77
llvm::PatternMatch::m_Zero
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:535
llvm::StoreInst
An instruction for storing to memory.
Definition: Instructions.h:304
llvm::SmallSet::count
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:164
llvm::Instruction::eraseFromParent
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:78
llvm::TargetLibraryInfoWrapperPass
Definition: TargetLibraryInfo.h:465
uint64_t
llvm::LoopPass
Definition: LoopPass.h:27
llvm::equal
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition: STLExtras.h:1710
llvm::GlobalValue::getParent
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:578
llvm::LPMUpdater
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
Definition: LoopPassManager.h:263
llvm::find
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1571
llvm::LoopBase::getUniqueExitBlocks
void getUniqueExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all unique successor blocks of this loop.
Definition: LoopInfoImpl.h:122
llvm::TruncInst
This class represents a truncation of integer types.
Definition: Instructions.h:4753
llvm::PHINode::setIncomingBlock
void setIncomingBlock(unsigned i, BasicBlock *BB)
Definition: Instructions.h:2768
llvm::initializeHexagonLoopIdiomRecognizeLegacyPassPass
void initializeHexagonLoopIdiomRecognizeLegacyPassPass(PassRegistry &)
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
MemoryLocation.h
llvm::DenseMap
Definition: DenseMap.h:714
I
#define I(x, y, z)
Definition: MD5.cpp:59
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:441
llvm::PHINode::getBasicBlockIndex
int getBasicBlockIndex(const BasicBlock *BB) const
Return the first index of the specified basic block in the value list for this PHI.
Definition: Instructions.h:2811
llvm::SelectInst::getTrueValue
const Value * getTrueValue() const
Definition: Instructions.h:1786
llvm::LoopBase::getLoopPreheader
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Definition: LoopInfoImpl.h:167
llvm::PatternMatch::m_And
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1123
llvm::Instruction::setDebugLoc
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:367
llvm::SCEVConstant
This class represents a constant integer value.
Definition: ScalarEvolutionExpressions.h:47
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(HexagonLoopIdiomRecognizeLegacyPass, "hexagon-loop-idiom", "Recognize Hexagon-specific loop idioms", false, false) INITIALIZE_PASS_END(HexagonLoopIdiomRecognizeLegacyPass
llvm::computeKnownBits
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, OptimizationRemarkEmitter *ORE=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
Definition: ValueTracking.cpp:213
llvm::ModRefInfo
ModRefInfo
Flags indicating whether a memory access modifies or references memory.
Definition: AliasAnalysis.h:149
IRBuilder.h
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
std::swap
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:840
SI
StandardInstrumentations SI(Debug, VerifyEach)
llvm::SelectInst
This class represents the LLVM 'select' instruction.
Definition: Instructions.h:1738
function
print Print MemDeps of function
Definition: MemDepPrinter.cpp:83
llvm::Type::isVoidTy
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:138
llvm::Record
Definition: Record.h:1472
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
llvm::SmallPtrSetImpl::count
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:382
Builder
assume Assume Builder
Definition: AssumeBundleBuilder.cpp:650
llvm::PatternMatch::m_Value
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
llvm::ZExtInst
This class represents zero extension of integer types.
Definition: Instructions.h:4792
llvm::User::setOperand
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
Triple.h
llvm::LoopInfo
Definition: LoopInfo.h:1083
llvm::BinaryOperator
Definition: InstrTypes.h:189
mayLoopAccessLocation
static bool mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L, const SCEV *BECount, unsigned StoreSize, AliasAnalysis &AA, SmallPtrSetImpl< Instruction * > &Ignored)
mayLoopAccessLocation - Return true if the specified loop might access the specified pointer location...
Definition: HexagonLoopIdiomRecognition.cpp:1987
QP
So we should use XX3Form_Rcr to implement instrinsic Convert DP QP
Definition: README_P9.txt:329
DataLayout.h
Cond
SmallVector< MachineOperand, 4 > Cond
Definition: BasicBlockSections.cpp:179
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:58
llvm::LoadInst::isSimple
bool isSimple() const
Definition: Instructions.h:259
LoopPass.h
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:134
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
if
if(llvm_vc STREQUAL "") set(fake_version_inc "$
Definition: CMakeLists.txt:14
llvm::AnalysisUsage::addPreserved
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
Definition: PassAnalysisSupport.h:98
llvm::BasicBlock::Create
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:100
getParent
static const Function * getParent(const Value *V)
Definition: BasicAliasAnalysis.cpp:867
uint32_t
Compiler.h
llvm::ModRefInfo::Mod
@ Mod
The access may modify the value stored in memory.
llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition: ilist_node.h:81
DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: AArch64SLSHardening.cpp:76
llvm::SCEV::FlagNUW
@ FlagNUW
Definition: ScalarEvolution.h:135
llvm::LoopStandardAnalysisResults::LI
LoopInfo & LI
Definition: LoopAnalysisManager.h:56
S
add sub stmia L5 ldr r0 bl L_printf $stub Instead of a and a wouldn t it be better to do three moves *Return an aggregate type is even return S
Definition: README.txt:210
llvm::SmallSet::insert
std::pair< NoneType, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:180
llvm::PredIterator
Definition: CFG.h:43
llvm::ifs::IFSSymbolType::Func
@ Func
llvm::DomTreeNodeBase< BasicBlock >
llvm::Value::getName
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:175
replaceAllUsesOfWithIn
static void replaceAllUsesOfWithIn(Value *I, Value *J, BasicBlock *BB)
Definition: HexagonLoopIdiomRecognition.cpp:675
llvm::BasicBlock::getTerminator
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.cpp:152
llvm::IRBuilderBase::CreateTrunc
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2021
llvm::ConstantInt::getZExtValue
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:142
llvm::MCID::Select
@ Select
Definition: MCInstrDesc.h:162
Attributes.h
j
return j(j<< 16)
llvm::LCSSAID
char & LCSSAID
Definition: LCSSA.cpp:485
HexagonLoopIdiomRecognition.h
Constant.h
llvm::DomTreeNodeBase::setIDom
void setIDom(DomTreeNodeBase *NewIDom)
Definition: GenericDomTree.h:123
llvm::LoopSimplifyID
char & LoopSimplifyID
Definition: LoopSimplify.cpp:800
llvm::empty
constexpr bool empty(const T &RangeOrContainer)
Test whether RangeOrContainer is empty. Similar to C++17 std::empty.
Definition: STLExtras.h:254
llvm::GraphProgram::Name
Name
Definition: GraphWriter.h:52
llvm::KnownBits
Definition: KnownBits.h:23
H
#define H(x, y, z)
Definition: MD5.cpp:58
llvm::AMDGPU::SendMsg::Op
Op
Definition: SIDefines.h:324
llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:161
llvm::pred_end
Interval::pred_iterator pred_end(Interval *I)
Definition: Interval.h:112
llvm::SCEVAddRecExpr
This node represents a polynomial recurrence on the trip count of the specified loop.
Definition: ScalarEvolutionExpressions.h:352
Casting.h
Function.h
llvm::LoopBase::getHeader
BlockT * getHeader() const
Definition: LoopInfo.h:104
PassManager.h
llvm::TargetLibraryInfo
Provides information about what library functions are available for the current target.
Definition: TargetLibraryInfo.h:221
llvm::ARCCC::Z
@ Z
Definition: ARCInfo.h:41
transform
instcombine should handle this transform
Definition: README.txt:262
llvm::SmallVectorImpl::clear
void clear()
Definition: SmallVector.h:585
llvm::MCID::Add
@ Add
Definition: MCInstrDesc.h:183
llvm::LoopStandardAnalysisResults::SE
ScalarEvolution & SE
Definition: LoopAnalysisManager.h:57
llvm::LoopStandardAnalysisResults::AA
AAResults & AA
Definition: LoopAnalysisManager.h:53
ScalarEvolutionExpressions.h
llvm::Instruction::BinaryOps
BinaryOps
Definition: Instruction.h:785
llvm::pred_begin
Interval::pred_iterator pred_begin(Interval *I)
pred_begin/pred_end - define methods so that Intervals may be used just like BasicBlocks can with the...
Definition: Interval.h:109
llvm::Type::getVoidTy
static Type * getVoidTy(LLVMContext &C)
Definition: Type.cpp:224
llvm::Pass
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:91
Instructions.h
llvm::User::getNumOperands
unsigned getNumOperands() const
Definition: User.h:191
SmallVector.h
llvm::PatternMatch::m_Specific
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:802
User.h
llvm::LoopBase::addBasicBlockToLoop
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
This method is used by other analyses to update loop information.
Definition: LoopInfoImpl.h:242
llvm::PatternMatch::m_ICmp
CmpClass_match< LHS, RHS, ICmpInst, ICmpInst::Predicate > m_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R)
Definition: PatternMatch.h:1404
Dominators.h
N
#define N
llvm::AAResultsWrapperPass
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
Definition: AliasAnalysis.h:1336
llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:94
InstructionSimplify.h
llvm::Value::deleteValue
void deleteValue()
Delete a pointer to a generic Value.
Definition: Value.cpp:110
llvm::IntegerType::getBitWidth
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
Definition: DerivedTypes.h:72
Users
iv Induction Variable Users
Definition: IVUsers.cpp:52
generate
We currently generate
Definition: README.txt:597
llvm::FunctionCallee
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:165
llvm::codeview::CompileSym3Flags::Exp
@ Exp
llvm::PHINode
Definition: Instructions.h:2633
Threshold
static cl::opt< unsigned > Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"), cl::init(100), cl::Hidden)
llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:43
llvm::Module::getDataLayout
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:401
DerivedTypes.h
llvm::SmallPtrSetImpl< Instruction * >
llvm::SCEV::getType
Type * getType() const
Return the LLVM type of this SCEV expression.
Definition: ScalarEvolution.cpp:379
llvm::AnalysisManager
A container for analyses that lazily runs them and caches their results.
Definition: InstructionSimplify.h:44
llvm::APInt::getLowBitsSet
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:289
llvm::IntegerType::get
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:313
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1475
llvm::AnalysisUsage::addRequiredID
AnalysisUsage & addRequiredID(const void *ID)
Definition: Pass.cpp:267
BB
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM BB
Definition: README.txt:39
SimplifyLimit
static cl::opt< unsigned > SimplifyLimit("hlir-simplify-limit", cl::init(10000), cl::Hidden, cl::desc("Maximum number of simplification steps in HLIR"))
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::DebugLoc
A debug info location.
Definition: DebugLoc.h:33
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
llvm::cl::desc
Definition: CommandLine.h:412
raw_ostream.h
n
The same transformation can work with an even modulo with the addition of a and shrink the compare RHS by the same amount Unless the target supports that transformation probably isn t worthwhile The transformation can also easily be made to work with non zero equality for n
Definition: README.txt:685
HexagonVolatileMemcpyName
static const char * HexagonVolatileMemcpyName
Definition: HexagonLoopIdiomRecognition.cpp:109
llvm::SetVector< Value * >
llvm::intersectModRef
LLVM_NODISCARD ModRefInfo intersectModRef(const ModRefInfo MRI1, const ModRefInfo MRI2)
Definition: AliasAnalysis.h:237
Value.h
llvm::pdb::PDB_SymType::Block
@ Block
llvm::abs
APFloat abs(APFloat X)
Returns the absolute value of the argument.
Definition: APFloat.h:1282
llvm::LoopStandardAnalysisResults::TLI
TargetLibraryInfo & TLI
Definition: LoopAnalysisManager.h:58
InitializePasses.h
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
llvm::PatternMatch::m_Shl
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1141
Debug.h
llvm::Value::users
iterator_range< user_iterator > users()
Definition: Value.h:421
llvm::AAResults::getModRefInfo
ModRefInfo getModRefInfo(const CallBase *Call, const MemoryLocation &Loc)
getModRefInfo (for call sites) - Return information about whether a particular call site modifies or ...
Definition: AliasAnalysis.cpp:218
llvm::ModRefInfo::ModRef
@ ModRef
The access may reference and may modify the value stored in memory.
llvm::MemoryLocation
Representation for a specific memory location.
Definition: MemoryLocation.h:209
llvm::RecurKind::Xor
@ Xor
Bitwise or logical XOR of integers.
SetVector.h
DisableMemmoveIdiom
static cl::opt< bool > DisableMemmoveIdiom("disable-memmove-idiom", cl::Hidden, cl::init(false), cl::desc("Disable generation of memmove in loop idiom recognition"))
llvm::Use
A Use represents the edge between a Value definition and its users.
Definition: Use.h:44
SmallSet.h
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:364
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:37