LLVM 10.0.0svn
ARMParallelDSP.cpp
//===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Armv6 introduced instructions to perform 32-bit SIMD operations. The
/// purpose of this pass is to do some IR pattern matching to create ACLE
/// DSP intrinsics, which map onto these 32-bit SIMD operations.
/// This pass runs only when unaligned accesses are supported/enabled.
//
//===----------------------------------------------------------------------===//
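
// Illustrative sketch of the kind of source this pass targets: a C-level
// dot product such as
//
//   int dot(const short *a, const short *b, unsigned n) {
//     int acc = 0;
//     for (unsigned i = 0; i < n; i++)
//       acc += a[i] * b[i];
//     return acc;
//   }
//
// produces sext/mul/add chains over i16 loads which, once the loop has been
// unrolled (e.g. by the loop unroller), this pass rewrites so that each pair
// of adjacent i16 loads becomes a single i32 load feeding a call to the
// llvm.arm.smlad intrinsic.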

#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/OrderedBasicBlock.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Pass.h"
#include "llvm/PassRegistry.h"
#include "llvm/PassSupport.h"
#include "llvm/Support/Debug.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "ARM.h"
#include "ARMSubtarget.h"

using namespace llvm;
using namespace PatternMatch;

#define DEBUG_TYPE "arm-parallel-dsp"

STATISTIC(NumSMLAD, "Number of smlad instructions generated");

static cl::opt<bool>
DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false),
                   cl::desc("Disable the ARM Parallel DSP pass"));

static cl::opt<unsigned>
NumLoadLimit("arm-parallel-dsp-load-limit", cl::Hidden, cl::init(16),
             cl::desc("Limit the number of loads analysed"));

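// Usage sketch (assuming a standard LLVM build): as cl::opts, these hidden
// flags can be given to llc directly, or forwarded from clang via -mllvm,
// e.g.:
//
//   llc -mtriple=armv7a-linux-gnueabihf -arm-parallel-dsp-load-limit=8 in.ll
//   clang --target=arm-linux-gnueabihf -O2 -mllvm -disable-arm-parallel-dsp ...
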
namespace {
  struct MulCandidate;
  class Reduction;

  using MulCandList = SmallVector<std::unique_ptr<MulCandidate>, 8>;
  using MemInstList = SmallVectorImpl<LoadInst*>;
  using MulPairList = SmallVector<std::pair<MulCandidate*, MulCandidate*>, 8>;

  // 'MulCandidate' holds the multiplication instructions that are candidates
  // for parallel execution.
  struct MulCandidate {
    Instruction *Root;
    Value*      LHS;
    Value*      RHS;
    bool        Exchange = false;
    bool        ReadOnly = true;
    bool        Paired = false;
    SmallVector<LoadInst*, 2> VecLd;    // Container for loads to widen.

    MulCandidate(Instruction *I, Value *lhs, Value *rhs) :
      Root(I), LHS(lhs), RHS(rhs) { }

    bool HasTwoLoadInputs() const {
      return isa<LoadInst>(LHS) && isa<LoadInst>(RHS);
    }

    LoadInst *getBaseLoad() const {
      return VecLd.front();
    }
  };

  /// Represent a sequence of multiply-accumulate operations with the aim to
  /// perform the multiplications in parallel.
  class Reduction {
    Instruction *Root = nullptr;
    Value       *Acc = nullptr;
    MulCandList Muls;
    MulPairList MulPairs;
    SetVector<Instruction*> Adds;

  public:
    Reduction() = delete;

    Reduction(Instruction *Add) : Root(Add) { }

    /// Record an Add instruction that is part of this reduction.
    void InsertAdd(Instruction *I) { Adds.insert(I); }

    /// Create MulCandidates, each rooted at a Mul instruction, that are part
    /// of this reduction.
    void InsertMuls() {
      auto GetMulOperand = [](Value *V) -> Instruction* {
        if (auto *SExt = dyn_cast<SExtInst>(V)) {
          if (auto *I = dyn_cast<Instruction>(SExt->getOperand(0)))
            if (I->getOpcode() == Instruction::Mul)
              return I;
        } else if (auto *I = dyn_cast<Instruction>(V)) {
          if (I->getOpcode() == Instruction::Mul)
            return I;
        }
        return nullptr;
      };

      auto InsertMul = [this](Instruction *I) {
        Value *LHS = cast<Instruction>(I->getOperand(0))->getOperand(0);
        Value *RHS = cast<Instruction>(I->getOperand(1))->getOperand(0);
        Muls.push_back(std::make_unique<MulCandidate>(I, LHS, RHS));
      };

      for (auto *Add : Adds) {
        if (Add == Acc)
          continue;
        if (auto *Mul = GetMulOperand(Add->getOperand(0)))
          InsertMul(Mul);
        if (auto *Mul = GetMulOperand(Add->getOperand(1)))
          InsertMul(Mul);
      }
    }

    /// Add the incoming accumulator value; returns true if a value had not
    /// already been added. Returning false signals to the user that this
    /// reduction already has a value to initialise the accumulator.
    bool InsertAcc(Value *V) {
      if (Acc)
        return false;
      Acc = V;
      return true;
    }

    /// Set two MulCandidates, rooted at muls, that can be executed as a
    /// single parallel operation.
    void AddMulPair(MulCandidate *Mul0, MulCandidate *Mul1,
                    bool Exchange = false) {
      LLVM_DEBUG(dbgs() << "Pairing:\n"
                 << *Mul0->Root << "\n"
                 << *Mul1->Root << "\n");
      Mul0->Paired = true;
      Mul1->Paired = true;
      if (Exchange)
        Mul1->Exchange = true;
      MulPairs.push_back(std::make_pair(Mul0, Mul1));
    }

    /// Return true if enough mul operations are found that can be executed in
    /// parallel.
    bool CreateParallelPairs();

    /// Return the add instruction which is the root of the reduction.
    Instruction *getRoot() { return Root; }

    bool is64Bit() const { return Root->getType()->isIntegerTy(64); }

    Type *getType() const { return Root->getType(); }

    /// Return the incoming value to be accumulated. This may be null.
    Value *getAccumulator() { return Acc; }

    /// Return the set of adds that comprise the reduction.
    SetVector<Instruction*> &getAdds() { return Adds; }

    /// Return the MulCandidates, each rooted at a mul instruction, that
    /// comprise the reduction.
    MulCandList &getMuls() { return Muls; }

    /// Return the MulCandidate pairs that have been marked for parallel
    /// execution.
    MulPairList &getMulPairs() { return MulPairs; }

    /// To finalise, replace the uses of the root with the intrinsic call.
    void UpdateRoot(Instruction *SMLAD) {
      Root->replaceAllUsesWith(SMLAD);
    }

    void dump() {
      LLVM_DEBUG(dbgs() << "Reduction:\n";
        for (auto *Add : Adds)
          LLVM_DEBUG(dbgs() << *Add << "\n");
        for (auto &Mul : Muls)
          LLVM_DEBUG(dbgs() << *Mul->Root << "\n"
                     << "  " << *Mul->LHS << "\n"
                     << "  " << *Mul->RHS << "\n");
        LLVM_DEBUG(if (Acc) dbgs() << "Acc in: " << *Acc << "\n")
      );
    }
  };

  class WidenedLoad {
    LoadInst *NewLd = nullptr;
    SmallVector<LoadInst*, 4> Loads;

  public:
    WidenedLoad(SmallVectorImpl<LoadInst*> &Lds, LoadInst *Wide)
      : NewLd(Wide) {
      for (auto *I : Lds)
        Loads.push_back(I);
    }
    LoadInst *getLoad() {
      return NewLd;
    }
  };

  class ARMParallelDSP : public FunctionPass {
    ScalarEvolution   *SE;
    AliasAnalysis     *AA;
    TargetLibraryInfo *TLI;
    DominatorTree     *DT;
    const DataLayout  *DL;
    Module            *M;
    std::map<LoadInst*, LoadInst*> LoadPairs;
    SmallPtrSet<LoadInst*, 4> OffsetLoads;
    std::map<LoadInst*, std::unique_ptr<WidenedLoad>> WideLoads;

    template<unsigned>
    bool IsNarrowSequence(Value *V);
    bool Search(Value *V, BasicBlock *BB, Reduction &R);
    bool RecordMemoryOps(BasicBlock *BB);
    void InsertParallelMACs(Reduction &Reduction);
    bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
    LoadInst* CreateWideLoad(MemInstList &Loads, IntegerType *LoadTy);
    bool CreateParallelPairs(Reduction &R);

    /// Try to match and generate: SMLAD, SMLADX - Signed Multiply Accumulate
    /// Dual performs two signed 16x16-bit multiplications. It adds the
    /// products to a 32-bit accumulate operand. Optionally, the instruction
    /// can exchange the halfwords of the second operand before performing the
    /// arithmetic.
    bool MatchSMLAD(Function &F);

  public:
    static char ID;

    ARMParallelDSP() : FunctionPass(ID) { }

    void getAnalysisUsage(AnalysisUsage &AU) const override {
      FunctionPass::getAnalysisUsage(AU);
      AU.addRequired<AssumptionCacheTracker>();
      AU.addRequired<ScalarEvolutionWrapperPass>();
      AU.addRequired<AAResultsWrapperPass>();
      AU.addRequired<TargetLibraryInfoWrapperPass>();
      AU.addRequired<DominatorTreeWrapperPass>();
      AU.addRequired<TargetPassConfig>();
      AU.addPreserved<ScalarEvolutionWrapperPass>();
      AU.addPreserved<GlobalsAAWrapperPass>();
      AU.setPreservesCFG();
    }

    bool runOnFunction(Function &F) override {
      if (DisableParallelDSP)
        return false;
      if (skipFunction(F))
        return false;

      SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
      AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
      TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
      DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
      auto &TPC = getAnalysis<TargetPassConfig>();

      M = F.getParent();
      DL = &M->getDataLayout();

      auto &TM = TPC.getTM<TargetMachine>();
      auto *ST = &TM.getSubtarget<ARMSubtarget>(F);

      if (!ST->allowsUnalignedMem()) {
        LLVM_DEBUG(dbgs() << "Unaligned memory access not supported: not "
                             "running pass ARMParallelDSP\n");
        return false;
      }

      if (!ST->hasDSP()) {
        LLVM_DEBUG(dbgs() << "DSP extension not enabled: not running pass "
                             "ARMParallelDSP\n");
        return false;
      }

      if (!ST->isLittle()) {
        LLVM_DEBUG(dbgs() << "Only supporting little endian: not running pass "
                          << "ARMParallelDSP\n");
        return false;
      }

      LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
      LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");

      bool Changes = MatchSMLAD(F);
      return Changes;
    }
  };
}

template<typename MemInst>
static bool AreSequentialAccesses(MemInst *MemOp0, MemInst *MemOp1,
                                  const DataLayout &DL, ScalarEvolution &SE) {
  if (isConsecutiveAccess(MemOp0, MemOp1, DL, SE))
    return true;
  return false;
}

bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1,
                                        MemInstList &VecMem) {
  if (!Ld0 || !Ld1)
    return false;

  if (!LoadPairs.count(Ld0) || LoadPairs[Ld0] != Ld1)
    return false;

  LLVM_DEBUG(dbgs() << "Loads are sequential and valid:\n";
             dbgs() << "Ld0:"; Ld0->dump();
             dbgs() << "Ld1:"; Ld1->dump();
            );

  VecMem.clear();
  VecMem.push_back(Ld0);
  VecMem.push_back(Ld1);
  return true;
}

// MaxBitWidth: the maximum supported bitwidth of the elements in the DSP
// instructions, which is set to 16. So here we should collect all i8 and i16
// narrow operations.
// TODO: we currently only collect i16, and will support i8 later, which is
// why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth.
template<unsigned MaxBitWidth>
bool ARMParallelDSP::IsNarrowSequence(Value *V) {
  if (auto *SExt = dyn_cast<SExtInst>(V)) {
    if (SExt->getSrcTy()->getIntegerBitWidth() != MaxBitWidth)
      return false;

    if (auto *Ld = dyn_cast<LoadInst>(SExt->getOperand(0))) {
      // Check that this load could be paired.
      return LoadPairs.count(Ld) || OffsetLoads.count(Ld);
    }
  }
  return false;
}
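
// For example, IsNarrowSequence<16> accepts a value of this shape (sketch),
// provided %ld has been recorded in LoadPairs or OffsetLoads:
//
//   %ld = load i16, i16* %addr
//   %sx = sext i16 %ld to i32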

/// Iterate through the block and record base, offset pairs of loads which can
/// be widened into a single load.
bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
  SmallVector<LoadInst*, 8> Loads;
  SmallVector<Instruction*, 8> Writes;
  LoadPairs.clear();
  WideLoads.clear();
  OrderedBasicBlock OrderedBB(BB);

  // Collect loads and instructions that may write to memory. For now we only
  // record loads which are simple, sign-extended and have a single user.
  // TODO: Allow zero-extended loads.
  for (auto &I : *BB) {
    if (I.mayWriteToMemory())
      Writes.push_back(&I);
    auto *Ld = dyn_cast<LoadInst>(&I);
    if (!Ld || !Ld->isSimple() ||
        !Ld->hasOneUse() || !isa<SExtInst>(Ld->user_back()))
      continue;
    Loads.push_back(Ld);
  }

  if (Loads.empty() || Loads.size() > NumLoadLimit)
    return false;

  using InstSet = std::set<Instruction*>;
  using DepMap = std::map<Instruction*, InstSet>;
  DepMap RAWDeps;

  // Record any writes that may alias a load.
  const auto Size = LocationSize::unknown();
  for (auto Write : Writes) {
    for (auto Read : Loads) {
      MemoryLocation ReadLoc =
        MemoryLocation(Read->getPointerOperand(), Size);

      if (!isModOrRefSet(intersectModRef(AA->getModRefInfo(Write, ReadLoc),
                                         ModRefInfo::ModRef)))
        continue;
      if (OrderedBB.dominates(Write, Read))
        RAWDeps[Read].insert(Write);
    }
  }

  // Check that there isn't a write between the two loads which would prevent
  // them from being safely merged.
  auto SafeToPair = [&](LoadInst *Base, LoadInst *Offset) {
    LoadInst *Dominator = OrderedBB.dominates(Base, Offset) ? Base : Offset;
    LoadInst *Dominated = OrderedBB.dominates(Base, Offset) ? Offset : Base;

    if (RAWDeps.count(Dominated)) {
      InstSet &WritesBefore = RAWDeps[Dominated];

      for (auto Before : WritesBefore) {
        // We can't move the second load backward, past a write, to merge
        // with the first load.
        if (OrderedBB.dominates(Dominator, Before))
          return false;
      }
    }
    return true;
  };

  // Record base, offset load pairs.
  for (auto *Base : Loads) {
    for (auto *Offset : Loads) {
      if (Base == Offset || OffsetLoads.count(Offset))
        continue;

      if (AreSequentialAccesses<LoadInst>(Base, Offset, *DL, *SE) &&
          SafeToPair(Base, Offset)) {
        LoadPairs[Base] = Offset;
        OffsetLoads.insert(Offset);
        break;
      }
    }
  }

  LLVM_DEBUG(if (!LoadPairs.empty()) {
               dbgs() << "Consecutive load pairs:\n";
               for (auto &MapIt : LoadPairs) {
                 LLVM_DEBUG(dbgs() << *MapIt.first << ", "
                            << *MapIt.second << "\n");
               }
             });
  return LoadPairs.size() > 1;
}
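
// Illustrative sketch: for IR such as
//
//   %a0  = load i16, i16* %pa              ; base
//   %pa1 = getelementptr i16, i16* %pa, i32 1
//   %a1  = load i16, i16* %pa1             ; offset
//
// RecordMemoryOps records LoadPairs[%a0] = %a1, provided both loads are
// simple, sign-extended, have a single user, and no aliasing write sits
// between them.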

// Search recursively back through the operands to find a tree of values that
// form a multiply-accumulate chain. The search records the Add and Mul
// instructions that form the reduction and allows us to find a single value
// to be used as the initial input to the accumulator.
bool ARMParallelDSP::Search(Value *V, BasicBlock *BB, Reduction &R) {
  // If we find a non-instruction, try to use it as the initial accumulator
  // value. This may have already been found during the search, in which case
  // this function will return false, signaling a search fail.
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return R.InsertAcc(V);

  if (I->getParent() != BB)
    return false;

  switch (I->getOpcode()) {
  default:
    break;
  case Instruction::PHI:
    // Could be the accumulator value.
    return R.InsertAcc(V);
  case Instruction::Add: {
    // Adds should be adding together two muls, or another add and a mul to
    // be within the mac chain. One of the operands may also be the
    // accumulator value, at which point we should stop searching.
    R.InsertAdd(I);
    Value *LHS = I->getOperand(0);
    Value *RHS = I->getOperand(1);
    bool ValidLHS = Search(LHS, BB, R);
    bool ValidRHS = Search(RHS, BB, R);

    if (ValidLHS && ValidRHS)
      return true;

    return R.InsertAcc(I);
  }
  case Instruction::Mul: {
    Value *MulOp0 = I->getOperand(0);
    Value *MulOp1 = I->getOperand(1);
    return IsNarrowSequence<16>(MulOp0) && IsNarrowSequence<16>(MulOp1);
  }
  case Instruction::SExt:
    return Search(I->getOperand(0), BB, R);
  }
  return false;
}

// The pass needs to identify integer add/sub reductions of 16-bit vector
// multiplications.
// To use SMLAD:
// 1) we first need to find an integer add, then look for this pattern:
//
// acc0 = ...
// ld0 = load i16
// sext0 = sext i16 %ld0 to i32
// ld1 = load i16
// sext1 = sext i16 %ld1 to i32
// mul0 = mul %sext0, %sext1
// ld2 = load i16
// sext2 = sext i16 %ld2 to i32
// ld3 = load i16
// sext3 = sext i16 %ld3 to i32
// mul1 = mul i32 %sext2, %sext3
// add0 = add i32 %mul0, %acc0
// acc1 = add i32 %add0, %mul1
//
// Which can be selected to:
//
// ldr r0
// ldr r1
// smlad r2, r0, r1, r2
//
// If constants are used instead of loads, these will need to be hoisted
// out and into a register.
//
// If loop invariants are used instead of loads, these need to be packed
// before the loop begins.
//
bool ARMParallelDSP::MatchSMLAD(Function &F) {
  bool Changed = false;

  for (auto &BB : F) {
    SmallPtrSet<Instruction*, 4> AllAdds;
    if (!RecordMemoryOps(&BB))
      continue;

    for (Instruction &I : reverse(BB)) {
      if (I.getOpcode() != Instruction::Add)
        continue;

      if (AllAdds.count(&I))
        continue;

      const auto *Ty = I.getType();
      if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
        continue;

      Reduction R(&I);
      if (!Search(&I, &BB, R))
        continue;

      R.InsertMuls();
      LLVM_DEBUG(dbgs() << "After search, Reduction:\n"; R.dump());

      if (!CreateParallelPairs(R))
        continue;

      InsertParallelMACs(R);
      Changed = true;
      AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
    }
  }

  return Changed;
}

bool ARMParallelDSP::CreateParallelPairs(Reduction &R) {

  // Not enough mul operations to make a pair.
  if (R.getMuls().size() < 2)
    return false;

  // Check that the muls operate directly upon sign extended loads.
  for (auto &MulCand : R.getMuls()) {
    if (!MulCand->HasTwoLoadInputs())
      return false;
  }

  auto CanPair = [&](Reduction &R, MulCandidate *PMul0, MulCandidate *PMul1) {
    // The first elements of each vector should be loads with sexts. If we
    // find that it's two pairs of consecutive loads, then these can be
    // transformed into two wider loads and the users can be replaced with
    // DSP intrinsics.
    auto Ld0 = static_cast<LoadInst*>(PMul0->LHS);
    auto Ld1 = static_cast<LoadInst*>(PMul1->LHS);
    auto Ld2 = static_cast<LoadInst*>(PMul0->RHS);
    auto Ld3 = static_cast<LoadInst*>(PMul1->RHS);

    if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
      if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
        LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
        R.AddMulPair(PMul0, PMul1);
        return true;
      } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) {
        LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
        LLVM_DEBUG(dbgs() << "    exchanging Ld2 and Ld3\n");
        R.AddMulPair(PMul0, PMul1, true);
        return true;
      }
    } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) &&
               AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
      LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
      LLVM_DEBUG(dbgs() << "    exchanging Ld0 and Ld1\n");
      LLVM_DEBUG(dbgs() << "    and swapping muls\n");
      // Only the second operand can be exchanged, so swap the muls.
      R.AddMulPair(PMul1, PMul0, true);
      return true;
    }
    return false;
  };

  MulCandList &Muls = R.getMuls();
  const unsigned Elems = Muls.size();
  for (unsigned i = 0; i < Elems; ++i) {
    MulCandidate *PMul0 = static_cast<MulCandidate*>(Muls[i].get());
    if (PMul0->Paired)
      continue;

    for (unsigned j = 0; j < Elems; ++j) {
      if (i == j)
        continue;

      MulCandidate *PMul1 = static_cast<MulCandidate*>(Muls[j].get());
      if (PMul1->Paired)
        continue;

      const Instruction *Mul0 = PMul0->Root;
      const Instruction *Mul1 = PMul1->Root;
      if (Mul0 == Mul1)
        continue;

      assert(PMul0 != PMul1 && "expected different chains");

      if (CanPair(R, PMul0, PMul1))
        break;
    }
  }
  return !R.getMulPairs().empty();
}
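
// Exchange sketch: for a reduction acc += a[0]*b[1] + a[1]*b[0], the LHS
// loads a[0]/a[1] are sequential, while the RHS loads are sequential only in
// reverse order (Ld3 before Ld2). CanPair therefore records the pair with
// Exchange set, and it is later lowered to smladx, which swaps the two
// halfwords of its second operand before multiplying.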

void ARMParallelDSP::InsertParallelMACs(Reduction &R) {

  auto CreateSMLAD = [&](LoadInst* WideLd0, LoadInst *WideLd1,
                         Value *Acc, bool Exchange,
                         Instruction *InsertAfter) {
    // Replace the reduction chain with an intrinsic call.
    Value* Args[] = { WideLd0, WideLd1, Acc };
    Function *SMLAD = nullptr;
    if (Exchange)
      SMLAD = Acc->getType()->isIntegerTy(32) ?
        Intrinsic::getDeclaration(M, Intrinsic::arm_smladx) :
        Intrinsic::getDeclaration(M, Intrinsic::arm_smlaldx);
    else
      SMLAD = Acc->getType()->isIntegerTy(32) ?
        Intrinsic::getDeclaration(M, Intrinsic::arm_smlad) :
        Intrinsic::getDeclaration(M, Intrinsic::arm_smlald);

    IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
                                BasicBlock::iterator(InsertAfter));
    Instruction *Call = Builder.CreateCall(SMLAD, Args);
    NumSMLAD++;
    return Call;
  };

  // Return the instruction after the dominated instruction.
  auto GetInsertPoint = [this](Value *A, Value *B) {
    assert((isa<Instruction>(A) || isa<Instruction>(B)) &&
           "expected at least one instruction");

    Value *V = nullptr;
    if (!isa<Instruction>(A))
      V = B;
    else if (!isa<Instruction>(B))
      V = A;
    else
      V = DT->dominates(cast<Instruction>(A), cast<Instruction>(B)) ? B : A;

    return &*++BasicBlock::iterator(cast<Instruction>(V));
  };

  Value *Acc = R.getAccumulator();

  // For any muls that were discovered but not paired, accumulate their values
  // as before.
  IRBuilder<NoFolder> Builder(R.getRoot()->getParent());
  MulCandList &MulCands = R.getMuls();
  for (auto &MulCand : MulCands) {
    if (MulCand->Paired)
      continue;

    Instruction *Mul = cast<Instruction>(MulCand->Root);
    LLVM_DEBUG(dbgs() << "Accumulating unpaired mul: " << *Mul << "\n");

    if (R.getType() != Mul->getType()) {
      assert(R.is64Bit() && "expected 64-bit result");
      Builder.SetInsertPoint(&*++BasicBlock::iterator(Mul));
      Mul = cast<Instruction>(Builder.CreateSExt(Mul, R.getRoot()->getType()));
    }

    if (!Acc) {
      Acc = Mul;
      continue;
    }

    // If Acc is the original incoming value to the reduction, it could be a
    // phi. But the phi will dominate Mul, meaning that Mul will be the
    // insertion point.
    Builder.SetInsertPoint(GetInsertPoint(Mul, Acc));
    Acc = Builder.CreateAdd(Mul, Acc);
  }

  if (!Acc) {
    Acc = R.is64Bit() ?
      ConstantInt::get(IntegerType::get(M->getContext(), 64), 0) :
      ConstantInt::get(IntegerType::get(M->getContext(), 32), 0);
  } else if (Acc->getType() != R.getType()) {
    Builder.SetInsertPoint(R.getRoot());
    Acc = Builder.CreateSExt(Acc, R.getType());
  }

  // Roughly sort the mul pairs in their program order.
  OrderedBasicBlock OrderedBB(R.getRoot()->getParent());
  llvm::sort(R.getMulPairs(), [&OrderedBB](auto &PairA, auto &PairB) {
    const Instruction *A = PairA.first->Root;
    const Instruction *B = PairB.first->Root;
    return OrderedBB.dominates(A, B);
  });

  IntegerType *Ty = IntegerType::get(M->getContext(), 32);
  for (auto &Pair : R.getMulPairs()) {
    MulCandidate *LHSMul = Pair.first;
    MulCandidate *RHSMul = Pair.second;
    LoadInst *BaseLHS = LHSMul->getBaseLoad();
    LoadInst *BaseRHS = RHSMul->getBaseLoad();
    LoadInst *WideLHS = WideLoads.count(BaseLHS) ?
      WideLoads[BaseLHS]->getLoad() : CreateWideLoad(LHSMul->VecLd, Ty);
    LoadInst *WideRHS = WideLoads.count(BaseRHS) ?
      WideLoads[BaseRHS]->getLoad() : CreateWideLoad(RHSMul->VecLd, Ty);

    Instruction *InsertAfter = GetInsertPoint(WideLHS, WideRHS);
    InsertAfter = GetInsertPoint(InsertAfter, Acc);
    Acc = CreateSMLAD(WideLHS, WideRHS, Acc, RHSMul->Exchange, InsertAfter);
  }
  R.UpdateRoot(cast<Instruction>(Acc));
}
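
// The net effect is a call such as (sketch):
//
//   %acc1 = call i32 @llvm.arm.smlad(i32 %wide0, i32 %wide1, i32 %acc0)
//
// For 64-bit reductions the smlald/smlaldx intrinsics are used instead, with
// an i64 accumulator and result.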

LoadInst* ARMParallelDSP::CreateWideLoad(MemInstList &Loads,
                                         IntegerType *LoadTy) {
  assert(Loads.size() == 2 && "currently only support widening two loads");

  LoadInst *Base = Loads[0];
  LoadInst *Offset = Loads[1];

  Instruction *BaseSExt = dyn_cast<SExtInst>(Base->user_back());
  Instruction *OffsetSExt = dyn_cast<SExtInst>(Offset->user_back());

  assert((BaseSExt && OffsetSExt)
         && "Loads should have a single, extending, user");

  std::function<void(Value*, Value*)> MoveBefore =
    [&](Value *A, Value *B) -> void {
      if (!isa<Instruction>(A) || !isa<Instruction>(B))
        return;

      auto *Source = cast<Instruction>(A);
      auto *Sink = cast<Instruction>(B);

      if (DT->dominates(Source, Sink) ||
          Source->getParent() != Sink->getParent() ||
          isa<PHINode>(Source) || isa<PHINode>(Sink))
        return;

      Source->moveBefore(Sink);
      for (auto &Op : Source->operands())
        MoveBefore(Op, Source);
    };

  // Insert the load at the point of the original dominating load.
  LoadInst *DomLoad = DT->dominates(Base, Offset) ? Base : Offset;
  IRBuilder<NoFolder> IRB(DomLoad->getParent(),
                          ++BasicBlock::iterator(DomLoad));

  // Bitcast the pointer to a wider type and create the wide load, while making
  // sure to maintain the original alignment as this prevents ldrd from being
  // generated when it could be illegal due to memory alignment.
  const unsigned AddrSpace = DomLoad->getPointerAddressSpace();
  Value *VecPtr = IRB.CreateBitCast(Base->getPointerOperand(),
                                    LoadTy->getPointerTo(AddrSpace));
  LoadInst *WideLoad = IRB.CreateAlignedLoad(LoadTy, VecPtr,
                                             Base->getAlignment());

  // Make sure everything is in the correct order in the basic block.
  MoveBefore(Base->getPointerOperand(), VecPtr);
  MoveBefore(VecPtr, WideLoad);

  // From the wide load, create two values that equal the original two loads.
  // Loads[0] needs trunc while Loads[1] needs a lshr and trunc.
  // TODO: Support big-endian as well.
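  // On little-endian, the first (base) load ends up in the low halfword of
  // the wide value and the second (offset) load in the high halfword, so the
  // replacement IR looks roughly like (sketch):
  //   %wide = load i32, i32* %vecptr, align 2
  //   %bot  = trunc i32 %wide to i16          ; value of Loads[0]
  //   %top  = lshr i32 %wide, 16
  //   %hi   = trunc i32 %top to i16           ; value of Loads[1]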
  Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType());
  Value *NewBaseSExt = IRB.CreateSExt(Bottom, BaseSExt->getType());
  BaseSExt->replaceAllUsesWith(NewBaseSExt);

  IntegerType *OffsetTy = cast<IntegerType>(Offset->getType());
  Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth());
  Value *Top = IRB.CreateLShr(WideLoad, ShiftVal);
  Value *Trunc = IRB.CreateTrunc(Top, OffsetTy);
  Value *NewOffsetSExt = IRB.CreateSExt(Trunc, OffsetSExt->getType());
  OffsetSExt->replaceAllUsesWith(NewOffsetSExt);

  LLVM_DEBUG(dbgs() << "From Base and Offset:\n"
             << *Base << "\n" << *Offset << "\n"
             << "Created Wide Load:\n"
             << *WideLoad << "\n"
             << *Bottom << "\n"
             << *NewBaseSExt << "\n"
             << *Top << "\n"
             << *Trunc << "\n"
             << *NewOffsetSExt << "\n");
  WideLoads.emplace(std::make_pair(Base,
      std::make_unique<WidenedLoad>(Loads, WideLoad)));
  return WideLoad;
}

Pass *llvm::createARMParallelDSPPass() {
  return new ARMParallelDSP();
}

char ARMParallelDSP::ID = 0;

INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp",
                      "Transform functions to use DSP intrinsics", false, false)
INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp",
                    "Transform functions to use DSP intrinsics", false, false)