LLVM  14.0.0git
ARMParallelDSP.cpp
Go to the documentation of this file.
1 //===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
/// \file
/// Armv6 introduced instructions to perform 32-bit SIMD operations. The
/// purpose of this pass is to do some IR pattern matching to create ACLE
/// DSP intrinsics, which map onto these 32-bit SIMD operations.
/// This pass runs only when unaligned accesses are supported/enabled.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "ARM.h"
18 #include "ARMSubtarget.h"
19 #include "llvm/ADT/SmallPtrSet.h"
20 #include "llvm/ADT/Statistic.h"
27 #include "llvm/IR/Instructions.h"
28 #include "llvm/IR/IntrinsicsARM.h"
29 #include "llvm/IR/NoFolder.h"
30 #include "llvm/IR/PatternMatch.h"
31 #include "llvm/Pass.h"
32 #include "llvm/PassRegistry.h"
33 #include "llvm/Support/Debug.h"
34 #include "llvm/Transforms/Scalar.h"
36 
37 using namespace llvm;
38 using namespace PatternMatch;
39 
#define DEBUG_TYPE "arm-parallel-dsp"

// Counts the intrinsic calls created by InsertParallelMACs.
STATISTIC(NumSMLAD , "Number of smlad instructions generated");

// Escape hatch: allows the whole transform to be switched off from the
// command line without rebuilding.
static cl::opt<bool>
DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false),
                   cl::desc("Disable the ARM Parallel DSP pass"));

// Upper bound on the loads considered per block (see RecordMemoryOps).
static cl::opt<unsigned>
NumLoadLimit("arm-parallel-dsp-load-limit", cl::Hidden, cl::init(16),
             cl::desc("Limit the number of loads analysed"));
51 
52 namespace {
53  struct MulCandidate;
54  class Reduction;
55 
56  using MulCandList = SmallVector<std::unique_ptr<MulCandidate>, 8>;
57  using MemInstList = SmallVectorImpl<LoadInst*>;
59 
  // 'MulCandidate' holds the multiplication instructions that are candidates
  // for parallel execution.
  struct MulCandidate {
    Instruction *Root;      // The mul instruction rooting this candidate.
    Value* LHS;             // First multiplicand, stripped of its sext
                            // (see Reduction::InsertMuls).
    Value* RHS;             // Second multiplicand, likewise stripped.
    bool Exchange = false;  // Set when the halfwords of the second operand
                            // should be exchanged (SMLADX form).
    bool ReadOnly = true;   // NOTE(review): never written in this excerpt —
                            // confirm semantics against the full file.
    bool Paired = false;    // Set once this candidate has been paired.
    SmallVector<LoadInst*, 2> VecLd;    // Container for loads to widen.

    MulCandidate(Instruction *I, Value *lhs, Value *rhs) :
      Root(I), LHS(lhs), RHS(rhs) { }

    /// Return true if both multiplicands are loads.
    bool HasTwoLoadInputs() const {
      return isa<LoadInst>(LHS) && isa<LoadInst>(RHS);
    }

    /// Return the base (first) load recorded for widening.
    LoadInst *getBaseLoad() const {
      return VecLd.front();
    }
  };
82 
  /// Represent a sequence of multiply-accumulate operations with the aim to
  /// perform the multiplications in parallel.
  class Reduction {
    Instruction *Root = nullptr;  // The add rooting the reduction chain.
    Value *Acc = nullptr;         // Incoming accumulator value; may be null.
    MulCandList Muls;             // Candidate muls feeding the reduction.
    MulPairList MulPairs;         // Muls paired for parallel execution.

  public:
    Reduction() = delete;

    Reduction (Instruction *Add) : Root(Add) { }

    /// Record an Add instruction that is a part of this reduction.
    void InsertAdd(Instruction *I) { Adds.insert(I); }

    /// Create MulCandidates, each rooted at a Mul instruction, that is a part
    /// of this reduction.
    void InsertMuls() {
      // Look through an optional sext to find a mul feeding an add operand.
      auto GetMulOperand = [](Value *V) -> Instruction* {
        if (auto *SExt = dyn_cast<SExtInst>(V)) {
          if (auto *I = dyn_cast<Instruction>(SExt->getOperand(0)))
            if (I->getOpcode() == Instruction::Mul)
              return I;
        } else if (auto *I = dyn_cast<Instruction>(V)) {
          if (I->getOpcode() == Instruction::Mul)
            return I;
        }
        return nullptr;
      };

      // Record a candidate using the values feeding the mul's operands,
      // i.e. looking through the sexts that feed the multiply.
      auto InsertMul = [this](Instruction *I) {
        Value *LHS = cast<Instruction>(I->getOperand(0))->getOperand(0);
        Value *RHS = cast<Instruction>(I->getOperand(1))->getOperand(0);
        Muls.push_back(std::make_unique<MulCandidate>(I, LHS, RHS));
      };

      for (auto *Add : Adds) {
        // The accumulator itself is not part of the mul chain.
        if (Add == Acc)
          continue;
        if (auto *Mul = GetMulOperand(Add->getOperand(0)))
          InsertMul(Mul);
        if (auto *Mul = GetMulOperand(Add->getOperand(1)))
          InsertMul(Mul);
      }
    }

    /// Add the incoming accumulator value, returns true if a value had not
    /// already been added. Returning false signals to the user that this
    /// reduction already has a value to initialise the accumulator.
    bool InsertAcc(Value *V) {
      if (Acc)
        return false;
      Acc = V;
      return true;
    }

    /// Set two MulCandidates, rooted at muls, that can be executed as a single
    /// parallel operation.
    void AddMulPair(MulCandidate *Mul0, MulCandidate *Mul1,
                    bool Exchange = false) {
      LLVM_DEBUG(dbgs() << "Pairing:\n"
                 << *Mul0->Root << "\n"
                 << *Mul1->Root << "\n");
      Mul0->Paired = true;
      Mul1->Paired = true;
      // Only the second operand of the dual-multiply can be exchanged.
      if (Exchange)
        Mul1->Exchange = true;
      MulPairs.push_back(std::make_pair(Mul0, Mul1));
    }

    /// Return true if enough mul operations are found that can be executed in
    /// parallel.
    bool CreateParallelPairs();

    /// Return the add instruction which is the root of the reduction.
    Instruction *getRoot() { return Root; }

    /// Return true when the reduction accumulates into a 64-bit value.
    bool is64Bit() const { return Root->getType()->isIntegerTy(64); }

    /// Return the type of the reduction root (i32 or i64).
    Type *getType() const { return Root->getType(); }

    /// Return the incoming value to be accumulated. This maybe null.
    Value *getAccumulator() { return Acc; }

    /// Return the set of adds that comprise the reduction.
    SetVector<Instruction*> &getAdds() { return Adds; }

    /// Return the MulCandidate, rooted at mul instruction, that comprise
    /// the reduction.
    MulCandList &getMuls() { return Muls; }

    /// Return the MulCandidate, rooted at mul instructions, that have been
    /// paired for parallel execution.
    MulPairList &getMulPairs() { return MulPairs; }

    /// To finalise, replace the uses of the root with the intrinsic call.
    void UpdateRoot(Instruction *SMLAD) {
      Root->replaceAllUsesWith(SMLAD);
    }

    /// Debug-only pretty printer for the recorded adds, muls and accumulator.
    void dump() {
      LLVM_DEBUG(dbgs() << "Reduction:\n";
                 for (auto *Add : Adds)
                   LLVM_DEBUG(dbgs() << *Add << "\n");
                 for (auto &Mul : Muls)
                   LLVM_DEBUG(dbgs() << *Mul->Root << "\n"
                              << "  " << *Mul->LHS << "\n"
                              << "  " << *Mul->RHS << "\n");
                 LLVM_DEBUG(if (Acc) dbgs() << "Acc in: " << *Acc << "\n")
                 );
    }
  };
197 
  /// Records the single wide load created to replace a pair of narrow,
  /// consecutive loads.
  class WidenedLoad {
    LoadInst *NewLd = nullptr;  // The widened load.

  public:
    WidenedLoad(SmallVectorImpl<LoadInst*> &Lds, LoadInst *Wide)
      : NewLd(Wide) {
      // Keep hold of the narrow loads that this wide load replaced.
      append_range(Loads, Lds);
    }
    /// Return the wide load that replaced the original pair.
    LoadInst *getLoad() {
      return NewLd;
    }
  };
211 
  class ARMParallelDSP : public FunctionPass {
    ScalarEvolution *SE;
    AliasAnalysis *AA;
    TargetLibraryInfo *TLI;
    DominatorTree *DT;
    const DataLayout *DL;
    Module *M;
    // Maps a base load to the load accessing the consecutive element.
    std::map<LoadInst*, LoadInst*> LoadPairs;
    // Loads already claimed as the offset (second) half of a pair.
    SmallPtrSet<LoadInst*, 4> OffsetLoads;
    // Wide loads already created, keyed by their base load, for reuse.
    std::map<LoadInst*, std::unique_ptr<WidenedLoad>> WideLoads;

    /// Return true if V is a sext of a pairable load of the given width.
    template<unsigned>
    bool IsNarrowSequence(Value *V);
    /// Recursively walk back through V's operands, recording the reduction
    /// chain into R.
    bool Search(Value *V, BasicBlock *BB, Reduction &R);
    /// Populate LoadPairs/OffsetLoads for BB; returns true when there is
    /// more than one pair, i.e. the block is worth transforming.
    bool RecordMemoryOps(BasicBlock *BB);
    /// Emit the SMLAD/SMLALD intrinsic chain for a matched reduction.
    void InsertParallelMACs(Reduction &Reduction);
    bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
    LoadInst* CreateWideLoad(MemInstList &Loads, IntegerType *LoadTy);
    bool CreateParallelPairs(Reduction &R);

    /// Try to match and generate: SMLAD, SMLADX - Signed Multiply Accumulate
    /// Dual performs two signed 16x16-bit multiplications. It adds the
    /// products to a 32-bit accumulate operand. Optionally, the instruction can
    /// exchange the halfwords of the second operand before performing the
    /// arithmetic.
    bool MatchSMLAD(Function &F);

  public:
    static char ID;

    ARMParallelDSP() : FunctionPass(ID) { }

    void getAnalysisUsage(AnalysisUsage &AU) const override {
      AU.setPreservesCFG();
    }

    bool runOnFunction(Function &F) override {
      if (DisableParallelDSP)
        return false;
      if (skipFunction(F))
        return false;

      SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
      AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
      TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
      DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
      auto &TPC = getAnalysis<TargetPassConfig>();

      M = F.getParent();
      DL = &M->getDataLayout();

      auto &TM = TPC.getTM<TargetMachine>();
      auto *ST = &TM.getSubtarget<ARMSubtarget>(F);

      // Widening loads requires unaligned access support (see the file
      // header comment).
      if (!ST->allowsUnalignedMem()) {
        LLVM_DEBUG(dbgs() << "Unaligned memory access not supported: not "
                             "running pass ARMParallelDSP\n");
        return false;
      }

      // The intrinsics emitted map to the DSP extension instructions.
      if (!ST->hasDSP()) {
        LLVM_DEBUG(dbgs() << "DSP extension not enabled: not running pass "
                             "ARMParallelDSP\n");
        return false;
      }

      // The lane-splitting in CreateWideLoad assumes little-endian layout.
      if (!ST->isLittle()) {
        LLVM_DEBUG(dbgs() << "Only supporting little endian: not running pass "
                          << "ARMParallelDSP\n");
        return false;
      }

      LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
      LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");

      bool Changes = MatchSMLAD(F);
      return Changes;
    }
  };
300 }
301 
302 bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1,
303  MemInstList &VecMem) {
304  if (!Ld0 || !Ld1)
305  return false;
306 
307  if (!LoadPairs.count(Ld0) || LoadPairs[Ld0] != Ld1)
308  return false;
309 
310  LLVM_DEBUG(dbgs() << "Loads are sequential and valid:\n";
311  dbgs() << "Ld0:"; Ld0->dump();
312  dbgs() << "Ld1:"; Ld1->dump();
313  );
314 
315  VecMem.clear();
316  VecMem.push_back(Ld0);
317  VecMem.push_back(Ld1);
318  return true;
319 }
320 
321 // MaxBitwidth: the maximum supported bitwidth of the elements in the DSP
322 // instructions, which is set to 16. So here we should collect all i8 and i16
323 // narrow operations.
324 // TODO: we currently only collect i16, and will support i8 later, so that's
325 // why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth.
326 template<unsigned MaxBitWidth>
327 bool ARMParallelDSP::IsNarrowSequence(Value *V) {
328  if (auto *SExt = dyn_cast<SExtInst>(V)) {
329  if (SExt->getSrcTy()->getIntegerBitWidth() != MaxBitWidth)
330  return false;
331 
332  if (auto *Ld = dyn_cast<LoadInst>(SExt->getOperand(0))) {
333  // Check that this load could be paired.
334  return LoadPairs.count(Ld) || OffsetLoads.count(Ld);
335  }
336  }
337  return false;
338 }
339 
/// Iterate through the block and record base, offset pairs of loads which can
/// be widened into a single load.
bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
  LoadPairs.clear();
  WideLoads.clear();

  // Collect loads and instruction that may write to memory. For now we only
  // record loads which are simple, sign-extended and have a single user.
  // TODO: Allow zero-extended loads.
  for (auto &I : *BB) {
    if (I.mayWriteToMemory())
      Writes.push_back(&I);
    auto *Ld = dyn_cast<LoadInst>(&I);
    if (!Ld || !Ld->isSimple() ||
        !Ld->hasOneUse() || !isa<SExtInst>(Ld->user_back()))
      continue;
    Loads.push_back(Ld);
  }

  // Bail out when there is nothing to analyse, or too much (the limit is
  // configurable via -arm-parallel-dsp-load-limit).
  if (Loads.empty() || Loads.size() > NumLoadLimit)
    return false;

  using InstSet = std::set<Instruction*>;
  using DepMap = std::map<Instruction*, InstSet>;
  DepMap RAWDeps;

  // Record any writes that may alias a load.
  for (auto Write : Writes) {
    for (auto Read : Loads) {
      MemoryLocation ReadLoc =
        MemoryLocation(Read->getPointerOperand(), Size);

      if (!isModOrRefSet(intersectModRef(AA->getModRefInfo(Write, ReadLoc),
        continue;
      // Only writes preceding the read constitute a RAW dependency.
      if (Write->comesBefore(Read))
        RAWDeps[Read].insert(Write);
    }
  }

  // Check whether there's not a write between the two loads which would
  // prevent them from being safely merged.
  auto SafeToPair = [&](LoadInst *Base, LoadInst *Offset) {
    bool BaseFirst = Base->comesBefore(Offset);
    LoadInst *Dominator = BaseFirst ? Base : Offset;
    LoadInst *Dominated = BaseFirst ? Offset : Base;

    if (RAWDeps.count(Dominated)) {
      InstSet &WritesBefore = RAWDeps[Dominated];

      for (auto Before : WritesBefore) {
        // We can't move the second load backward, past a write, to merge
        // with the first load.
        if (Dominator->comesBefore(Before))
          return false;
      }
    }
    return true;
  };

  // Record base, offset load pairs. Each load joins at most one pair: once
  // claimed as an offset it is skipped, and a base stops at its first match.
  for (auto *Base : Loads) {
    for (auto *Offset : Loads) {
      if (Base == Offset || OffsetLoads.count(Offset))
        continue;

      if (isConsecutiveAccess(Base, Offset, *DL, *SE) &&
          SafeToPair(Base, Offset)) {
        LoadPairs[Base] = Offset;
        OffsetLoads.insert(Offset);
        break;
      }
    }
  }

  LLVM_DEBUG(if (!LoadPairs.empty()) {
               dbgs() << "Consecutive load pairs:\n";
               for (auto &MapIt : LoadPairs) {
                 LLVM_DEBUG(dbgs() << *MapIt.first << ", "
                            << *MapIt.second << "\n");
               }
             });
  // Only worth running when more than one pair can be combined.
  return LoadPairs.size() > 1;
}
427 
428 // Search recursively back through the operands to find a tree of values that
429 // form a multiply-accumulate chain. The search records the Add and Mul
430 // instructions that form the reduction and allows us to find a single value
431 // to be used as the initial input to the accumlator.
432 bool ARMParallelDSP::Search(Value *V, BasicBlock *BB, Reduction &R) {
433  // If we find a non-instruction, try to use it as the initial accumulator
434  // value. This may have already been found during the search in which case
435  // this function will return false, signaling a search fail.
436  auto *I = dyn_cast<Instruction>(V);
437  if (!I)
438  return R.InsertAcc(V);
439 
440  if (I->getParent() != BB)
441  return false;
442 
443  switch (I->getOpcode()) {
444  default:
445  break;
446  case Instruction::PHI:
447  // Could be the accumulator value.
448  return R.InsertAcc(V);
449  case Instruction::Add: {
450  // Adds should be adding together two muls, or another add and a mul to
451  // be within the mac chain. One of the operands may also be the
452  // accumulator value at which point we should stop searching.
453  R.InsertAdd(I);
454  Value *LHS = I->getOperand(0);
455  Value *RHS = I->getOperand(1);
456  bool ValidLHS = Search(LHS, BB, R);
457  bool ValidRHS = Search(RHS, BB, R);
458 
459  if (ValidLHS && ValidRHS)
460  return true;
461 
462  return R.InsertAcc(I);
463  }
464  case Instruction::Mul: {
465  Value *MulOp0 = I->getOperand(0);
466  Value *MulOp1 = I->getOperand(1);
467  return IsNarrowSequence<16>(MulOp0) && IsNarrowSequence<16>(MulOp1);
468  }
469  case Instruction::SExt:
470  return Search(I->getOperand(0), BB, R);
471  }
472  return false;
473 }
474 
// The pass needs to identify integer add/sub reductions of 16-bit vector
// multiplications.
// To use SMLAD:
// 1) we first need to find integer add then look for this pattern:
//
// acc0 = ...
// ld0 = load i16
// sext0 = sext i16 %ld0 to i32
// ld1 = load i16
// sext1 = sext i16 %ld1 to i32
// mul0 = mul %sext0, %sext1
// ld2 = load i16
// sext2 = sext i16 %ld2 to i32
// ld3 = load i16
// sext3 = sext i16 %ld3 to i32
// mul1 = mul i32 %sext2, %sext3
// add0 = add i32 %mul0, %acc0
// acc1 = add i32 %add0, %mul1
//
// Which can be selected to:
//
// ldr r0
// ldr r1
// smlad r2, r0, r1, r2
//
// If constants are used instead of loads, these will need to be hoisted
// out and into a register.
//
// If loop invariants are used instead of loads, these need to be packed
// before the loop begins.
//
bool ARMParallelDSP::MatchSMLAD(Function &F) {
  bool Changed = false;

  for (auto &BB : F) {
    // Skip blocks without at least two widenable load pairs.
    if (!RecordMemoryOps(&BB))
      continue;

    // Walk bottom-up so the root of a reduction chain is seen first.
    for (Instruction &I : reverse(BB)) {
      if (I.getOpcode() != Instruction::Add)
        continue;

      // Skip adds already consumed by a previously matched reduction.
      if (AllAdds.count(&I))
        continue;

      // Only i32 and i64 accumulations map onto smlad/smlald.
      const auto *Ty = I.getType();
      if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
        continue;

      Reduction R(&I);
      if (!Search(&I, &BB, R))
        continue;

      R.InsertMuls();
      LLVM_DEBUG(dbgs() << "After search, Reduction:\n"; R.dump());

      if (!CreateParallelPairs(R))
        continue;

      InsertParallelMACs(R);
      Changed = true;
      // Mark every add of this reduction so it isn't matched again.
      AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
    }
  }

  return Changed;
}
543 
544 bool ARMParallelDSP::CreateParallelPairs(Reduction &R) {
545 
546  // Not enough mul operations to make a pair.
547  if (R.getMuls().size() < 2)
548  return false;
549 
550  // Check that the muls operate directly upon sign extended loads.
551  for (auto &MulCand : R.getMuls()) {
552  if (!MulCand->HasTwoLoadInputs())
553  return false;
554  }
555 
556  auto CanPair = [&](Reduction &R, MulCandidate *PMul0, MulCandidate *PMul1) {
557  // The first elements of each vector should be loads with sexts. If we
558  // find that its two pairs of consecutive loads, then these can be
559  // transformed into two wider loads and the users can be replaced with
560  // DSP intrinsics.
561  auto Ld0 = static_cast<LoadInst*>(PMul0->LHS);
562  auto Ld1 = static_cast<LoadInst*>(PMul1->LHS);
563  auto Ld2 = static_cast<LoadInst*>(PMul0->RHS);
564  auto Ld3 = static_cast<LoadInst*>(PMul1->RHS);
565 
566  // Check that each mul is operating on two different loads.
567  if (Ld0 == Ld2 || Ld1 == Ld3)
568  return false;
569 
570  if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
571  if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
572  LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
573  R.AddMulPair(PMul0, PMul1);
574  return true;
575  } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) {
576  LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
577  LLVM_DEBUG(dbgs() << " exchanging Ld2 and Ld3\n");
578  R.AddMulPair(PMul0, PMul1, true);
579  return true;
580  }
581  } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) &&
582  AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
583  LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
584  LLVM_DEBUG(dbgs() << " exchanging Ld0 and Ld1\n");
585  LLVM_DEBUG(dbgs() << " and swapping muls\n");
586  // Only the second operand can be exchanged, so swap the muls.
587  R.AddMulPair(PMul1, PMul0, true);
588  return true;
589  }
590  return false;
591  };
592 
593  MulCandList &Muls = R.getMuls();
594  const unsigned Elems = Muls.size();
595  for (unsigned i = 0; i < Elems; ++i) {
596  MulCandidate *PMul0 = static_cast<MulCandidate*>(Muls[i].get());
597  if (PMul0->Paired)
598  continue;
599 
600  for (unsigned j = 0; j < Elems; ++j) {
601  if (i == j)
602  continue;
603 
604  MulCandidate *PMul1 = static_cast<MulCandidate*>(Muls[j].get());
605  if (PMul1->Paired)
606  continue;
607 
608  const Instruction *Mul0 = PMul0->Root;
609  const Instruction *Mul1 = PMul1->Root;
610  if (Mul0 == Mul1)
611  continue;
612 
613  assert(PMul0 != PMul1 && "expected different chains");
614 
615  if (CanPair(R, PMul0, PMul1))
616  break;
617  }
618  }
619  return !R.getMulPairs().empty();
620 }
621 
/// Replace the matched reduction with a chain of SMLAD/SMLALD intrinsic
/// calls, accumulating any unpaired muls with plain adds first.
void ARMParallelDSP::InsertParallelMACs(Reduction &R) {

  // Build one dual-multiply-accumulate call: the 32-bit variants (smlad/
  // smladx) are used for an i32 accumulator, the 64-bit ones (smlald/
  // smlaldx) otherwise. 'Exchange' selects the halfword-exchanging form.
  auto CreateSMLAD = [&](LoadInst* WideLd0, LoadInst *WideLd1,
                         Value *Acc, bool Exchange,
                         Instruction *InsertAfter) {
    // Replace the reduction chain with an intrinsic call
    Value* Args[] = { WideLd0, WideLd1, Acc };
    Function *SMLAD = nullptr;
    if (Exchange)
      SMLAD = Acc->getType()->isIntegerTy(32) ?
        Intrinsic::getDeclaration(M, Intrinsic::arm_smladx) :
        Intrinsic::getDeclaration(M, Intrinsic::arm_smlaldx);
    else
      SMLAD = Acc->getType()->isIntegerTy(32) ?
        Intrinsic::getDeclaration(M, Intrinsic::arm_smlad) :
        Intrinsic::getDeclaration(M, Intrinsic::arm_smlald);

    IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
                                BasicBlock::iterator(InsertAfter));
    Instruction *Call = Builder.CreateCall(SMLAD, Args);
    NumSMLAD++;
    return Call;
  };

  // Return the instruction after the dominated instruction.
  auto GetInsertPoint = [this](Value *A, Value *B) {
    assert((isa<Instruction>(A) || isa<Instruction>(B)) &&
           "expected at least one instruction");

    Value *V = nullptr;
    if (!isa<Instruction>(A))
      V = B;
    else if (!isa<Instruction>(B))
      V = A;
    else
      V = DT->dominates(cast<Instruction>(A), cast<Instruction>(B)) ? B : A;

    return &*++BasicBlock::iterator(cast<Instruction>(V));
  };

  Value *Acc = R.getAccumulator();

  // For any muls that were discovered but not paired, accumulate their values
  // as before.
  IRBuilder<NoFolder> Builder(R.getRoot()->getParent());
  MulCandList &MulCands = R.getMuls();
  for (auto &MulCand : MulCands) {
    if (MulCand->Paired)
      continue;

    Instruction *Mul = cast<Instruction>(MulCand->Root);
    LLVM_DEBUG(dbgs() << "Accumulating unpaired mul: " << *Mul << "\n");

    // A 64-bit reduction accumulates 32-bit muls, so the unpaired mul has
    // to be sign-extended to the reduction type first.
    if (R.getType() != Mul->getType()) {
      assert(R.is64Bit() && "expected 64-bit result");
      Builder.SetInsertPoint(&*++BasicBlock::iterator(Mul));
      Mul = cast<Instruction>(Builder.CreateSExt(Mul, R.getRoot()->getType()));
    }

    if (!Acc) {
      Acc = Mul;
      continue;
    }

    // If Acc is the original incoming value to the reduction, it could be a
    // phi. But the phi will dominate Mul, meaning that Mul will be the
    // insertion point.
    Builder.SetInsertPoint(GetInsertPoint(Mul, Acc));
    Acc = Builder.CreateAdd(Mul, Acc);
  }

  // Without an incoming accumulator, start from zero of the right width;
  // with a narrower one, widen it at the root.
  if (!Acc) {
    Acc = R.is64Bit() ?
      ConstantInt::get(IntegerType::get(M->getContext(), 64), 0) :
      ConstantInt::get(IntegerType::get(M->getContext(), 32), 0);
  } else if (Acc->getType() != R.getType()) {
    Builder.SetInsertPoint(R.getRoot());
    Acc = Builder.CreateSExt(Acc, R.getType());
  }

  // Roughly sort the mul pairs in their program order.
  llvm::sort(R.getMulPairs(), [](auto &PairA, auto &PairB) {
    const Instruction *A = PairA.first->Root;
    const Instruction *B = PairB.first->Root;
    return A->comesBefore(B);
  });

  // Chain the intrinsic calls: each consumes the previous accumulator and
  // produces the next, reusing already-widened loads where possible.
  IntegerType *Ty = IntegerType::get(M->getContext(), 32);
  for (auto &Pair : R.getMulPairs()) {
    MulCandidate *LHSMul = Pair.first;
    MulCandidate *RHSMul = Pair.second;
    LoadInst *BaseLHS = LHSMul->getBaseLoad();
    LoadInst *BaseRHS = RHSMul->getBaseLoad();
    LoadInst *WideLHS = WideLoads.count(BaseLHS) ?
      WideLoads[BaseLHS]->getLoad() : CreateWideLoad(LHSMul->VecLd, Ty);
    LoadInst *WideRHS = WideLoads.count(BaseRHS) ?
      WideLoads[BaseRHS]->getLoad() : CreateWideLoad(RHSMul->VecLd, Ty);

    // The call must come after both loads and the current accumulator.
    Instruction *InsertAfter = GetInsertPoint(WideLHS, WideRHS);
    InsertAfter = GetInsertPoint(InsertAfter, Acc);
    Acc = CreateSMLAD(WideLHS, WideRHS, Acc, RHSMul->Exchange, InsertAfter);
  }
  // Finally, redirect all users of the reduction root to the last call.
  R.UpdateRoot(cast<Instruction>(Acc));
}
727 
/// Merge two narrow, consecutive loads into a single load of LoadTy, then
/// recreate the two original (sign-extended) values from its bottom and top
/// halves. Returns the new wide load and caches it in WideLoads.
LoadInst* ARMParallelDSP::CreateWideLoad(MemInstList &Loads,
                                         IntegerType *LoadTy) {
  assert(Loads.size() == 2 && "currently only support widening two loads");

  LoadInst *Base = Loads[0];
  LoadInst *Offset = Loads[1];

  // Each narrow load has exactly one user, its sext (see RecordMemoryOps).
  Instruction *BaseSExt = dyn_cast<SExtInst>(Base->user_back());
  Instruction *OffsetSExt = dyn_cast<SExtInst>(Offset->user_back());

  assert((BaseSExt && OffsetSExt)
         && "Loads should have a single, extending, user");

  // Recursively move Source (and its in-block operands) before Sink so all
  // values still dominate their uses once the wide load is in place.
  std::function<void(Value*, Value*)> MoveBefore =
    [&](Value *A, Value *B) -> void {
      if (!isa<Instruction>(A) || !isa<Instruction>(B))
        return;

      auto *Source = cast<Instruction>(A);
      auto *Sink = cast<Instruction>(B);

      // Nothing to move if Source already dominates Sink; never move across
      // blocks or reorder phis.
      if (DT->dominates(Source, Sink) ||
          Source->getParent() != Sink->getParent() ||
          isa<PHINode>(Source) || isa<PHINode>(Sink))
        return;

      Source->moveBefore(Sink);
      for (auto &Op : Source->operands())
        MoveBefore(Op, Source);
    };

  // Insert the load at the point of the original dominating load.
  LoadInst *DomLoad = DT->dominates(Base, Offset) ? Base : Offset;
  IRBuilder<NoFolder> IRB(DomLoad->getParent(),
                          ++BasicBlock::iterator(DomLoad));

  // Bitcast the pointer to a wider type and create the wide load, while making
  // sure to maintain the original alignment as this prevents ldrd from being
  // generated when it could be illegal due to memory alignment.
  const unsigned AddrSpace = DomLoad->getPointerAddressSpace();
  Value *VecPtr = IRB.CreateBitCast(Base->getPointerOperand(),
                                    LoadTy->getPointerTo(AddrSpace));
  LoadInst *WideLoad = IRB.CreateAlignedLoad(LoadTy, VecPtr, Base->getAlign());

  // Make sure everything is in the correct order in the basic block.
  MoveBefore(Base->getPointerOperand(), VecPtr);
  MoveBefore(VecPtr, WideLoad);

  // From the wide load, create two values that equal the original two loads.
  // Loads[0] needs trunc while Loads[1] needs a lshr and trunc.
  // TODO: Support big-endian as well.
  Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType());
  Value *NewBaseSExt = IRB.CreateSExt(Bottom, BaseSExt->getType());
  BaseSExt->replaceAllUsesWith(NewBaseSExt);

  // The top half is recovered by shifting right by the narrow width.
  IntegerType *OffsetTy = cast<IntegerType>(Offset->getType());
  Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth());
  Value *Top = IRB.CreateLShr(WideLoad, ShiftVal);
  Value *Trunc = IRB.CreateTrunc(Top, OffsetTy);
  Value *NewOffsetSExt = IRB.CreateSExt(Trunc, OffsetSExt->getType());
  OffsetSExt->replaceAllUsesWith(NewOffsetSExt);

  LLVM_DEBUG(dbgs() << "From Base and Offset:\n"
             << *Base << "\n" << *Offset << "\n"
             << "Created Wide Load:\n"
             << *WideLoad << "\n"
             << *Bottom << "\n"
             << *NewBaseSExt << "\n"
             << *Top << "\n"
             << *Trunc << "\n"
             << *NewOffsetSExt << "\n");
  // Cache the wide load so later mul pairs sharing this base can reuse it.
  WideLoads.emplace(std::make_pair(Base,
                                   std::make_unique<WidenedLoad>(Loads, WideLoad)));
  return WideLoad;
}
803 
805  return new ARMParallelDSP();
806 }
807 
// Pass identification token and legacy pass-manager registration.
char ARMParallelDSP::ID = 0;

INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp",
                      "Transform functions to use DSP intrinsics", false, false)
INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp",
                    "Transform functions to use DSP intrinsics", false, false)
llvm::Check::Size
@ Size
Definition: FileCheck.h:73
i
i
Definition: README.txt:29
ARMSubtarget.h
use
Move duplicate certain instructions close to their use
Definition: Localizer.cpp:31
functions
amdgpu propagate attributes Late propagate attributes from kernels to functions
Definition: AMDGPUPropagateAttributes.cpp:196
AssumptionCache.h
llvm
This file implements support for optimizing divisions by a constant.
Definition: AllocatorList.h:23
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
NoFolder.h
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:113
llvm::Intrinsic::getDeclaration
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1379
llvm::BasicBlock::iterator
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:90
llvm::isModOrRefSet
LLVM_NODISCARD bool isModOrRefSet(const ModRefInfo MRI)
Definition: AliasAnalysis.h:190
llvm::ARMSubtarget
Definition: ARMSubtarget.h:46
Scalar.h
llvm::Function
Definition: Function.h:62
llvm::Value::dump
void dump() const
Support for debugging, callable in GDB: V->dump()
Definition: AsmWriter.cpp:4837
Pass.h
is64Bit
static bool is64Bit(const char *name)
Definition: X86Disassembler.cpp:1019
NumLoadLimit
static cl::opt< unsigned > NumLoadLimit("arm-parallel-dsp-load-limit", cl::Hidden, cl::init(16), cl::desc("Limit the number of loads analysed"))
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1168
Statistic.h
LoopAccessAnalysis.h
llvm::IRBuilder
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2657
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:460
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:151
GlobalsModRef.h
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:143
to
Should compile to
Definition: README.txt:449
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
llvm::reverse
auto reverse(ContainerTy &&C, std::enable_if_t< has_rbegin< ContainerTy >::value > *=nullptr)
Definition: STLExtras.h:333
llvm::Instruction::comesBefore
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
Definition: Instruction.cpp:111
dsp
arm parallel dsp
Definition: ARMParallelDSP.cpp:812
llvm::isConsecutiveAccess
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
Definition: LoopAccessAnalysis.cpp:1253
Offset
uint64_t Offset
Definition: ELFObjHandler.cpp:81
llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:449
llvm::dump
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Definition: SparseBitVector.h:876
PassRegistry.h
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
F
#define F(x, y, z)
Definition: MD5.cpp:56
llvm::RISCVFenceField::R
@ R
Definition: RISCVBaseInfo.h:198
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
AliasAnalysis.h
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
llvm::DominatorTree::dominates
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:115
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
llvm::Intrinsic::getType
FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys=None)
Return the function type for an intrinsic.
Definition: Function.cpp:1335
llvm::AAResults
Definition: AliasAnalysis.h:508
llvm::ARM_PROC::A
@ A
Definition: ARMBaseInfo.h:34
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
TargetLibraryInfo.h
false
Definition: StackSlotColoring.cpp:142
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
llvm::IntegerType
Class to represent integer types.
Definition: DerivedTypes.h:40
llvm::Instruction
Definition: Instruction.h:45
llvm::DominatorTreeWrapperPass
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:287
llvm::STATISTIC
STATISTIC(NumFunctions, "Total number of functions")
llvm::ConstantInt::get
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:925
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp", "Transform functions to use DSP intrinsics", false, false) INITIALIZE_PASS_END(ARMParallelDSP
llvm::ScalarEvolutionWrapperPass
Definition: ScalarEvolution.h:2082
SmallPtrSet.h
PatternMatch.h
llvm::MCID::Call
@ Call
Definition: MCInstrDesc.h:153
llvm::codeview::ClassOptions::Intrinsic
@ Intrinsic
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
llvm::Type::isIntegerTy
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:190
llvm::TargetPassConfig
Target-Independent Code Generator Pass Configuration Options.
Definition: TargetPassConfig.h:84
llvm::cl::opt< bool >
llvm::LoadInst::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:273
llvm::pdb::OMFSegDescFlags::Read
@ Read
llvm::TargetLibraryInfoWrapperPass
Definition: TargetLibraryInfo.h:465
llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73
I
#define I(x, y, z)
Definition: MD5.cpp:59
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:441
intrinsics
arm parallel Transform functions to use DSP intrinsics
Definition: ARMParallelDSP.cpp:813
TargetPassConfig.h
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:79
function
print Print MemDeps of function
Definition: MemDepPrinter.cpp:83
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
ARM.h
llvm::SmallPtrSetImpl::count
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:382
Builder
assume Assume Builder
Definition: AssumeBundleBuilder.cpp:650
llvm::SetVector::insert
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:141
llvm::Sched::Source
@ Source
Definition: TargetLowering.h:100
llvm::AssumptionCacheTracker
An immutable pass that tracks lazily created AssumptionCache objects.
Definition: AssumptionCache.h:200
llvm::AnalysisUsage::setPreservesCFG
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:253
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
llvm::cl::Sink
@ Sink
Definition: CommandLine.h:171
llvm::AnalysisUsage::addPreserved
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
Definition: PassAnalysisSupport.h:98
llvm::Value::replaceAllUsesWith
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:532
llvm::append_range
void append_range(Container &C, Range &&R)
Wrapper function to append a range to a container.
Definition: STLExtras.h:1748
DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: AArch64SLSHardening.cpp:76
llvm::RecurKind::Mul
@ Mul
Product of integers.
llvm::createARMParallelDSPPass
Pass * createARMParallelDSPPass()
Definition: ARMParallelDSP.cpp:804
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:175
runOnFunction
static bool runOnFunction(Function &F, bool PostInlining)
Definition: EntryExitInstrumenter.cpp:69
j
return j(j<< 16)
get
Should compile to something r4 addze r3 instead we get
Definition: README.txt:24
llvm::LocationSize::beforeOrAfterPointer
constexpr static LocationSize beforeOrAfterPointer()
Any location before or after the base pointer (but still within the underlying object).
Definition: MemoryLocation.h:129
llvm::AMDGPU::SendMsg::Op
Op
Definition: SIDefines.h:324
llvm::sort
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1492
llvm::Type::getPointerTo
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
Definition: Type.cpp:776
llvm::TargetLibraryInfo
Provides information about what library functions are available for the current target.
Definition: TargetLibraryInfo.h:221
llvm::SmallVectorImpl::clear
void clear()
Definition: SmallVector.h:585
llvm::MCID::Add
@ Add
Definition: MCInstrDesc.h:183
llvm::Pass
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:91
Instructions.h
llvm::AAResultsWrapperPass
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
Definition: AliasAnalysis.h:1336
DisableParallelDSP
static cl::opt< bool > DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false), cl::desc("Disable the ARM Parallel DSP pass"))
llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:94
llvm::GlobalsAAWrapperPass
Legacy wrapper pass to provide the GlobalsAAResult object.
Definition: GlobalsModRef.h:143
llvm::IntegerType::getBitWidth
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
Definition: DerivedTypes.h:72
llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:43
llvm::GlobalValue::getType
PointerType * getType() const
Global values are always pointers.
Definition: GlobalValue.h:271
llvm::Pass::getAnalysisUsage
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:93
TM
const char LLVMTargetMachineRef TM
Definition: PassBuilderBindings.cpp:47
llvm::IntegerType::get
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:313
llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:298
BB
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM BB
Definition: README.txt:39
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::pdb::OMFSegDescFlags::Write
@ Write
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition: AMDGPUMetadata.h:389
llvm::cl::desc
Definition: CommandLine.h:412
llvm::SetVector
A vector that has set insertion semantics.
Definition: SetVector.h:40
BasicBlockUtils.h
llvm::intersectModRef
LLVM_NODISCARD ModRefInfo intersectModRef(const ModRefInfo MRI1, const ModRefInfo MRI2)
Definition: AliasAnalysis.h:237
Reduction
loop Loop Strength Reduction
Definition: LoopStrengthReduce.cpp:6408
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
Debug.h
llvm::AAResults::getModRefInfo
ModRefInfo getModRefInfo(const CallBase *Call, const MemoryLocation &Loc)
getModRefInfo (for call sites) - Return information about whether a particular call site modifies or ...
Definition: AliasAnalysis.cpp:218
llvm::ModRefInfo::ModRef
@ ModRef
The access may reference and may modify the value stored in memory.
llvm::MemoryLocation
Representation for a specific memory location.
Definition: MemoryLocation.h:209
llvm::sampleprof::Base
@ Base
Definition: Discriminator.h:58
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:364
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:37