LLVM  13.0.0git
ARMParallelDSP.cpp
Go to the documentation of this file.
1 //===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
/// Armv6 introduced instructions to perform 32-bit SIMD operations. The
/// purpose of this pass is to do some IR pattern matching to create ACLE
/// DSP intrinsics, which map on these 32-bit SIMD operations.
/// This pass runs only when unaligned accesses are supported/enabled.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "ARM.h"
18 #include "ARMSubtarget.h"
19 #include "llvm/ADT/SmallPtrSet.h"
20 #include "llvm/ADT/Statistic.h"
27 #include "llvm/IR/Instructions.h"
28 #include "llvm/IR/IntrinsicsARM.h"
29 #include "llvm/IR/NoFolder.h"
30 #include "llvm/IR/PatternMatch.h"
31 #include "llvm/Pass.h"
32 #include "llvm/PassRegistry.h"
33 #include "llvm/Support/Debug.h"
34 #include "llvm/Transforms/Scalar.h"
36 
37 using namespace llvm;
38 using namespace PatternMatch;
39 
40 #define DEBUG_TYPE "arm-parallel-dsp"
41 
// Statistic reported under -stats; incremented once per smlad/smlald(x)
// intrinsic call created by InsertParallelMACs.
STATISTIC(NumSMLAD , "Number of smlad instructions generated");

// Escape hatch: disable the whole transform without rebuilding the compiler.
static cl::opt<bool>
DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false),
                   cl::desc("Disable the ARM Parallel DSP pass"));

// Upper bound on candidate loads per basic block. RecordMemoryOps performs
// pairwise alias and adjacency checks over the collected loads, so this
// limit bounds compile time.
static cl::opt<unsigned>
NumLoadLimit("arm-parallel-dsp-load-limit", cl::Hidden, cl::init(16),
             cl::desc("Limit the number of loads analysed"));
51 
52 namespace {
53  struct MulCandidate;
54  class Reduction;
55 
56  using MulCandList = SmallVector<std::unique_ptr<MulCandidate>, 8>;
57  using MemInstList = SmallVectorImpl<LoadInst*>;
59 
60  // 'MulCandidate' holds the multiplication instructions that are candidates
61  // for parallel execution.
62  struct MulCandidate {
63  Instruction *Root;
64  Value* LHS;
65  Value* RHS;
66  bool Exchange = false;
67  bool ReadOnly = true;
68  bool Paired = false;
69  SmallVector<LoadInst*, 2> VecLd; // Container for loads to widen.
70 
71  MulCandidate(Instruction *I, Value *lhs, Value *rhs) :
72  Root(I), LHS(lhs), RHS(rhs) { }
73 
74  bool HasTwoLoadInputs() const {
75  return isa<LoadInst>(LHS) && isa<LoadInst>(RHS);
76  }
77 
78  LoadInst *getBaseLoad() const {
79  return VecLd.front();
80  }
81  };
82 
83  /// Represent a sequence of multiply-accumulate operations with the aim to
84  /// perform the multiplications in parallel.
85  class Reduction {
86  Instruction *Root = nullptr;
87  Value *Acc = nullptr;
88  MulCandList Muls;
89  MulPairList MulPairs;
91 
92  public:
93  Reduction() = delete;
94 
95  Reduction (Instruction *Add) : Root(Add) { }
96 
97  /// Record an Add instruction that is a part of the this reduction.
98  void InsertAdd(Instruction *I) { Adds.insert(I); }
99 
100  /// Create MulCandidates, each rooted at a Mul instruction, that is a part
101  /// of this reduction.
102  void InsertMuls() {
103  auto GetMulOperand = [](Value *V) -> Instruction* {
104  if (auto *SExt = dyn_cast<SExtInst>(V)) {
105  if (auto *I = dyn_cast<Instruction>(SExt->getOperand(0)))
106  if (I->getOpcode() == Instruction::Mul)
107  return I;
108  } else if (auto *I = dyn_cast<Instruction>(V)) {
109  if (I->getOpcode() == Instruction::Mul)
110  return I;
111  }
112  return nullptr;
113  };
114 
115  auto InsertMul = [this](Instruction *I) {
116  Value *LHS = cast<Instruction>(I->getOperand(0))->getOperand(0);
117  Value *RHS = cast<Instruction>(I->getOperand(1))->getOperand(0);
118  Muls.push_back(std::make_unique<MulCandidate>(I, LHS, RHS));
119  };
120 
121  for (auto *Add : Adds) {
122  if (Add == Acc)
123  continue;
124  if (auto *Mul = GetMulOperand(Add->getOperand(0)))
125  InsertMul(Mul);
126  if (auto *Mul = GetMulOperand(Add->getOperand(1)))
127  InsertMul(Mul);
128  }
129  }
130 
131  /// Add the incoming accumulator value, returns true if a value had not
132  /// already been added. Returning false signals to the user that this
133  /// reduction already has a value to initialise the accumulator.
134  bool InsertAcc(Value *V) {
135  if (Acc)
136  return false;
137  Acc = V;
138  return true;
139  }
140 
141  /// Set two MulCandidates, rooted at muls, that can be executed as a single
142  /// parallel operation.
143  void AddMulPair(MulCandidate *Mul0, MulCandidate *Mul1,
144  bool Exchange = false) {
145  LLVM_DEBUG(dbgs() << "Pairing:\n"
146  << *Mul0->Root << "\n"
147  << *Mul1->Root << "\n");
148  Mul0->Paired = true;
149  Mul1->Paired = true;
150  if (Exchange)
151  Mul1->Exchange = true;
152  MulPairs.push_back(std::make_pair(Mul0, Mul1));
153  }
154 
155  /// Return true if enough mul operations are found that can be executed in
156  /// parallel.
157  bool CreateParallelPairs();
158 
159  /// Return the add instruction which is the root of the reduction.
160  Instruction *getRoot() { return Root; }
161 
162  bool is64Bit() const { return Root->getType()->isIntegerTy(64); }
163 
164  Type *getType() const { return Root->getType(); }
165 
166  /// Return the incoming value to be accumulated. This maybe null.
167  Value *getAccumulator() { return Acc; }
168 
169  /// Return the set of adds that comprise the reduction.
170  SetVector<Instruction*> &getAdds() { return Adds; }
171 
172  /// Return the MulCandidate, rooted at mul instruction, that comprise the
173  /// the reduction.
174  MulCandList &getMuls() { return Muls; }
175 
176  /// Return the MulCandidate, rooted at mul instructions, that have been
177  /// paired for parallel execution.
178  MulPairList &getMulPairs() { return MulPairs; }
179 
180  /// To finalise, replace the uses of the root with the intrinsic call.
181  void UpdateRoot(Instruction *SMLAD) {
182  Root->replaceAllUsesWith(SMLAD);
183  }
184 
185  void dump() {
186  LLVM_DEBUG(dbgs() << "Reduction:\n";
187  for (auto *Add : Adds)
188  LLVM_DEBUG(dbgs() << *Add << "\n");
189  for (auto &Mul : Muls)
190  LLVM_DEBUG(dbgs() << *Mul->Root << "\n"
191  << " " << *Mul->LHS << "\n"
192  << " " << *Mul->RHS << "\n");
193  LLVM_DEBUG(if (Acc) dbgs() << "Acc in: " << *Acc << "\n")
194  );
195  }
196  };
197 
198  class WidenedLoad {
199  LoadInst *NewLd = nullptr;
201 
202  public:
203  WidenedLoad(SmallVectorImpl<LoadInst*> &Lds, LoadInst *Wide)
204  : NewLd(Wide) {
205  append_range(Loads, Lds);
206  }
207  LoadInst *getLoad() {
208  return NewLd;
209  }
210  };
211 
212  class ARMParallelDSP : public FunctionPass {
213  ScalarEvolution *SE;
214  AliasAnalysis *AA;
215  TargetLibraryInfo *TLI;
216  DominatorTree *DT;
217  const DataLayout *DL;
218  Module *M;
219  std::map<LoadInst*, LoadInst*> LoadPairs;
220  SmallPtrSet<LoadInst*, 4> OffsetLoads;
221  std::map<LoadInst*, std::unique_ptr<WidenedLoad>> WideLoads;
222 
223  template<unsigned>
224  bool IsNarrowSequence(Value *V);
225  bool Search(Value *V, BasicBlock *BB, Reduction &R);
226  bool RecordMemoryOps(BasicBlock *BB);
227  void InsertParallelMACs(Reduction &Reduction);
228  bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
229  LoadInst* CreateWideLoad(MemInstList &Loads, IntegerType *LoadTy);
230  bool CreateParallelPairs(Reduction &R);
231 
232  /// Try to match and generate: SMLAD, SMLADX - Signed Multiply Accumulate
233  /// Dual performs two signed 16x16-bit multiplications. It adds the
234  /// products to a 32-bit accumulate operand. Optionally, the instruction can
235  /// exchange the halfwords of the second operand before performing the
236  /// arithmetic.
237  bool MatchSMLAD(Function &F);
238 
239  public:
240  static char ID;
241 
242  ARMParallelDSP() : FunctionPass(ID) { }
243 
244  void getAnalysisUsage(AnalysisUsage &AU) const override {
254  AU.setPreservesCFG();
255  }
256 
257  bool runOnFunction(Function &F) override {
258  if (DisableParallelDSP)
259  return false;
260  if (skipFunction(F))
261  return false;
262 
263  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
264  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
265  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
266  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
267  auto &TPC = getAnalysis<TargetPassConfig>();
268 
269  M = F.getParent();
270  DL = &M->getDataLayout();
271 
272  auto &TM = TPC.getTM<TargetMachine>();
273  auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
274 
275  if (!ST->allowsUnalignedMem()) {
276  LLVM_DEBUG(dbgs() << "Unaligned memory access not supported: not "
277  "running pass ARMParallelDSP\n");
278  return false;
279  }
280 
281  if (!ST->hasDSP()) {
282  LLVM_DEBUG(dbgs() << "DSP extension not enabled: not running pass "
283  "ARMParallelDSP\n");
284  return false;
285  }
286 
287  if (!ST->isLittle()) {
288  LLVM_DEBUG(dbgs() << "Only supporting little endian: not running pass "
289  << "ARMParallelDSP\n");
290  return false;
291  }
292 
293  LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
294  LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
295 
296  bool Changes = MatchSMLAD(F);
297  return Changes;
298  }
299  };
300 }
301 
302 template<typename MemInst>
303 static bool AreSequentialAccesses(MemInst *MemOp0, MemInst *MemOp1,
304  const DataLayout &DL, ScalarEvolution &SE) {
305  if (isConsecutiveAccess(MemOp0, MemOp1, DL, SE))
306  return true;
307  return false;
308 }
309 
310 bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1,
311  MemInstList &VecMem) {
312  if (!Ld0 || !Ld1)
313  return false;
314 
315  if (!LoadPairs.count(Ld0) || LoadPairs[Ld0] != Ld1)
316  return false;
317 
318  LLVM_DEBUG(dbgs() << "Loads are sequential and valid:\n";
319  dbgs() << "Ld0:"; Ld0->dump();
320  dbgs() << "Ld1:"; Ld1->dump();
321  );
322 
323  VecMem.clear();
324  VecMem.push_back(Ld0);
325  VecMem.push_back(Ld1);
326  return true;
327 }
328 
329 // MaxBitwidth: the maximum supported bitwidth of the elements in the DSP
330 // instructions, which is set to 16. So here we should collect all i8 and i16
331 // narrow operations.
332 // TODO: we currently only collect i16, and will support i8 later, so that's
333 // why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth.
334 template<unsigned MaxBitWidth>
335 bool ARMParallelDSP::IsNarrowSequence(Value *V) {
336  if (auto *SExt = dyn_cast<SExtInst>(V)) {
337  if (SExt->getSrcTy()->getIntegerBitWidth() != MaxBitWidth)
338  return false;
339 
340  if (auto *Ld = dyn_cast<LoadInst>(SExt->getOperand(0))) {
341  // Check that this load could be paired.
342  return LoadPairs.count(Ld) || OffsetLoads.count(Ld);
343  }
344  }
345  return false;
346 }
347 
348 /// Iterate through the block and record base, offset pairs of loads which can
349 /// be widened into a single load.
350 bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
353  LoadPairs.clear();
354  WideLoads.clear();
355 
356  // Collect loads and instruction that may write to memory. For now we only
357  // record loads which are simple, sign-extended and have a single user.
358  // TODO: Allow zero-extended loads.
359  for (auto &I : *BB) {
360  if (I.mayWriteToMemory())
361  Writes.push_back(&I);
362  auto *Ld = dyn_cast<LoadInst>(&I);
363  if (!Ld || !Ld->isSimple() ||
364  !Ld->hasOneUse() || !isa<SExtInst>(Ld->user_back()))
365  continue;
366  Loads.push_back(Ld);
367  }
368 
369  if (Loads.empty() || Loads.size() > NumLoadLimit)
370  return false;
371 
372  using InstSet = std::set<Instruction*>;
373  using DepMap = std::map<Instruction*, InstSet>;
374  DepMap RAWDeps;
375 
376  // Record any writes that may alias a load.
378  for (auto Write : Writes) {
379  for (auto Read : Loads) {
380  MemoryLocation ReadLoc =
381  MemoryLocation(Read->getPointerOperand(), Size);
382 
383  if (!isModOrRefSet(intersectModRef(AA->getModRefInfo(Write, ReadLoc),
385  continue;
386  if (Write->comesBefore(Read))
387  RAWDeps[Read].insert(Write);
388  }
389  }
390 
391  // Check whether there's not a write between the two loads which would
392  // prevent them from being safely merged.
393  auto SafeToPair = [&](LoadInst *Base, LoadInst *Offset) {
394  bool BaseFirst = Base->comesBefore(Offset);
395  LoadInst *Dominator = BaseFirst ? Base : Offset;
396  LoadInst *Dominated = BaseFirst ? Offset : Base;
397 
398  if (RAWDeps.count(Dominated)) {
399  InstSet &WritesBefore = RAWDeps[Dominated];
400 
401  for (auto Before : WritesBefore) {
402  // We can't move the second load backward, past a write, to merge
403  // with the first load.
404  if (Dominator->comesBefore(Before))
405  return false;
406  }
407  }
408  return true;
409  };
410 
411  // Record base, offset load pairs.
412  for (auto *Base : Loads) {
413  for (auto *Offset : Loads) {
414  if (Base == Offset || OffsetLoads.count(Offset))
415  continue;
416 
417  if (AreSequentialAccesses<LoadInst>(Base, Offset, *DL, *SE) &&
418  SafeToPair(Base, Offset)) {
419  LoadPairs[Base] = Offset;
420  OffsetLoads.insert(Offset);
421  break;
422  }
423  }
424  }
425 
426  LLVM_DEBUG(if (!LoadPairs.empty()) {
427  dbgs() << "Consecutive load pairs:\n";
428  for (auto &MapIt : LoadPairs) {
429  LLVM_DEBUG(dbgs() << *MapIt.first << ", "
430  << *MapIt.second << "\n");
431  }
432  });
433  return LoadPairs.size() > 1;
434 }
435 
// Search recursively back through the operands to find a tree of values that
// form a multiply-accumulate chain. The search records the Add and Mul
// instructions that form the reduction and allows us to find a single value
// to be used as the initial input to the accumulator.
bool ARMParallelDSP::Search(Value *V, BasicBlock *BB, Reduction &R) {
  // If we find a non-instruction, try to use it as the initial accumulator
  // value. This may have already been found during the search in which case
  // this function will return false, signaling a search fail.
  auto *I = dyn_cast<Instruction>(V);
  if (!I)
    return R.InsertAcc(V);

  // The whole chain must live in a single block.
  if (I->getParent() != BB)
    return false;

  switch (I->getOpcode()) {
  default:
    break;
  case Instruction::PHI:
    // Could be the accumulator value.
    return R.InsertAcc(V);
  case Instruction::Add: {
    // Adds should be adding together two muls, or another add and a mul to
    // be within the mac chain. One of the operands may also be the
    // accumulator value at which point we should stop searching.
    R.InsertAdd(I);
    Value *LHS = I->getOperand(0);
    Value *RHS = I->getOperand(1);
    bool ValidLHS = Search(LHS, BB, R);
    bool ValidRHS = Search(RHS, BB, R);

    if (ValidLHS && ValidRHS)
      return true;

    // One operand failed the search: try to use this add's value as the
    // accumulator instead (fails if one was already recorded).
    return R.InsertAcc(I);
  }
  case Instruction::Mul: {
    // A mul terminates a chain only if both of its operands are
    // sign-extended narrow (16-bit) values that trace back to pairable
    // loads.
    Value *MulOp0 = I->getOperand(0);
    Value *MulOp1 = I->getOperand(1);
    return IsNarrowSequence<16>(MulOp0) && IsNarrowSequence<16>(MulOp1);
  }
  case Instruction::SExt:
    // Look through sign extensions.
    return Search(I->getOperand(0), BB, R);
  }
  return false;
}
482 
483 // The pass needs to identify integer add/sub reductions of 16-bit vector
484 // multiplications.
485 // To use SMLAD:
486 // 1) we first need to find integer add then look for this pattern:
487 //
488 // acc0 = ...
489 // ld0 = load i16
490 // sext0 = sext i16 %ld0 to i32
491 // ld1 = load i16
492 // sext1 = sext i16 %ld1 to i32
493 // mul0 = mul %sext0, %sext1
494 // ld2 = load i16
495 // sext2 = sext i16 %ld2 to i32
496 // ld3 = load i16
497 // sext3 = sext i16 %ld3 to i32
498 // mul1 = mul i32 %sext2, %sext3
499 // add0 = add i32 %mul0, %acc0
500 // acc1 = add i32 %add0, %mul1
501 //
502 // Which can be selected to:
503 //
504 // ldr r0
505 // ldr r1
506 // smlad r2, r0, r1, r2
507 //
508 // If constants are used instead of loads, these will need to be hoisted
509 // out and into a register.
510 //
511 // If loop invariants are used instead of loads, these need to be packed
512 // before the loop begins.
513 //
514 bool ARMParallelDSP::MatchSMLAD(Function &F) {
515  bool Changed = false;
516 
517  for (auto &BB : F) {
519  if (!RecordMemoryOps(&BB))
520  continue;
521 
522  for (Instruction &I : reverse(BB)) {
523  if (I.getOpcode() != Instruction::Add)
524  continue;
525 
526  if (AllAdds.count(&I))
527  continue;
528 
529  const auto *Ty = I.getType();
530  if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
531  continue;
532 
533  Reduction R(&I);
534  if (!Search(&I, &BB, R))
535  continue;
536 
537  R.InsertMuls();
538  LLVM_DEBUG(dbgs() << "After search, Reduction:\n"; R.dump());
539 
540  if (!CreateParallelPairs(R))
541  continue;
542 
543  InsertParallelMACs(R);
544  Changed = true;
545  AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
546  }
547  }
548 
549  return Changed;
550 }
551 
/// Try to pair up the reduction's MulCandidates so that each pair reads two
/// consecutive load pairs and can be executed as one dual-multiply.
/// Returns true if at least one pair was formed.
bool ARMParallelDSP::CreateParallelPairs(Reduction &R) {

  // Not enough mul operations to make a pair.
  if (R.getMuls().size() < 2)
    return false;

  // Check that the muls operate directly upon sign extended loads.
  for (auto &MulCand : R.getMuls()) {
    if (!MulCand->HasTwoLoadInputs())
      return false;
  }

  auto CanPair = [&](Reduction &R, MulCandidate *PMul0, MulCandidate *PMul1) {
    // The first elements of each vector should be loads with sexts. If we
    // find that its two pairs of consecutive loads, then these can be
    // transformed into two wider loads and the users can be replaced with
    // DSP intrinsics.
    auto Ld0 = static_cast<LoadInst*>(PMul0->LHS);
    auto Ld1 = static_cast<LoadInst*>(PMul1->LHS);
    auto Ld2 = static_cast<LoadInst*>(PMul0->RHS);
    auto Ld3 = static_cast<LoadInst*>(PMul1->RHS);

    // Check that each mul is operating on two different loads.
    if (Ld0 == Ld2 || Ld1 == Ld3)
      return false;

    // Three accepted layouts: (Ld0,Ld1)+(Ld2,Ld3) directly; (Ld0,Ld1) with
    // (Ld3,Ld2) reversed — handled by exchanging the second operand; or
    // (Ld1,Ld0) reversed with (Ld2,Ld3) — handled by swapping the muls,
    // since only the second operand of the intrinsic can be exchanged.
    if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
      if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
        LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
        R.AddMulPair(PMul0, PMul1);
        return true;
      } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) {
        LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
        LLVM_DEBUG(dbgs() << "    exchanging Ld2 and Ld3\n");
        R.AddMulPair(PMul0, PMul1, true);
        return true;
      }
    } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) &&
               AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
      LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
      LLVM_DEBUG(dbgs() << "    exchanging Ld0 and Ld1\n");
      LLVM_DEBUG(dbgs() << "    and swapping muls\n");
      // Only the second operand can be exchanged, so swap the muls.
      R.AddMulPair(PMul1, PMul0, true);
      return true;
    }
    return false;
  };

  // Greedy O(n^2) pairing: for each unpaired candidate, take the first
  // other unpaired candidate it can legally pair with.
  MulCandList &Muls = R.getMuls();
  const unsigned Elems = Muls.size();
  for (unsigned i = 0; i < Elems; ++i) {
    MulCandidate *PMul0 = static_cast<MulCandidate*>(Muls[i].get());
    if (PMul0->Paired)
      continue;

    for (unsigned j = 0; j < Elems; ++j) {
      if (i == j)
        continue;

      MulCandidate *PMul1 = static_cast<MulCandidate*>(Muls[j].get());
      if (PMul1->Paired)
        continue;

      // Distinct candidates may share a root mul; those can't be paired.
      const Instruction *Mul0 = PMul0->Root;
      const Instruction *Mul1 = PMul1->Root;
      if (Mul0 == Mul1)
        continue;

      assert(PMul0 != PMul1 && "expected different chains");

      if (CanPair(R, PMul0, PMul1))
        break;
    }
  }
  return !R.getMulPairs().empty();
}
629 
/// Rewrite the reduction: widen each paired candidate's loads, chain
/// smlad/smlald(x) intrinsic calls through the accumulator, fold any
/// unpaired muls back in with plain adds, and finally replace all uses of
/// the reduction's root with the result.
void ARMParallelDSP::InsertParallelMACs(Reduction &R) {

  // Build one intrinsic call: 32-bit accumulators map to smlad/smladx,
  // 64-bit to smlald/smlaldx; the X variants exchange the halfwords of the
  // second operand.
  auto CreateSMLAD = [&](LoadInst* WideLd0, LoadInst *WideLd1,
                         Value *Acc, bool Exchange,
                         Instruction *InsertAfter) {
    // Replace the reduction chain with an intrinsic call

    Value* Args[] = { WideLd0, WideLd1, Acc };
    Function *SMLAD = nullptr;
    if (Exchange)
      SMLAD = Acc->getType()->isIntegerTy(32) ?
        Intrinsic::getDeclaration(M, Intrinsic::arm_smladx) :
        Intrinsic::getDeclaration(M, Intrinsic::arm_smlaldx);
    else
      SMLAD = Acc->getType()->isIntegerTy(32) ?
        Intrinsic::getDeclaration(M, Intrinsic::arm_smlad) :
        Intrinsic::getDeclaration(M, Intrinsic::arm_smlald);

    IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
                                BasicBlock::iterator(InsertAfter));
    Instruction *Call = Builder.CreateCall(SMLAD, Args);
    NumSMLAD++;
    return Call;
  };

  // Return the instruction after the dominated instruction.
  auto GetInsertPoint = [this](Value *A, Value *B) {
    assert((isa<Instruction>(A) || isa<Instruction>(B)) &&
           "expected at least one instruction");

    Value *V = nullptr;
    if (!isa<Instruction>(A))
      V = B;
    else if (!isa<Instruction>(B))
      V = A;
    else
      V = DT->dominates(cast<Instruction>(A), cast<Instruction>(B)) ? B : A;

    return &*++BasicBlock::iterator(cast<Instruction>(V));
  };

  Value *Acc = R.getAccumulator();

  // For any muls that were discovered but not paired, accumulate their values
  // as before.
  IRBuilder<NoFolder> Builder(R.getRoot()->getParent());
  MulCandList &MulCands = R.getMuls();
  for (auto &MulCand : MulCands) {
    if (MulCand->Paired)
      continue;

    Instruction *Mul = cast<Instruction>(MulCand->Root);
    LLVM_DEBUG(dbgs() << "Accumulating unpaired mul: " << *Mul << "\n");

    // A 64-bit reduction needs the 32-bit mul result sign-extended first.
    if (R.getType() != Mul->getType()) {
      assert(R.is64Bit() && "expected 64-bit result");
      Builder.SetInsertPoint(&*++BasicBlock::iterator(Mul));
      Mul = cast<Instruction>(Builder.CreateSExt(Mul, R.getRoot()->getType()));
    }

    if (!Acc) {
      Acc = Mul;
      continue;
    }

    // If Acc is the original incoming value to the reduction, it could be a
    // phi. But the phi will dominate Mul, meaning that Mul will be the
    // insertion point.
    Builder.SetInsertPoint(GetInsertPoint(Mul, Acc));
    Acc = Builder.CreateAdd(Mul, Acc);
  }

  // No incoming accumulator and no unpaired muls: start from zero of the
  // reduction's width. Otherwise widen a narrower accumulator to match.
  if (!Acc) {
    Acc = R.is64Bit() ?
      ConstantInt::get(IntegerType::get(M->getContext(), 64), 0) :
      ConstantInt::get(IntegerType::get(M->getContext(), 32), 0);
  } else if (Acc->getType() != R.getType()) {
    Builder.SetInsertPoint(R.getRoot());
    Acc = Builder.CreateSExt(Acc, R.getType());
  }

  // Roughly sort the mul pairs in their program order.
  llvm::sort(R.getMulPairs(), [](auto &PairA, auto &PairB) {
    const Instruction *A = PairA.first->Root;
    const Instruction *B = PairB.first->Root;
    return A->comesBefore(B);
  });

  IntegerType *Ty = IntegerType::get(M->getContext(), 32);
  for (auto &Pair : R.getMulPairs()) {
    MulCandidate *LHSMul = Pair.first;
    MulCandidate *RHSMul = Pair.second;
    LoadInst *BaseLHS = LHSMul->getBaseLoad();
    LoadInst *BaseRHS = RHSMul->getBaseLoad();
    // Reuse an already-created wide load for the same base if possible.
    LoadInst *WideLHS = WideLoads.count(BaseLHS) ?
      WideLoads[BaseLHS]->getLoad() : CreateWideLoad(LHSMul->VecLd, Ty);
    LoadInst *WideRHS = WideLoads.count(BaseRHS) ?
      WideLoads[BaseRHS]->getLoad() : CreateWideLoad(RHSMul->VecLd, Ty);

    // Insert after whichever of the operands/accumulator comes last, so
    // every operand dominates the call.
    Instruction *InsertAfter = GetInsertPoint(WideLHS, WideRHS);
    InsertAfter = GetInsertPoint(InsertAfter, Acc);
    Acc = CreateSMLAD(WideLHS, WideRHS, Acc, RHSMul->Exchange, InsertAfter);
  }
  R.UpdateRoot(cast<Instruction>(Acc));
}
735 
/// Replace two adjacent narrow loads with one load of LoadTy, recreate the
/// two original sign-extended values from the wide load (trunc for the
/// bottom half, lshr+trunc for the top), and record the result in
/// WideLoads. Returns the new wide load.
LoadInst* ARMParallelDSP::CreateWideLoad(MemInstList &Loads,
                                         IntegerType *LoadTy) {
  assert(Loads.size() == 2 && "currently only support widening two loads");

  LoadInst *Base = Loads[0];
  LoadInst *Offset = Loads[1];

  // RecordMemoryOps only collected loads whose single user is a sext.
  Instruction *BaseSExt = dyn_cast<SExtInst>(Base->user_back());
  Instruction *OffsetSExt = dyn_cast<SExtInst>(Offset->user_back());

  assert((BaseSExt && OffsetSExt)
         && "Loads should have a single, extending, user");

  // Recursively hoist 'A' (and the instructions computing its operands)
  // above 'B' when it doesn't already dominate it. Declared as a
  // std::function so the lambda can recurse into the operand chain.
  std::function<void(Value*, Value*)> MoveBefore =
    [&](Value *A, Value *B) -> void {
      if (!isa<Instruction>(A) || !isa<Instruction>(B))
        return;

      auto *Source = cast<Instruction>(A);
      auto *Sink = cast<Instruction>(B);

      if (DT->dominates(Source, Sink) ||
          Source->getParent() != Sink->getParent() ||
          isa<PHINode>(Source) || isa<PHINode>(Sink))
        return;

      Source->moveBefore(Sink);
      for (auto &Op : Source->operands())
        MoveBefore(Op, Source);
    };

  // Insert the load at the point of the original dominating load.
  LoadInst *DomLoad = DT->dominates(Base, Offset) ? Base : Offset;
  IRBuilder<NoFolder> IRB(DomLoad->getParent(),
                          ++BasicBlock::iterator(DomLoad));

  // Bitcast the pointer to a wider type and create the wide load, while making
  // sure to maintain the original alignment as this prevents ldrd from being
  // generated when it could be illegal due to memory alignment.
  const unsigned AddrSpace = DomLoad->getPointerAddressSpace();
  Value *VecPtr = IRB.CreateBitCast(Base->getPointerOperand(),
                                    LoadTy->getPointerTo(AddrSpace));
  LoadInst *WideLoad = IRB.CreateAlignedLoad(LoadTy, VecPtr, Base->getAlign());

  // Make sure everything is in the correct order in the basic block.
  MoveBefore(Base->getPointerOperand(), VecPtr);
  MoveBefore(VecPtr, WideLoad);

  // From the wide load, create two values that equal the original two loads.
  // Loads[0] needs trunc while Loads[1] needs a lshr and trunc.
  // TODO: Support big-endian as well.
  Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType());
  Value *NewBaseSExt = IRB.CreateSExt(Bottom, BaseSExt->getType());
  BaseSExt->replaceAllUsesWith(NewBaseSExt);

  // Top half: shift down by the narrow width, truncate, re-extend.
  IntegerType *OffsetTy = cast<IntegerType>(Offset->getType());
  Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth());
  Value *Top = IRB.CreateLShr(WideLoad, ShiftVal);
  Value *Trunc = IRB.CreateTrunc(Top, OffsetTy);
  Value *NewOffsetSExt = IRB.CreateSExt(Trunc, OffsetSExt->getType());
  OffsetSExt->replaceAllUsesWith(NewOffsetSExt);

  LLVM_DEBUG(dbgs() << "From Base and Offset:\n"
             << *Base << "\n" << *Offset << "\n"
             << "Created Wide Load:\n"
             << *WideLoad << "\n"
             << *Bottom << "\n"
             << *NewBaseSExt << "\n"
             << *Top << "\n"
             << *Trunc << "\n"
             << *NewOffsetSExt << "\n");
  // Cache the widened load so later pairs sharing this base reuse it.
  WideLoads.emplace(std::make_pair(Base,
                    std::make_unique<WidenedLoad>(Loads, WideLoad)));
  return WideLoad;
}
811 
813  return new ARMParallelDSP();
814 }
815 
// Pass identification; the address of ID is the unique key used by the
// legacy pass manager.
char ARMParallelDSP::ID = 0;

// Legacy pass-manager registration under the name "arm-parallel-dsp".
INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp",
                "Transform functions to use DSP intrinsics", false, false)
INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp",
                "Transform functions to use DSP intrinsics", false, false)
llvm::Check::Size
@ Size
Definition: FileCheck.h:73
i
i
Definition: README.txt:29
ARMSubtarget.h
use
Move duplicate certain instructions close to their use
Definition: Localizer.cpp:31
functions
amdgpu propagate attributes Late propagate attributes from kernels to functions
Definition: AMDGPUPropagateAttributes.cpp:199
AssumptionCache.h
llvm
Definition: AllocatorList.h:23
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::SystemZISD::TM
@ TM
Definition: SystemZISelLowering.h:65
NoFolder.h
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:112
llvm::Intrinsic::getDeclaration
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1318
llvm::BasicBlock::iterator
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:90
llvm::isModOrRefSet
LLVM_NODISCARD bool isModOrRefSet(const ModRefInfo MRI)
Definition: AliasAnalysis.h:189
llvm::ARMSubtarget
Definition: ARMSubtarget.h:46
Scalar.h
llvm::Function
Definition: Function.h:61
AreSequentialAccesses
static bool AreSequentialAccesses(MemInst *MemOp0, MemInst *MemOp1, const DataLayout &DL, ScalarEvolution &SE)
Definition: ARMParallelDSP.cpp:303
llvm::Value::dump
void dump() const
Support for debugging, callable in GDB: V->dump()
Definition: AsmWriter.cpp:4764
Pass.h
is64Bit
static bool is64Bit(const char *name)
Definition: X86Disassembler.cpp:1005
NumLoadLimit
static cl::opt< unsigned > NumLoadLimit("arm-parallel-dsp-load-limit", cl::Hidden, cl::init(16), cl::desc("Limit the number of loads analysed"))
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1168
Statistic.h
LoopAccessAnalysis.h
llvm::IRBuilder
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2586
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:443
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:151
GlobalsModRef.h
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:143
to
Should compile to
Definition: README.txt:449
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:46
llvm::reverse
auto reverse(ContainerTy &&C, std::enable_if_t< has_rbegin< ContainerTy >::value > *=nullptr)
Definition: STLExtras.h:329
llvm::Instruction::comesBefore
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
Definition: Instruction.cpp:111
dsp
arm parallel dsp
Definition: ARMParallelDSP.cpp:820
llvm::isConsecutiveAccess
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
Definition: LoopAccessAnalysis.cpp:1235
Offset
uint64_t Offset
Definition: ELFObjHandler.cpp:81
llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:449
llvm::dump
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Definition: SparseBitVector.h:876
llvm::LoadInst::getPointerOperand
Value * getPointerOperand()
Definition: Instructions.h:266
llvm::LoadInst::getAlign
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:222
PassRegistry.h
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:122
F
#define F(x, y, z)
Definition: MD5.cpp:56
llvm::RISCVFenceField::R
@ R
Definition: RISCVBaseInfo.h:133
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
AliasAnalysis.h
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
llvm::DominatorTree::dominates
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:115
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:77
llvm::Intrinsic::getType
FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys=None)
Return the function type for an intrinsic.
Definition: Function.cpp:1274
llvm::AAResults
Definition: AliasAnalysis.h:456
llvm::ARM_PROC::A
@ A
Definition: ARMBaseInfo.h:34
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
TargetLibraryInfo.h
false
Definition: StackSlotColoring.cpp:142
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
llvm::IntegerType
Class to represent integer types.
Definition: DerivedTypes.h:40
llvm::Instruction
Definition: Instruction.h:45
llvm::DominatorTreeWrapperPass
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:281
llvm::STATISTIC
STATISTIC(NumFunctions, "Total number of functions")
llvm::ConstantInt::get
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:885
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp", "Transform functions to use DSP intrinsics", false, false) INITIALIZE_PASS_END(ARMParallelDSP
llvm::ScalarEvolutionWrapperPass
Definition: ScalarEvolution.h:2104
SmallPtrSet.h
PatternMatch.h
llvm::MCID::Call
@ Call
Definition: MCInstrDesc.h:153
llvm::codeview::ClassOptions::Intrinsic
@ Intrinsic
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
llvm::Type::isIntegerTy
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:202
llvm::TargetPassConfig
Target-Independent Code Generator Pass Configuration Options.
Definition: TargetPassConfig.h:84
llvm::cl::opt< bool >
llvm::LoadInst::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:272
llvm::pdb::OMFSegDescFlags::Read
@ Read
llvm::TargetLibraryInfoWrapperPass
Definition: TargetLibraryInfo.h:446
llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73
llvm::Instruction::user_back
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:91
I
#define I(x, y, z)
Definition: MD5.cpp:59
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
intrinsics
arm parallel Transform functions to use DSP intrinsics
Definition: ARMParallelDSP.cpp:821
TargetPassConfig.h
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
function
print Print MemDeps of function
Definition: MemDepPrinter.cpp:83
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
ARM.h
llvm::SmallPtrSetImpl::count
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:382
Builder
assume Assume Builder
Definition: AssumeBundleBuilder.cpp:649
llvm::SetVector::insert
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:141
llvm::Sched::Source
@ Source
Definition: TargetLowering.h:99
llvm::AssumptionCacheTracker
An immutable pass that tracks lazily created AssumptionCache objects.
Definition: AssumptionCache.h:200
llvm::AnalysisUsage::setPreservesCFG
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:253
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
llvm::cl::Sink
@ Sink
Definition: CommandLine.h:171
llvm::AnalysisUsage::addPreserved
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
Definition: PassAnalysisSupport.h:98
llvm::Value::replaceAllUsesWith
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:527
llvm::append_range
void append_range(Container &C, Range &&R)
Wrapper function to append a range to a container.
Definition: STLExtras.h:1667
DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: AArch64SLSHardening.cpp:76
llvm::RecurKind::Mul
@ Mul
Product of integers.
llvm::createARMParallelDSPPass
Pass * createARMParallelDSPPass()
Definition: ARMParallelDSP.cpp:812
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:174
runOnFunction
static bool runOnFunction(Function &F, bool PostInlining)
Definition: EntryExitInstrumenter.cpp:69
j
return j(j<< 16)
get
Should compile to something r4 addze r3 instead we get
Definition: README.txt:24
llvm::LocationSize::beforeOrAfterPointer
constexpr static LocationSize beforeOrAfterPointer()
Any location before or after the base pointer (but still within the underlying object).
Definition: MemoryLocation.h:129
llvm::AMDGPU::SendMsg::Op
Op
Definition: SIDefines.h:314
llvm::sort
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1423
llvm::Type::getPointerTo
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
Definition: Type.cpp:709
llvm::TargetLibraryInfo
Provides information about what library functions are available for the current target.
Definition: TargetLibraryInfo.h:207
llvm::SmallVectorImpl::clear
void clear()
Definition: SmallVector.h:585
llvm::MCID::Add
@ Add
Definition: MCInstrDesc.h:183
llvm::Pass
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:91
Instructions.h
llvm::AAResultsWrapperPass
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
Definition: AliasAnalysis.h:1269
DisableParallelDSP
static cl::opt< bool > DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false), cl::desc("Disable the ARM Parallel DSP pass"))
llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:94
llvm::GlobalsAAWrapperPass
Legacy wrapper pass to provide the GlobalsAAResult object.
Definition: GlobalsModRef.h:143
llvm::IntegerType::getBitWidth
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
Definition: DerivedTypes.h:71
llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:43
llvm::GlobalValue::getType
PointerType * getType() const
Global values are always pointers.
Definition: GlobalValue.h:271
llvm::Pass::getAnalysisUsage
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:93
llvm::IntegerType::get
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:269
llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:298
BB
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM BB
Definition: README.txt:39
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::pdb::OMFSegDescFlags::Write
@ Write
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition: AMDGPUMetadata.h:389
llvm::cl::desc
Definition: CommandLine.h:414
llvm::SetVector
A vector that has set insertion semantics.
Definition: SetVector.h:40
BasicBlockUtils.h
llvm::intersectModRef
LLVM_NODISCARD ModRefInfo intersectModRef(const ModRefInfo MRI1, const ModRefInfo MRI2)
Definition: AliasAnalysis.h:236
Reduction
loop Loop Strength Reduction
Definition: LoopStrengthReduce.cpp:6004
llvm::Value
LLVM Value Representation.
Definition: Value.h:75
Debug.h
llvm::AAResults::getModRefInfo
ModRefInfo getModRefInfo(const CallBase *Call, const MemoryLocation &Loc)
getModRefInfo (for call sites) - Return information about whether a particular call site modifies or ...
Definition: AliasAnalysis.cpp:217
llvm::ModRefInfo::ModRef
@ ModRef
The access may reference and may modify the value stored in memory.
llvm::MemoryLocation
Representation for a specific memory location.
Definition: MemoryLocation.h:209
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:364
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38