LLVM  15.0.0git
ARMParallelDSP.cpp
Go to the documentation of this file.
1 //===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Armv6 introduced instructions to perform 32-bit SIMD operations. The
11 /// purpose of this pass is to do some IR pattern matching to create ACLE
12 /// DSP intrinsics, which map on these 32-bit SIMD operations.
13 /// This pass runs only when unaligned accesses are supported/enabled.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "ARM.h"
18 #include "ARMSubtarget.h"
19 #include "llvm/ADT/SmallPtrSet.h"
20 #include "llvm/ADT/Statistic.h"
27 #include "llvm/IR/Instructions.h"
28 #include "llvm/IR/IntrinsicsARM.h"
29 #include "llvm/IR/NoFolder.h"
30 #include "llvm/IR/PatternMatch.h"
31 #include "llvm/Pass.h"
32 #include "llvm/PassRegistry.h"
33 #include "llvm/Support/Debug.h"
34 #include "llvm/Transforms/Scalar.h"
36 
37 using namespace llvm;
38 using namespace PatternMatch;
39 
40 #define DEBUG_TYPE "arm-parallel-dsp"
41 
// Counts the DSP intrinsic calls (smlad/smladx/smlald/smlaldx) created by
// this pass; incremented in InsertParallelMACs.
42 STATISTIC(NumSMLAD , "Number of smlad instructions generated");
43 
// Command-line escape hatch: when set, runOnFunction bails out immediately.
44 static cl::opt<bool>
45 DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false),
46  cl::desc("Disable the ARM Parallel DSP pass"));
47 
// Upper bound on the number of candidate loads RecordMemoryOps will analyse
// in one basic block before giving up (the pairing scan below is quadratic).
48 static cl::opt<unsigned>
49 NumLoadLimit("arm-parallel-dsp-load-limit", cl::Hidden, cl::init(16),
50  cl::desc("Limit the number of loads analysed"))
51 
52 namespace {
53  struct MulCandidate;
54  class Reduction;
55 
// Owning list of the mul candidates discovered for one reduction.
56  using MulCandList = SmallVector<std::unique_ptr<MulCandidate>, 8>;
// Non-owning view over the loads that will be widened together.
57  using MemInstList = SmallVectorImpl<LoadInst*>;
// NOTE(review): source line 58 is not visible in this extract; it presumably
// declares the MulPairList alias used by class Reduction below -- confirm
// against the full file.
59 
60  // 'MulCandidate' holds the multiplication instructions that are candidates
61  // for parallel execution.
62  struct MulCandidate {
63  Instruction *Root;
64  Value* LHS;
65  Value* RHS;
66  bool Exchange = false;
67  bool ReadOnly = true;
68  bool Paired = false;
69  SmallVector<LoadInst*, 2> VecLd; // Container for loads to widen.
70 
71  MulCandidate(Instruction *I, Value *lhs, Value *rhs) :
72  Root(I), LHS(lhs), RHS(rhs) { }
73 
74  bool HasTwoLoadInputs() const {
75  return isa<LoadInst>(LHS) && isa<LoadInst>(RHS);
76  }
77 
78  LoadInst *getBaseLoad() const {
79  return VecLd.front();
80  }
81  };
82 
83  /// Represent a sequence of multiply-accumulate operations with the aim to
84  /// perform the multiplications in parallel.
85  class Reduction {
86  Instruction *Root = nullptr;
87  Value *Acc = nullptr;
88  MulCandList Muls;
89  MulPairList MulPairs;
// NOTE(review): the member declared on source line 90 is not visible in this
// extract; it presumably declares the 'Adds' SetVector used by InsertAdd()
// and getAdds() below -- confirm against the full file.
91 
92  public:
93  Reduction() = delete;
94 
// A reduction is always rooted at an Add instruction.
95  Reduction (Instruction *Add) : Root(Add) { }
96 
97  /// Record an Add instruction that is a part of this reduction.
98  void InsertAdd(Instruction *I) { Adds.insert(I); }
99 
100  /// Create MulCandidates, each rooted at a Mul instruction, that is a part
101  /// of this reduction.
102  void InsertMuls() {
// Return the Mul feeding this operand, looking through one sext if present.
103  auto GetMulOperand = [](Value *V) -> Instruction* {
104  if (auto *SExt = dyn_cast<SExtInst>(V)) {
105  if (auto *I = dyn_cast<Instruction>(SExt->getOperand(0)))
106  if (I->getOpcode() == Instruction::Mul)
107  return I;
108  } else if (auto *I = dyn_cast<Instruction>(V)) {
109  if (I->getOpcode() == Instruction::Mul)
110  return I;
111  }
112  return nullptr;
113  };
114 
// Record the mul, with the pre-extension values as LHS/RHS: each operand of
// the mul is assumed to be an instruction (a sext) whose operand 0 is taken.
115  auto InsertMul = [this](Instruction *I) {
116  Value *LHS = cast<Instruction>(I->getOperand(0))->getOperand(0);
117  Value *RHS = cast<Instruction>(I->getOperand(1))->getOperand(0);
118  Muls.push_back(std::make_unique<MulCandidate>(I, LHS, RHS));
119  };
120 
121  for (auto *Add : Adds) {
// Skip the add that was claimed as the incoming accumulator.
122  if (Add == Acc)
123  continue;
124  if (auto *Mul = GetMulOperand(Add->getOperand(0)))
125  InsertMul(Mul);
126  if (auto *Mul = GetMulOperand(Add->getOperand(1)))
127  InsertMul(Mul);
128  }
129  }
130 
131  /// Add the incoming accumulator value, returns true if a value had not
132  /// already been added. Returning false signals to the user that this
133  /// reduction already has a value to initialise the accumulator.
134  bool InsertAcc(Value *V) {
135  if (Acc)
136  return false;
137  Acc = V;
138  return true;
139  }
140 
141  /// Set two MulCandidates, rooted at muls, that can be executed as a single
142  /// parallel operation.
143  void AddMulPair(MulCandidate *Mul0, MulCandidate *Mul1,
144  bool Exchange = false) {
145  LLVM_DEBUG(dbgs() << "Pairing:\n"
146  << *Mul0->Root << "\n"
147  << *Mul1->Root << "\n");
148  Mul0->Paired = true;
149  Mul1->Paired = true;
// Only the second mul of a pair can have its halfwords exchanged.
150  if (Exchange)
151  Mul1->Exchange = true;
152  MulPairs.push_back(std::make_pair(Mul0, Mul1));
153  }
154 
155  /// Return true if enough mul operations are found that can be executed in
156  /// parallel.
157  bool CreateParallelPairs();
158 
159  /// Return the add instruction which is the root of the reduction.
160  Instruction *getRoot() { return Root; }
161 
// True when the reduction accumulates into a 64-bit integer (smlald family).
162  bool is64Bit() const { return Root->getType()->isIntegerTy(64); }
163 
164  Type *getType() const { return Root->getType(); }
165 
166  /// Return the incoming value to be accumulated. This may be null.
167  Value *getAccumulator() { return Acc; }
168 
169  /// Return the set of adds that comprise the reduction.
170  SetVector<Instruction*> &getAdds() { return Adds; }
171 
172  /// Return the MulCandidate, rooted at mul instruction, that comprise
173  /// the reduction.
174  MulCandList &getMuls() { return Muls; }
175 
176  /// Return the MulCandidate, rooted at mul instructions, that have been
177  /// paired for parallel execution.
178  MulPairList &getMulPairs() { return MulPairs; }
179 
180  /// To finalise, replace the uses of the root with the intrinsic call.
181  void UpdateRoot(Instruction *SMLAD) {
182  Root->replaceAllUsesWith(SMLAD);
183  }
184 
// Debug-only dump of the adds, muls and incoming accumulator.
185  void dump() {
186  LLVM_DEBUG(dbgs() << "Reduction:\n";
187  for (auto *Add : Adds)
188  LLVM_DEBUG(dbgs() << *Add << "\n");
189  for (auto &Mul : Muls)
190  LLVM_DEBUG(dbgs() << *Mul->Root << "\n"
191  << " " << *Mul->LHS << "\n"
192  << " " << *Mul->RHS << "\n");
193  LLVM_DEBUG(if (Acc) dbgs() << "Acc in: " << *Acc << "\n")
194  );
195  }
196  };
197 
// Associates the original narrow loads with the single wide load that
// replaced them, so a later pairing can reuse the widened value.
198  class WidenedLoad {
199  LoadInst *NewLd = nullptr;
// NOTE(review): source line 200 is not visible in this extract; it presumably
// declares the 'Loads' container appended to in the constructor -- confirm
// against the full file.
201 
202  public:
203  WidenedLoad(SmallVectorImpl<LoadInst*> &Lds, LoadInst *Wide)
204  : NewLd(Wide) {
205  append_range(Loads, Lds);
206  }
// Return the wide load that replaced the original pair.
207  LoadInst *getLoad() {
208  return NewLd;
209  }
210  };
211 
212  class ARMParallelDSP : public FunctionPass {
// Analyses fetched in runOnFunction; valid only while the pass runs.
213  ScalarEvolution *SE;
214  AliasAnalysis *AA;
215  TargetLibraryInfo *TLI;
216  DominatorTree *DT;
217  const DataLayout *DL;
218  Module *M;
// Map from a base load to the consecutive 'offset' load that follows it.
219  std::map<LoadInst*, LoadInst*> LoadPairs;
// Loads already claimed as the second element of some pair.
220  SmallPtrSet<LoadInst*, 4> OffsetLoads;
// Base load -> the wide load that replaced its pair (built lazily).
221  std::map<LoadInst*, std::unique_ptr<WidenedLoad>> WideLoads;
222 
223  template<unsigned>
224  bool IsNarrowSequence(Value *V);
225  bool Search(Value *V, BasicBlock *BB, Reduction &R);
226  bool RecordMemoryOps(BasicBlock *BB);
227  void InsertParallelMACs(Reduction &Reduction);
228  bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
229  LoadInst* CreateWideLoad(MemInstList &Loads, IntegerType *LoadTy);
230  bool CreateParallelPairs(Reduction &R);
231 
232  /// Try to match and generate: SMLAD, SMLADX - Signed Multiply Accumulate
233  /// Dual performs two signed 16x16-bit multiplications. It adds the
234  /// products to a 32-bit accumulate operand. Optionally, the instruction can
235  /// exchange the halfwords of the second operand before performing the
236  /// arithmetic.
237  bool MatchSMLAD(Function &F);
238 
239  public:
240  static char ID;
241 
242  ARMParallelDSP() : FunctionPass(ID) { }
243 
244  void getAnalysisUsage(AnalysisUsage &AU) const override {
// NOTE(review): source lines 245-253 are not visible in this extract; they
// presumably contain the AU.addRequired<...> calls for the analyses fetched
// in runOnFunction below -- confirm against the full file.
254  AU.setPreservesCFG();
255  }
256 
257  bool runOnFunction(Function &F) override {
258  if (DisableParallelDSP)
259  return false;
260  if (skipFunction(F))
261  return false;
262 
263  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
264  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
265  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
266  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
267  auto &TPC = getAnalysis<TargetPassConfig>();
268 
269  M = F.getParent();
270  DL = &M->getDataLayout();
271 
272  auto &TM = TPC.getTM<TargetMachine>();
273  auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
274 
// The wide loads created below may be misaligned, so the target must
// tolerate unaligned accesses.
275  if (!ST->allowsUnalignedMem()) {
276  LLVM_DEBUG(dbgs() << "Unaligned memory access not supported: not "
277  "running pass ARMParallelDSP\n");
278  return false;
279  }
280 
281  if (!ST->hasDSP()) {
282  LLVM_DEBUG(dbgs() << "DSP extension not enabled: not running pass "
283  "ARMParallelDSP\n");
284  return false;
285  }
286 
// The lshr/trunc halfword extraction in CreateWideLoad assumes little
// endian lane order.
287  if (!ST->isLittle()) {
288  LLVM_DEBUG(dbgs() << "Only supporting little endian: not running pass "
289  << "ARMParallelDSP\n");
290  return false;
291  }
292 
293  LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
294  LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
295 
296  bool Changes = MatchSMLAD(F);
297  return Changes;
298  }
299  };
300 }
301 
302 bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1,
303  MemInstList &VecMem) {
304  if (!Ld0 || !Ld1)
305  return false;
306 
307  if (!LoadPairs.count(Ld0) || LoadPairs[Ld0] != Ld1)
308  return false;
309 
310  LLVM_DEBUG(dbgs() << "Loads are sequential and valid:\n";
311  dbgs() << "Ld0:"; Ld0->dump();
312  dbgs() << "Ld1:"; Ld1->dump();
313  );
314 
315  VecMem.clear();
316  VecMem.push_back(Ld0);
317  VecMem.push_back(Ld1);
318  return true;
319 }
320 
321 // MaxBitwidth: the maximum supported bitwidth of the elements in the DSP
322 // instructions, which is set to 16. So here we should collect all i8 and i16
323 // narrow operations.
324 // TODO: we currently only collect i16, and will support i8 later, so that's
325 // why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth.
326 template<unsigned MaxBitWidth>
327 bool ARMParallelDSP::IsNarrowSequence(Value *V) {
328  if (auto *SExt = dyn_cast<SExtInst>(V)) {
329  if (SExt->getSrcTy()->getIntegerBitWidth() != MaxBitWidth)
330  return false;
331 
332  if (auto *Ld = dyn_cast<LoadInst>(SExt->getOperand(0))) {
333  // Check that this load could be paired.
334  return LoadPairs.count(Ld) || OffsetLoads.count(Ld);
335  }
336  }
337  return false;
338 }
339 
340 /// Iterate through the block and record base, offset pairs of loads which can
341 /// be widened into a single load.
342 bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
// NOTE(review): source lines 343-344 are not visible in this extract; they
// presumably declare the local 'Loads' and 'Writes' containers filled in the
// scan below -- confirm against the full file.
345  LoadPairs.clear();
346  WideLoads.clear();
347 
348  // Collect loads and instruction that may write to memory. For now we only
349  // record loads which are simple, sign-extended and have a single user.
350  // TODO: Allow zero-extended loads.
351  for (auto &I : *BB) {
352  if (I.mayWriteToMemory())
353  Writes.push_back(&I);
354  auto *Ld = dyn_cast<LoadInst>(&I);
355  if (!Ld || !Ld->isSimple() ||
356  !Ld->hasOneUse() || !isa<SExtInst>(Ld->user_back()))
357  continue;
358  Loads.push_back(Ld);
359  }
360 
// Bail out early: the pairing scan below is quadratic in the load count.
361  if (Loads.empty() || Loads.size() > NumLoadLimit)
362  return false;
363 
364  using InstSet = std::set<Instruction*>;
365  using DepMap = std::map<Instruction*, InstSet>;
// Read-after-write dependencies: for each load, the writes before it that
// may alias.
366  DepMap RAWDeps;
367 
368  // Record any writes that may alias a load.
// NOTE(review): source line 369 (declaring 'Size' used below) is not visible
// in this extract -- confirm against the full file.
370  for (auto Write : Writes) {
371  for (auto Read : Loads) {
372  MemoryLocation ReadLoc =
373  MemoryLocation(Read->getPointerOperand(), Size);
374 
// NOTE(review): source line 376 (the second argument of intersectModRef) is
// not visible in this extract -- confirm against the full file.
375  if (!isModOrRefSet(intersectModRef(AA->getModRefInfo(Write, ReadLoc),
377  continue;
378  if (Write->comesBefore(Read))
379  RAWDeps[Read].insert(Write);
380  }
381  }
382 
383  // Check whether there's not a write between the two loads which would
384  // prevent them from being safely merged.
385  auto SafeToPair = [&](LoadInst *Base, LoadInst *Offset) {
386  bool BaseFirst = Base->comesBefore(Offset);
387  LoadInst *Dominator = BaseFirst ? Base : Offset;
388  LoadInst *Dominated = BaseFirst ? Offset : Base;
389 
390  if (RAWDeps.count(Dominated)) {
391  InstSet &WritesBefore = RAWDeps[Dominated];
392 
393  for (auto Before : WritesBefore) {
394  // We can't move the second load backward, past a write, to merge
395  // with the first load.
396  if (Dominator->comesBefore(Before))
397  return false;
398  }
399  }
400  return true;
401  };
402 
403  // Record base, offset load pairs.
404  for (auto *Base : Loads) {
405  for (auto *Offset : Loads) {
// A load already claimed as an offset cannot start a new pair.
406  if (Base == Offset || OffsetLoads.count(Offset))
407  continue;
408 
409  if (isConsecutiveAccess(Base, Offset, *DL, *SE) &&
410  SafeToPair(Base, Offset)) {
411  LoadPairs[Base] = Offset;
412  OffsetLoads.insert(Offset);
413  break;
414  }
415  }
416  }
417 
418  LLVM_DEBUG(if (!LoadPairs.empty()) {
419  dbgs() << "Consecutive load pairs:\n";
420  for (auto &MapIt : LoadPairs) {
421  LLVM_DEBUG(dbgs() << *MapIt.first << ", "
422  << *MapIt.second << "\n");
423  }
424  });
// At least two pairs are needed to ever create a parallel mul pair.
425  return LoadPairs.size() > 1;
426 }
427 
428 // Search recursively back through the operands to find a tree of values that
429 // form a multiply-accumulate chain. The search records the Add and Mul
430 // instructions that form the reduction and allows us to find a single value
431 // to be used as the initial input to the accumlator.
432 bool ARMParallelDSP::Search(Value *V, BasicBlock *BB, Reduction &R) {
433  // If we find a non-instruction, try to use it as the initial accumulator
434  // value. This may have already been found during the search in which case
435  // this function will return false, signaling a search fail.
436  auto *I = dyn_cast<Instruction>(V);
437  if (!I)
438  return R.InsertAcc(V);
439 
440  if (I->getParent() != BB)
441  return false;
442 
443  switch (I->getOpcode()) {
444  default:
445  break;
446  case Instruction::PHI:
447  // Could be the accumulator value.
448  return R.InsertAcc(V);
449  case Instruction::Add: {
450  // Adds should be adding together two muls, or another add and a mul to
451  // be within the mac chain. One of the operands may also be the
452  // accumulator value at which point we should stop searching.
453  R.InsertAdd(I);
454  Value *LHS = I->getOperand(0);
455  Value *RHS = I->getOperand(1);
456  bool ValidLHS = Search(LHS, BB, R);
457  bool ValidRHS = Search(RHS, BB, R);
458 
459  if (ValidLHS && ValidRHS)
460  return true;
461 
462  // Ensure we don't add the root as the incoming accumulator.
463  if (R.getRoot() == I)
464  return false;
465 
466  return R.InsertAcc(I);
467  }
468  case Instruction::Mul: {
469  Value *MulOp0 = I->getOperand(0);
470  Value *MulOp1 = I->getOperand(1);
471  return IsNarrowSequence<16>(MulOp0) && IsNarrowSequence<16>(MulOp1);
472  }
473  case Instruction::SExt:
474  return Search(I->getOperand(0), BB, R);
475  }
476  return false;
477 }
478 
479 // The pass needs to identify integer add/sub reductions of 16-bit vector
480 // multiplications.
481 // To use SMLAD:
482 // 1) we first need to find integer add then look for this pattern:
483 //
484 // acc0 = ...
485 // ld0 = load i16
486 // sext0 = sext i16 %ld0 to i32
487 // ld1 = load i16
488 // sext1 = sext i16 %ld1 to i32
489 // mul0 = mul %sext0, %sext1
490 // ld2 = load i16
491 // sext2 = sext i16 %ld2 to i32
492 // ld3 = load i16
493 // sext3 = sext i16 %ld3 to i32
494 // mul1 = mul i32 %sext2, %sext3
495 // add0 = add i32 %mul0, %acc0
496 // acc1 = add i32 %add0, %mul1
497 //
498 // Which can be selected to:
499 //
500 // ldr r0
501 // ldr r1
502 // smlad r2, r0, r1, r2
503 //
504 // If constants are used instead of loads, these will need to be hoisted
505 // out and into a register.
506 //
507 // If loop invariants are used instead of loads, these need to be packed
508 // before the loop begins.
509 //
510 bool ARMParallelDSP::MatchSMLAD(Function &F) {
511  bool Changed = false;
512 
513  for (auto &BB : F) {
// NOTE(review): source line 514 is not visible in this extract; it presumably
// declares the 'AllAdds' set consulted and extended below -- confirm against
// the full file.
515  if (!RecordMemoryOps(&BB))
516  continue;
517 
// Walk bottom-up so the outermost add of a chain is visited first.
518  for (Instruction &I : reverse(BB)) {
519  if (I.getOpcode() != Instruction::Add)
520  continue;
521 
// Skip adds already consumed by a previously matched reduction.
522  if (AllAdds.count(&I))
523  continue;
524 
525  const auto *Ty = I.getType();
526  if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
527  continue;
528 
529  Reduction R(&I);
530  if (!Search(&I, &BB, R))
531  continue;
532 
533  R.InsertMuls();
534  LLVM_DEBUG(dbgs() << "After search, Reduction:\n"; R.dump());
535 
536  if (!CreateParallelPairs(R))
537  continue;
538 
539  InsertParallelMACs(R);
540  Changed = true;
541  AllAdds.insert(R.getAdds().begin(), R.getAdds().end());
542  LLVM_DEBUG(dbgs() << "BB after inserting parallel MACs:\n" << BB);
543  }
544  }
545 
546  return Changed;
547 }
548 
549 bool ARMParallelDSP::CreateParallelPairs(Reduction &R) {
550 
551  // Not enough mul operations to make a pair.
552  if (R.getMuls().size() < 2)
553  return false;
554 
555  // Check that the muls operate directly upon sign extended loads.
556  for (auto &MulCand : R.getMuls()) {
557  if (!MulCand->HasTwoLoadInputs())
558  return false;
559  }
560 
561  auto CanPair = [&](Reduction &R, MulCandidate *PMul0, MulCandidate *PMul1) {
562  // The first elements of each vector should be loads with sexts. If we
563  // find that its two pairs of consecutive loads, then these can be
564  // transformed into two wider loads and the users can be replaced with
565  // DSP intrinsics.
566  auto Ld0 = static_cast<LoadInst*>(PMul0->LHS);
567  auto Ld1 = static_cast<LoadInst*>(PMul1->LHS);
568  auto Ld2 = static_cast<LoadInst*>(PMul0->RHS);
569  auto Ld3 = static_cast<LoadInst*>(PMul1->RHS);
570 
571  // Check that each mul is operating on two different loads.
572  if (Ld0 == Ld2 || Ld1 == Ld3)
573  return false;
574 
575  if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
576  if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
577  LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
578  R.AddMulPair(PMul0, PMul1);
579  return true;
580  } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) {
581  LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
582  LLVM_DEBUG(dbgs() << " exchanging Ld2 and Ld3\n");
583  R.AddMulPair(PMul0, PMul1, true);
584  return true;
585  }
586  } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) &&
587  AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
588  LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
589  LLVM_DEBUG(dbgs() << " exchanging Ld0 and Ld1\n");
590  LLVM_DEBUG(dbgs() << " and swapping muls\n");
591  // Only the second operand can be exchanged, so swap the muls.
592  R.AddMulPair(PMul1, PMul0, true);
593  return true;
594  }
595  return false;
596  };
597 
598  MulCandList &Muls = R.getMuls();
599  const unsigned Elems = Muls.size();
600  for (unsigned i = 0; i < Elems; ++i) {
601  MulCandidate *PMul0 = static_cast<MulCandidate*>(Muls[i].get());
602  if (PMul0->Paired)
603  continue;
604 
605  for (unsigned j = 0; j < Elems; ++j) {
606  if (i == j)
607  continue;
608 
609  MulCandidate *PMul1 = static_cast<MulCandidate*>(Muls[j].get());
610  if (PMul1->Paired)
611  continue;
612 
613  const Instruction *Mul0 = PMul0->Root;
614  const Instruction *Mul1 = PMul1->Root;
615  if (Mul0 == Mul1)
616  continue;
617 
618  assert(PMul0 != PMul1 && "expected different chains");
619 
620  if (CanPair(R, PMul0, PMul1))
621  break;
622  }
623  }
624  return !R.getMulPairs().empty();
625 }
626 
// Materialise the reduction: accumulate any unpaired muls as plain adds,
// then replace each paired mul with an smlad/smladx (or the 64-bit
// smlald/smlaldx) intrinsic call fed by the widened loads, and finally
// replace all uses of the reduction root with the last accumulator value.
627 void ARMParallelDSP::InsertParallelMACs(Reduction &R) {
628 
// Build one intrinsic call. The exchange flag selects the 'x' variants, and
// the accumulator width (i32 vs i64) selects smlad vs smlald.
629  auto CreateSMLAD = [&](LoadInst* WideLd0, LoadInst *WideLd1,
630  Value *Acc, bool Exchange,
631  Instruction *InsertAfter) {
632  // Replace the reduction chain with an intrinsic call
633 
634  Value* Args[] = { WideLd0, WideLd1, Acc };
635  Function *SMLAD = nullptr;
636  if (Exchange)
637  SMLAD = Acc->getType()->isIntegerTy(32) ?
638  Intrinsic::getDeclaration(M, Intrinsic::arm_smladx) :
639  Intrinsic::getDeclaration(M, Intrinsic::arm_smlaldx);
640  else
641  SMLAD = Acc->getType()->isIntegerTy(32) ?
642  Intrinsic::getDeclaration(M, Intrinsic::arm_smlad) :
643  Intrinsic::getDeclaration(M, Intrinsic::arm_smlald);
644 
// NoFolder keeps the call as-is instead of constant folding on creation.
645  IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
646  BasicBlock::iterator(InsertAfter));
647  Instruction *Call = Builder.CreateCall(SMLAD, Args);
648  NumSMLAD++;
649  return Call;
650  };
651 
652  // Return the instruction after the dominated instruction.
653  auto GetInsertPoint = [this](Value *A, Value *B) {
654  assert((isa<Instruction>(A) || isa<Instruction>(B)) &&
655  "expected at least one instruction");
656 
657  Value *V = nullptr;
658  if (!isa<Instruction>(A))
659  V = B;
660  else if (!isa<Instruction>(B))
661  V = A;
662  else
663  V = DT->dominates(cast<Instruction>(A), cast<Instruction>(B)) ? B : A;
664 
665  return &*++BasicBlock::iterator(cast<Instruction>(V));
666  };
667 
668  Value *Acc = R.getAccumulator();
669 
670  // For any muls that were discovered but not paired, accumulate their values
671  // as before.
672  IRBuilder<NoFolder> Builder(R.getRoot()->getParent());
673  MulCandList &MulCands = R.getMuls();
674  for (auto &MulCand : MulCands) {
675  if (MulCand->Paired)
676  continue;
677 
678  Instruction *Mul = cast<Instruction>(MulCand->Root);
679  LLVM_DEBUG(dbgs() << "Accumulating unpaired mul: " << *Mul << "\n");
680 
// A 64-bit reduction needs the 32-bit mul result sign-extended first.
681  if (R.getType() != Mul->getType()) {
682  assert(R.is64Bit() && "expected 64-bit result");
683  Builder.SetInsertPoint(&*++BasicBlock::iterator(Mul));
684  Mul = cast<Instruction>(Builder.CreateSExt(Mul, R.getRoot()->getType()));
685  }
686 
687  if (!Acc) {
688  Acc = Mul;
689  continue;
690  }
691 
692  // If Acc is the original incoming value to the reduction, it could be a
693  // phi. But the phi will dominate Mul, meaning that Mul will be the
694  // insertion point.
695  Builder.SetInsertPoint(GetInsertPoint(Mul, Acc));
696  Acc = Builder.CreateAdd(Mul, Acc);
697  }
698 
// No incoming accumulator and no unpaired muls: start the chain from zero
// of the reduction's width.
699  if (!Acc) {
700  Acc = R.is64Bit() ?
701  ConstantInt::get(IntegerType::get(M->getContext(), 64), 0) :
702  ConstantInt::get(IntegerType::get(M->getContext(), 32), 0);
703  } else if (Acc->getType() != R.getType()) {
704  Builder.SetInsertPoint(R.getRoot());
705  Acc = Builder.CreateSExt(Acc, R.getType());
706  }
707 
708  // Roughly sort the mul pairs in their program order.
709  llvm::sort(R.getMulPairs(), [](auto &PairA, auto &PairB) {
710  const Instruction *A = PairA.first->Root;
711  const Instruction *B = PairB.first->Root;
712  return A->comesBefore(B);
713  });
714 
715  IntegerType *Ty = IntegerType::get(M->getContext(), 32);
716  for (auto &Pair : R.getMulPairs()) {
717  MulCandidate *LHSMul = Pair.first;
718  MulCandidate *RHSMul = Pair.second;
719  LoadInst *BaseLHS = LHSMul->getBaseLoad();
720  LoadInst *BaseRHS = RHSMul->getBaseLoad();
// Reuse an already-created wide load for this base, if any.
721  LoadInst *WideLHS = WideLoads.count(BaseLHS) ?
722  WideLoads[BaseLHS]->getLoad() : CreateWideLoad(LHSMul->VecLd, Ty);
723  LoadInst *WideRHS = WideLoads.count(BaseRHS) ?
724  WideLoads[BaseRHS]->getLoad() : CreateWideLoad(RHSMul->VecLd, Ty);
725 
// The call must come after both wide loads and the current accumulator.
726  Instruction *InsertAfter = GetInsertPoint(WideLHS, WideRHS);
727  InsertAfter = GetInsertPoint(InsertAfter, Acc);
728  Acc = CreateSMLAD(WideLHS, WideRHS, Acc, RHSMul->Exchange, InsertAfter);
729  }
730  R.UpdateRoot(cast<Instruction>(Acc));
731 }
732 
// Replace the two narrow loads in 'Loads' with a single load of 'LoadTy'
// width, recreate the two original sign-extended values from its bottom and
// top halves, and record the widening in WideLoads for reuse. Returns the
// new wide load.
733 LoadInst* ARMParallelDSP::CreateWideLoad(MemInstList &Loads,
734  IntegerType *LoadTy) {
735  assert(Loads.size() == 2 && "currently only support widening two loads");
736 
737  LoadInst *Base = Loads[0];
738  LoadInst *Offset = Loads[1];
739 
// RecordMemoryOps only collected loads whose single user is a sext, so
// user_back() is that extend.
740  Instruction *BaseSExt = dyn_cast<SExtInst>(Base->user_back());
741  Instruction *OffsetSExt = dyn_cast<SExtInst>(Offset->user_back());
742 
743  assert((BaseSExt && OffsetSExt)
744  && "Loads should have a single, extending, user");
745 
// Recursively hoist A (and its operand chain) to just before B so the new
// wide load's operands all dominate it.
746  std::function<void(Value*, Value*)> MoveBefore =
747  [&](Value *A, Value *B) -> void {
748  if (!isa<Instruction>(A) || !isa<Instruction>(B))
749  return;
750 
751  auto *Source = cast<Instruction>(A);
752  auto *Sink = cast<Instruction>(B);
753 
754  if (DT->dominates(Source, Sink) ||
755  Source->getParent() != Sink->getParent() ||
756  isa<PHINode>(Source) || isa<PHINode>(Sink))
757  return;
758 
759  Source->moveBefore(Sink);
760  for (auto &Op : Source->operands())
761  MoveBefore(Op, Source);
762  };
763 
764  // Insert the load at the point of the original dominating load.
765  LoadInst *DomLoad = DT->dominates(Base, Offset) ? Base : Offset;
766  IRBuilder<NoFolder> IRB(DomLoad->getParent(),
767  ++BasicBlock::iterator(DomLoad));
768 
769  // Bitcast the pointer to a wider type and create the wide load, while making
770  // sure to maintain the original alignment as this prevents ldrd from being
771  // generated when it could be illegal due to memory alignment.
772  const unsigned AddrSpace = DomLoad->getPointerAddressSpace();
773  Value *VecPtr = IRB.CreateBitCast(Base->getPointerOperand(),
774  LoadTy->getPointerTo(AddrSpace));
775  LoadInst *WideLoad = IRB.CreateAlignedLoad(LoadTy, VecPtr, Base->getAlign());
776 
777  // Make sure everything is in the correct order in the basic block.
778  MoveBefore(Base->getPointerOperand(), VecPtr);
779  MoveBefore(VecPtr, WideLoad);
780 
781  // From the wide load, create two values that equal the original two loads.
782  // Loads[0] needs trunc while Loads[1] needs a lshr and trunc.
783  // TODO: Support big-endian as well.
784  Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType());
785  Value *NewBaseSExt = IRB.CreateSExt(Bottom, BaseSExt->getType());
786  BaseSExt->replaceAllUsesWith(NewBaseSExt);
787 
// Shift by the narrow width to expose the second (upper) element, then
// truncate and re-extend it like the original offset load.
788  IntegerType *OffsetTy = cast<IntegerType>(Offset->getType());
789  Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth());
790  Value *Top = IRB.CreateLShr(WideLoad, ShiftVal);
791  Value *Trunc = IRB.CreateTrunc(Top, OffsetTy);
792  Value *NewOffsetSExt = IRB.CreateSExt(Trunc, OffsetSExt->getType());
793  OffsetSExt->replaceAllUsesWith(NewOffsetSExt);
794 
795  LLVM_DEBUG(dbgs() << "From Base and Offset:\n"
796  << *Base << "\n" << *Offset << "\n"
797  << "Created Wide Load:\n"
798  << *WideLoad << "\n"
799  << *Bottom << "\n"
800  << *NewBaseSExt << "\n"
801  << *Top << "\n"
802  << *Trunc << "\n"
803  << *NewOffsetSExt << "\n");
// Cache the widening keyed on the base load for reuse by later pairs.
804  WideLoads.emplace(std::make_pair(Base,
805  std::make_unique<WidenedLoad>(Loads, WideLoad)));
806  return WideLoad;
807 }
808 
810  return new ARMParallelDSP();
811 }
812 
// Legacy pass-manager registration boilerplate.
813 char ARMParallelDSP::ID = 0;
814 
815 INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp",
816  "Transform functions to use DSP intrinsics", false, false)
817 INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp",
818  "Transform functions to use DSP intrinsics", false, false)
llvm::Check::Size
@ Size
Definition: FileCheck.h:77
i
i
Definition: README.txt:29
ARMSubtarget.h
use
Move duplicate certain instructions close to their use
Definition: Localizer.cpp:32
functions
amdgpu propagate attributes Late propagate attributes from kernels to functions
Definition: AMDGPUPropagateAttributes.cpp:196
AssumptionCache.h
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:17
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
NoFolder.h
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:113
llvm::Intrinsic::getDeclaration
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1418
llvm::BasicBlock::iterator
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:87
llvm::isModOrRefSet
LLVM_NODISCARD bool isModOrRefSet(const ModRefInfo MRI)
Definition: AliasAnalysis.h:189
llvm::ARMSubtarget
Definition: ARMSubtarget.h:47
Scalar.h
llvm::Function
Definition: Function.h:60
llvm::Value::dump
void dump() const
Support for debugging, callable in GDB: V->dump()
Definition: AsmWriter.cpp:4833
Pass.h
is64Bit
static bool is64Bit(const char *name)
Definition: X86Disassembler.cpp:1018
NumLoadLimit
static cl::opt< unsigned > NumLoadLimit("arm-parallel-dsp-load-limit", cl::Hidden, cl::init(16), cl::desc("Limit the number of loads analysed"))
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1185
Statistic.h
LoopAccessAnalysis.h
llvm::IRBuilder
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2495
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:449
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
GlobalsModRef.h
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:139
to
Should compile to
Definition: README.txt:449
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
llvm::reverse
auto reverse(ContainerTy &&C, std::enable_if_t< has_rbegin< ContainerTy >::value > *=nullptr)
Definition: STLExtras.h:380
llvm::Instruction::comesBefore
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
Definition: Instruction.cpp:110
dsp
arm parallel dsp
Definition: ARMParallelDSP.cpp:817
llvm::isConsecutiveAccess
bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType=true)
Returns true if the memory operations A and B are consecutive.
Definition: LoopAccessAnalysis.cpp:1409
llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:450
llvm::dump
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Definition: SparseBitVector.h:877
RHS
Value * RHS
Definition: X86PartialReduction.cpp:76
PassRegistry.h
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
F
#define F(x, y, z)
Definition: MD5.cpp:55
llvm::RISCVFenceField::R
@ R
Definition: RISCVBaseInfo.h:240
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:55
AliasAnalysis.h
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
llvm::DominatorTree::dominates
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
LHS
Value * LHS
Definition: X86PartialReduction.cpp:75
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
llvm::Intrinsic::getType
FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys=None)
Return the function type for an intrinsic.
Definition: Function.cpp:1374
llvm::AAResults
Definition: AliasAnalysis.h:511
llvm::ARM_PROC::A
@ A
Definition: ARMBaseInfo.h:34
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
TargetLibraryInfo.h
false
Definition: StackSlotColoring.cpp:141
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
llvm::IntegerType
Class to represent integer types.
Definition: DerivedTypes.h:40
llvm::Instruction
Definition: Instruction.h:42
llvm::DominatorTreeWrapperPass
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:302
llvm::STATISTIC
STATISTIC(NumFunctions, "Total number of functions")
llvm::ConstantInt::get
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:928
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp", "Transform functions to use DSP intrinsics", false, false) INITIALIZE_PASS_END(ARMParallelDSP
llvm::ScalarEvolutionWrapperPass
Definition: ScalarEvolution.h:2145
SmallPtrSet.h
PatternMatch.h
llvm::MCID::Call
@ Call
Definition: MCInstrDesc.h:155
llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
llvm::codeview::ClassOptions::Intrinsic
@ Intrinsic
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
llvm::Type::isIntegerTy
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:191
llvm::TargetPassConfig
Target-Independent Code Generator Pass Configuration Options.
Definition: TargetPassConfig.h:84
llvm::cl::opt< bool >
llvm::LoadInst::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:266
llvm::pdb::OMFSegDescFlags::Read
@ Read
llvm::AMDGPU::Hwreg::Offset
Offset
Definition: SIDefines.h:413
llvm::TargetLibraryInfoWrapperPass
Definition: TargetLibraryInfo.h:468
llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73
I
#define I(x, y, z)
Definition: MD5.cpp:58
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:432
intrinsics
arm parallel Transform functions to use DSP intrinsics
Definition: ARMParallelDSP.cpp:818
TargetPassConfig.h
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
function
print Print MemDeps of function
Definition: MemDepPrinter.cpp:82
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
ARM.h
llvm::SmallPtrSetImpl::count
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:383
Builder
assume Assume Builder
Definition: AssumeBundleBuilder.cpp:651
llvm::SetVector::insert
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:141
llvm::Sched::Source
@ Source
Definition: TargetLowering.h:99
llvm::AssumptionCacheTracker
An immutable pass that tracks lazily created AssumptionCache objects.
Definition: AssumptionCache.h:202
Mul
BinaryOperator * Mul
Definition: X86PartialReduction.cpp:70
llvm::AnalysisUsage::setPreservesCFG
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:263
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
llvm::cl::Sink
@ Sink
Definition: CommandLine.h:167
llvm::AnalysisUsage::addPreserved
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
Definition: PassAnalysisSupport.h:98
llvm::Value::replaceAllUsesWith
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:529
llvm::append_range
void append_range(Container &C, Range &&R)
Wrapper function to append a range to a container.
Definition: STLExtras.h:1823
DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: AArch64SLSHardening.cpp:76
llvm::createARMParallelDSPPass
Pass * createARMParallelDSPPass()
Definition: ARMParallelDSP.cpp:809
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:173
runOnFunction
static bool runOnFunction(Function &F, bool PostInlining)
Definition: EntryExitInstrumenter.cpp:69
j
return j(j<< 16)
get
Should compile to something r4 addze r3 instead we get
Definition: README.txt:24
llvm::LocationSize::beforeOrAfterPointer
constexpr static LocationSize beforeOrAfterPointer()
Any location before or after the base pointer (but still within the underlying object).
Definition: MemoryLocation.h:130
llvm::AMDGPU::SendMsg::Op
Op
Definition: SIDefines.h:345
llvm::sort
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1562
llvm::Type::getPointerTo
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
Definition: Type.cpp:774
llvm::TargetLibraryInfo
Provides information about what library functions are available for the current target.
Definition: TargetLibraryInfo.h:222
llvm::SmallVectorImpl::clear
void clear()
Definition: SmallVector.h:591
llvm::MCID::Add
@ Add
Definition: MCInstrDesc.h:185
AA
llvm::Pass
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:91
Instructions.h
llvm::AAResultsWrapperPass
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
Definition: AliasAnalysis.h:1351
DisableParallelDSP
static cl::opt< bool > DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false), cl::desc("Disable the ARM Parallel DSP pass"))
llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:91
llvm::GlobalsAAWrapperPass
Legacy wrapper pass to provide the GlobalsAAResult object.
Definition: GlobalsModRef.h:148
llvm::IntegerType::getBitWidth
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
Definition: DerivedTypes.h:72
llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:42
llvm::GlobalValue::getType
PointerType * getType() const
Global values are always pointers.
Definition: GlobalValue.h:276
llvm::Pass::getAnalysisUsage
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:97
TM
const char LLVMTargetMachineRef TM
Definition: PassBuilderBindings.cpp:47
llvm::IntegerType::get
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:308
BB
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM BB
Definition: README.txt:39
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::pdb::OMFSegDescFlags::Write
@ Write
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition: AMDGPUMetadata.h:394
llvm::cl::desc
Definition: CommandLine.h:405
llvm::SetVector
A vector that has set insertion semantics.
Definition: SetVector.h:40
BasicBlockUtils.h
llvm::intersectModRef
LLVM_NODISCARD ModRefInfo intersectModRef(const ModRefInfo MRI1, const ModRefInfo MRI2)
Definition: AliasAnalysis.h:236
Reduction
loop Loop Strength Reduction
Definition: LoopStrengthReduce.cpp:6711
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
Debug.h
llvm::ModRefInfo::ModRef
@ ModRef
The access may reference and may modify the value stored in memory.
llvm::MemoryLocation
Representation for a specific memory location.
Definition: MemoryLocation.h:210
llvm::sampleprof::Base
@ Base
Definition: Discriminator.h:58
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:365
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38