LLVM  15.0.0git
InterleavedLoadCombinePass.cpp
Go to the documentation of this file.
1 //===- InterleavedLoadCombine.cpp - Combine Interleaved Loads ---*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 //
11 // This file defines the interleaved-load-combine pass. The pass searches for
12 // ShuffleVectorInstruction that execute interleaving loads. If a matching
13 // pattern is found, it adds a combined load and further instructions in a
14 // pattern that is detectable by InterleavedAccesPass. The old instructions are
15 // left dead to be removed later. The pass is specifically designed to be
16 // executed just before InterleavedAccesPass to find any left-over instances
17 // that are not detected within former passes.
18 //
19 //===----------------------------------------------------------------------===//
20 
21 #include "llvm/ADT/Statistic.h"
26 #include "llvm/CodeGen/Passes.h"
30 #include "llvm/IR/DataLayout.h"
31 #include "llvm/IR/Dominators.h"
32 #include "llvm/IR/Function.h"
33 #include "llvm/IR/IRBuilder.h"
34 #include "llvm/IR/Instructions.h"
35 #include "llvm/IR/Module.h"
36 #include "llvm/InitializePasses.h"
37 #include "llvm/Pass.h"
38 #include "llvm/Support/Debug.h"
42 
43 #include <algorithm>
44 #include <cassert>
45 #include <list>
46 
47 using namespace llvm;
48 
49 #define DEBUG_TYPE "interleaved-load-combine"
50 
51 namespace {
52 
53 /// Statistic counter
54 STATISTIC(NumInterleavedLoadCombine, "Number of combined loads");
55 
56 /// Option to disable the pass
57 static cl::opt<bool> DisableInterleavedLoadCombine(
58  "disable-" DEBUG_TYPE, cl::init(false), cl::Hidden,
59  cl::desc("Disable combining of interleaved loads"));
60 
61 struct VectorInfo;
62 
63 struct InterleavedLoadCombineImpl {
64 public:
65  InterleavedLoadCombineImpl(Function &F, DominatorTree &DT, MemorySSA &MSSA,
67  : F(F), DT(DT), MSSA(MSSA),
68  TLI(*TM.getSubtargetImpl(F)->getTargetLowering()),
69  TTI(TM.getTargetTransformInfo(F)) {}
70 
71  /// Scan the function for interleaved load candidates and execute the
72  /// replacement if applicable.
73  bool run();
74 
75 private:
76  /// Function this pass is working on
77  Function &F;
78 
79  /// Dominator Tree Analysis
80  DominatorTree &DT;
81 
82  /// Memory Alias Analyses
83  MemorySSA &MSSA;
84 
85  /// Target Lowering Information
86  const TargetLowering &TLI;
87 
88  /// Target Transform Information
90 
91  /// Find the instruction in sets LIs that dominates all others, return nullptr
92  /// if there is none.
93  LoadInst *findFirstLoad(const std::set<LoadInst *> &LIs);
94 
95  /// Replace interleaved load candidates. It does additional
96  /// analyses if this makes sense. Returns true on success and false
97  /// of nothing has been changed.
98  bool combine(std::list<VectorInfo> &InterleavedLoad,
100 
101  /// Given a set of VectorInfo containing candidates for a given interleave
102  /// factor, find a set that represents a 'factor' interleaved load.
103  bool findPattern(std::list<VectorInfo> &Candidates,
104  std::list<VectorInfo> &InterleavedLoad, unsigned Factor,
105  const DataLayout &DL);
106 }; // InterleavedLoadCombine
107 
108 /// First Order Polynomial on an n-Bit Integer Value
109 ///
110 /// Polynomial(Value) = Value * B + A + E*2^(n-e)
111 ///
112 /// A and B are the coefficients. E*2^(n-e) is an error within 'e' most
113 /// significant bits. It is introduced if an exact computation cannot be proven
114 /// (e.q. division by 2).
115 ///
116 /// As part of this optimization multiple loads will be combined. It necessary
117 /// to prove that loads are within some relative offset to each other. This
118 /// class is used to prove relative offsets of values loaded from memory.
119 ///
120 /// Representing an integer in this form is sound since addition in two's
121 /// complement is associative (trivial) and multiplication distributes over the
122 /// addition (see Proof(1) in Polynomial::mul). Further, both operations
123 /// commute.
124 //
125 // Example:
126 // declare @fn(i64 %IDX, <4 x float>* %PTR) {
127 // %Pa1 = add i64 %IDX, 2
128 // %Pa2 = lshr i64 %Pa1, 1
129 // %Pa3 = getelementptr inbounds <4 x float>, <4 x float>* %PTR, i64 %Pa2
130 // %Va = load <4 x float>, <4 x float>* %Pa3
131 //
132 // %Pb1 = add i64 %IDX, 4
133 // %Pb2 = lshr i64 %Pb1, 1
134 // %Pb3 = getelementptr inbounds <4 x float>, <4 x float>* %PTR, i64 %Pb2
135 // %Vb = load <4 x float>, <4 x float>* %Pb3
136 // ... }
137 //
138 // The goal is to prove that two loads load consecutive addresses.
139 //
140 // In this case the polynomials are constructed by the following
141 // steps.
142 //
143 // The number tag #e specifies the error bits.
144 //
145 // Pa_0 = %IDX #0
146 // Pa_1 = %IDX + 2 #0 | add 2
147 // Pa_2 = %IDX/2 + 1 #1 | lshr 1
148 // Pa_3 = %IDX/2 + 1 #1 | GEP, step signext to i64
149 // Pa_4 = (%IDX/2)*16 + 16 #0 | GEP, multiply index by sizeof(4) for floats
150 // Pa_5 = (%IDX/2)*16 + 16 #0 | GEP, add offset of leading components
151 //
152 // Pb_0 = %IDX #0
153 // Pb_1 = %IDX + 4 #0 | add 2
154 // Pb_2 = %IDX/2 + 2 #1 | lshr 1
155 // Pb_3 = %IDX/2 + 2 #1 | GEP, step signext to i64
156 // Pb_4 = (%IDX/2)*16 + 32 #0 | GEP, multiply index by sizeof(4) for floats
157 // Pb_5 = (%IDX/2)*16 + 16 #0 | GEP, add offset of leading components
158 //
159 // Pb_5 - Pa_5 = 16 #0 | subtract to get the offset
160 //
161 // Remark: %PTR is not maintained within this class. So in this instance the
162 // offset of 16 can only be assumed if the pointers are equal.
163 //
164 class Polynomial {
165  /// Operations on B
166  enum BOps {
167  LShr,
168  Mul,
169  SExt,
170  Trunc,
171  };
172 
173  /// Number of Error Bits e
174  unsigned ErrorMSBs;
175 
176  /// Value
177  Value *V;
178 
179  /// Coefficient B
181 
182  /// Coefficient A
183  APInt A;
184 
185 public:
186  Polynomial(Value *V) : ErrorMSBs((unsigned)-1), V(V) {
187  IntegerType *Ty = dyn_cast<IntegerType>(V->getType());
188  if (Ty) {
189  ErrorMSBs = 0;
190  this->V = V;
191  A = APInt(Ty->getBitWidth(), 0);
192  }
193  }
194 
195  Polynomial(const APInt &A, unsigned ErrorMSBs = 0)
196  : ErrorMSBs(ErrorMSBs), V(nullptr), A(A) {}
197 
198  Polynomial(unsigned BitWidth, uint64_t A, unsigned ErrorMSBs = 0)
199  : ErrorMSBs(ErrorMSBs), V(nullptr), A(BitWidth, A) {}
200 
201  Polynomial() : ErrorMSBs((unsigned)-1), V(nullptr) {}
202 
203  /// Increment and clamp the number of undefined bits.
204  void incErrorMSBs(unsigned amt) {
205  if (ErrorMSBs == (unsigned)-1)
206  return;
207 
208  ErrorMSBs += amt;
209  if (ErrorMSBs > A.getBitWidth())
210  ErrorMSBs = A.getBitWidth();
211  }
212 
213  /// Decrement and clamp the number of undefined bits.
214  void decErrorMSBs(unsigned amt) {
215  if (ErrorMSBs == (unsigned)-1)
216  return;
217 
218  if (ErrorMSBs > amt)
219  ErrorMSBs -= amt;
220  else
221  ErrorMSBs = 0;
222  }
223 
224  /// Apply an add on the polynomial
225  Polynomial &add(const APInt &C) {
226  // Note: Addition is associative in two's complement even when in case of
227  // signed overflow.
228  //
229  // Error bits can only propagate into higher significant bits. As these are
230  // already regarded as undefined, there is no change.
231  //
232  // Theorem: Adding a constant to a polynomial does not change the error
233  // term.
234  //
235  // Proof:
236  //
237  // Since the addition is associative and commutes:
238  //
239  // (B + A + E*2^(n-e)) + C = B + (A + C) + E*2^(n-e)
240  // [qed]
241 
242  if (C.getBitWidth() != A.getBitWidth()) {
243  ErrorMSBs = (unsigned)-1;
244  return *this;
245  }
246 
247  A += C;
248  return *this;
249  }
250 
251  /// Apply a multiplication onto the polynomial.
252  Polynomial &mul(const APInt &C) {
253  // Note: Multiplication distributes over the addition
254  //
255  // Theorem: Multiplication distributes over the addition
256  //
257  // Proof(1):
258  //
259  // (B+A)*C =-
260  // = (B + A) + (B + A) + .. {C Times}
261  // addition is associative and commutes, hence
262  // = B + B + .. {C Times} .. + A + A + .. {C times}
263  // = B*C + A*C
264  // (see (function add) for signed values and overflows)
265  // [qed]
266  //
267  // Theorem: If C has c trailing zeros, errors bits in A or B are shifted out
268  // to the left.
269  //
270  // Proof(2):
271  //
272  // Let B' and A' be the n-Bit inputs with some unknown errors EA,
273  // EB at e leading bits. B' and A' can be written down as:
274  //
275  // B' = B + 2^(n-e)*EB
276  // A' = A + 2^(n-e)*EA
277  //
278  // Let C' be an input with c trailing zero bits. C' can be written as
279  //
280  // C' = C*2^c
281  //
282  // Therefore we can compute the result by using distributivity and
283  // commutativity.
284  //
285  // (B'*C' + A'*C') = [B + 2^(n-e)*EB] * C' + [A + 2^(n-e)*EA] * C' =
286  // = [B + 2^(n-e)*EB + A + 2^(n-e)*EA] * C' =
287  // = (B'+A') * C' =
288  // = [B + 2^(n-e)*EB + A + 2^(n-e)*EA] * C' =
289  // = [B + A + 2^(n-e)*EB + 2^(n-e)*EA] * C' =
290  // = (B + A) * C' + [2^(n-e)*EB + 2^(n-e)*EA)] * C' =
291  // = (B + A) * C' + [2^(n-e)*EB + 2^(n-e)*EA)] * C*2^c =
292  // = (B + A) * C' + C*(EB + EA)*2^(n-e)*2^c =
293  //
294  // Let EC be the final error with EC = C*(EB + EA)
295  //
296  // = (B + A)*C' + EC*2^(n-e)*2^c =
297  // = (B + A)*C' + EC*2^(n-(e-c))
298  //
299  // Since EC is multiplied by 2^(n-(e-c)) the resulting error contains c
300  // less error bits than the input. c bits are shifted out to the left.
301  // [qed]
302 
303  if (C.getBitWidth() != A.getBitWidth()) {
304  ErrorMSBs = (unsigned)-1;
305  return *this;
306  }
307 
308  // Multiplying by one is a no-op.
309  if (C.isOne()) {
310  return *this;
311  }
312 
313  // Multiplying by zero removes the coefficient B and defines all bits.
314  if (C.isZero()) {
315  ErrorMSBs = 0;
316  deleteB();
317  }
318 
319  // See Proof(2): Trailing zero bits indicate a left shift. This removes
320  // leading bits from the result even if they are undefined.
321  decErrorMSBs(C.countTrailingZeros());
322 
323  A *= C;
324  pushBOperation(Mul, C);
325  return *this;
326  }
327 
328  /// Apply a logical shift right on the polynomial
329  Polynomial &lshr(const APInt &C) {
330  // Theorem(1): (B + A + E*2^(n-e)) >> 1 => (B >> 1) + (A >> 1) + E'*2^(n-e')
331  // where
332  // e' = e + 1,
333  // E is a e-bit number,
334  // E' is a e'-bit number,
335  // holds under the following precondition:
336  // pre(1): A % 2 = 0
337  // pre(2): e < n, (see Theorem(2) for the trivial case with e=n)
338  // where >> expresses a logical shift to the right, with adding zeros.
339  //
340  // We need to show that for every, E there is a E'
341  //
342  // B = b_h * 2^(n-1) + b_m * 2 + b_l
343  // A = a_h * 2^(n-1) + a_m * 2 (pre(1))
344  //
345  // where a_h, b_h, b_l are single bits, and a_m, b_m are (n-2) bit numbers
346  //
347  // Let X = (B + A + E*2^(n-e)) >> 1
348  // Let Y = (B >> 1) + (A >> 1) + E*2^(n-e) >> 1
349  //
350  // X = [B + A + E*2^(n-e)] >> 1 =
351  // = [ b_h * 2^(n-1) + b_m * 2 + b_l +
352  // + a_h * 2^(n-1) + a_m * 2 +
353  // + E * 2^(n-e) ] >> 1 =
354  //
355  // The sum is built by putting the overflow of [a_m + b+n] into the term
356  // 2^(n-1). As there are no more bits beyond 2^(n-1) the overflow within
357  // this bit is discarded. This is expressed by % 2.
358  //
359  // The bit in position 0 cannot overflow into the term (b_m + a_m).
360  //
361  // = [ ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-1) +
362  // + ((b_m + a_m) % 2^(n-2)) * 2 +
363  // + b_l + E * 2^(n-e) ] >> 1 =
364  //
365  // The shift is computed by dividing the terms by 2 and by cutting off
366  // b_l.
367  //
368  // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
369  // + ((b_m + a_m) % 2^(n-2)) +
370  // + E * 2^(n-(e+1)) =
371  //
372  // by the definition in the Theorem e+1 = e'
373  //
374  // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
375  // + ((b_m + a_m) % 2^(n-2)) +
376  // + E * 2^(n-e') =
377  //
378  // Compute Y by applying distributivity first
379  //
380  // Y = (B >> 1) + (A >> 1) + E*2^(n-e') =
381  // = (b_h * 2^(n-1) + b_m * 2 + b_l) >> 1 +
382  // + (a_h * 2^(n-1) + a_m * 2) >> 1 +
383  // + E * 2^(n-e) >> 1 =
384  //
385  // Again, the shift is computed by dividing the terms by 2 and by cutting
386  // off b_l.
387  //
388  // = b_h * 2^(n-2) + b_m +
389  // + a_h * 2^(n-2) + a_m +
390  // + E * 2^(n-(e+1)) =
391  //
392  // Again, the sum is built by putting the overflow of [a_m + b+n] into
393  // the term 2^(n-1). But this time there is room for a second bit in the
394  // term 2^(n-2) we add this bit to a new term and denote it o_h in a
395  // second step.
396  //
397  // = ([b_h + a_h + (b_m + a_m) >> (n-2)] >> 1) * 2^(n-1) +
398  // + ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
399  // + ((b_m + a_m) % 2^(n-2)) +
400  // + E * 2^(n-(e+1)) =
401  //
402  // Let o_h = [b_h + a_h + (b_m + a_m) >> (n-2)] >> 1
403  // Further replace e+1 by e'.
404  //
405  // = o_h * 2^(n-1) +
406  // + ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
407  // + ((b_m + a_m) % 2^(n-2)) +
408  // + E * 2^(n-e') =
409  //
410  // Move o_h into the error term and construct E'. To ensure that there is
411  // no 2^x with negative x, this step requires pre(2) (e < n).
412  //
413  // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
414  // + ((b_m + a_m) % 2^(n-2)) +
415  // + o_h * 2^(e'-1) * 2^(n-e') + | pre(2), move 2^(e'-1)
416  // | out of the old exponent
417  // + E * 2^(n-e') =
418  // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
419  // + ((b_m + a_m) % 2^(n-2)) +
420  // + [o_h * 2^(e'-1) + E] * 2^(n-e') + | move 2^(e'-1) out of
421  // | the old exponent
422  //
423  // Let E' = o_h * 2^(e'-1) + E
424  //
425  // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
426  // + ((b_m + a_m) % 2^(n-2)) +
427  // + E' * 2^(n-e')
428  //
429  // Because X and Y are distinct only in there error terms and E' can be
430  // constructed as shown the theorem holds.
431  // [qed]
432  //
433  // For completeness in case of the case e=n it is also required to show that
434  // distributivity can be applied.
435  //
436  // In this case Theorem(1) transforms to (the pre-condition on A can also be
437  // dropped)
438  //
439  // Theorem(2): (B + A + E) >> 1 => (B >> 1) + (A >> 1) + E'
440  // where
441  // A, B, E, E' are two's complement numbers with the same bit
442  // width
443  //
444  // Let A + B + E = X
445  // Let (B >> 1) + (A >> 1) = Y
446  //
447  // Therefore we need to show that for every X and Y there is an E' which
448  // makes the equation
449  //
450  // X = Y + E'
451  //
452  // hold. This is trivially the case for E' = X - Y.
453  //
454  // [qed]
455  //
456  // Remark: Distributing lshr with and arbitrary number n can be expressed as
457  // ((((B + A) lshr 1) lshr 1) ... ) {n times}.
458  // This construction induces n additional error bits at the left.
459 
460  if (C.getBitWidth() != A.getBitWidth()) {
461  ErrorMSBs = (unsigned)-1;
462  return *this;
463  }
464 
465  if (C.isZero())
466  return *this;
467 
468  // Test if the result will be zero
469  unsigned shiftAmt = C.getZExtValue();
470  if (shiftAmt >= C.getBitWidth())
471  return mul(APInt(C.getBitWidth(), 0));
472 
473  // The proof that shiftAmt LSBs are zero for at least one summand is only
474  // possible for the constant number.
475  //
476  // If this can be proven add shiftAmt to the error counter
477  // `ErrorMSBs`. Otherwise set all bits as undefined.
478  if (A.countTrailingZeros() < shiftAmt)
479  ErrorMSBs = A.getBitWidth();
480  else
481  incErrorMSBs(shiftAmt);
482 
483  // Apply the operation.
484  pushBOperation(LShr, C);
485  A = A.lshr(shiftAmt);
486 
487  return *this;
488  }
489 
490  /// Apply a sign-extend or truncate operation on the polynomial.
491  Polynomial &sextOrTrunc(unsigned n) {
492  if (n < A.getBitWidth()) {
493  // Truncate: Clearly undefined Bits on the MSB side are removed
494  // if there are any.
495  decErrorMSBs(A.getBitWidth() - n);
496  A = A.trunc(n);
497  pushBOperation(Trunc, APInt(sizeof(n) * 8, n));
498  }
499  if (n > A.getBitWidth()) {
500  // Extend: Clearly extending first and adding later is different
501  // to adding first and extending later in all extended bits.
502  incErrorMSBs(n - A.getBitWidth());
503  A = A.sext(n);
504  pushBOperation(SExt, APInt(sizeof(n) * 8, n));
505  }
506 
507  return *this;
508  }
509 
510  /// Test if there is a coefficient B.
511  bool isFirstOrder() const { return V != nullptr; }
512 
513  /// Test coefficient B of two Polynomials are equal.
514  bool isCompatibleTo(const Polynomial &o) const {
515  // The polynomial use different bit width.
516  if (A.getBitWidth() != o.A.getBitWidth())
517  return false;
518 
519  // If neither Polynomial has the Coefficient B.
520  if (!isFirstOrder() && !o.isFirstOrder())
521  return true;
522 
523  // The index variable is different.
524  if (V != o.V)
525  return false;
526 
527  // Check the operations.
528  if (B.size() != o.B.size())
529  return false;
530 
531  auto ob = o.B.begin();
532  for (auto &b : B) {
533  if (b != *ob)
534  return false;
535  ob++;
536  }
537 
538  return true;
539  }
540 
541  /// Subtract two polynomials, return an undefined polynomial if
542  /// subtraction is not possible.
543  Polynomial operator-(const Polynomial &o) const {
544  // Return an undefined polynomial if incompatible.
545  if (!isCompatibleTo(o))
546  return Polynomial();
547 
548  // If the polynomials are compatible (meaning they have the same
549  // coefficient on B), B is eliminated. Thus a polynomial solely
550  // containing A is returned
551  return Polynomial(A - o.A, std::max(ErrorMSBs, o.ErrorMSBs));
552  }
553 
554  /// Subtract a constant from a polynomial,
555  Polynomial operator-(uint64_t C) const {
556  Polynomial Result(*this);
557  Result.A -= C;
558  return Result;
559  }
560 
561  /// Add a constant to a polynomial,
562  Polynomial operator+(uint64_t C) const {
563  Polynomial Result(*this);
564  Result.A += C;
565  return Result;
566  }
567 
568  /// Returns true if it can be proven that two Polynomials are equal.
569  bool isProvenEqualTo(const Polynomial &o) {
570  // Subtract both polynomials and test if it is fully defined and zero.
571  Polynomial r = *this - o;
572  return (r.ErrorMSBs == 0) && (!r.isFirstOrder()) && (r.A.isZero());
573  }
574 
575  /// Print the polynomial into a stream.
576  void print(raw_ostream &OS) const {
577  OS << "[{#ErrBits:" << ErrorMSBs << "} ";
578 
579  if (V) {
580  for (auto b : B)
581  OS << "(";
582  OS << "(" << *V << ") ";
583 
584  for (auto b : B) {
585  switch (b.first) {
586  case LShr:
587  OS << "LShr ";
588  break;
589  case Mul:
590  OS << "Mul ";
591  break;
592  case SExt:
593  OS << "SExt ";
594  break;
595  case Trunc:
596  OS << "Trunc ";
597  break;
598  }
599 
600  OS << b.second << ") ";
601  }
602  }
603 
604  OS << "+ " << A << "]";
605  }
606 
607 private:
608  void deleteB() {
609  V = nullptr;
610  B.clear();
611  }
612 
613  void pushBOperation(const BOps Op, const APInt &C) {
614  if (isFirstOrder()) {
615  B.push_back(std::make_pair(Op, C));
616  return;
617  }
618  }
619 };
620 
621 #ifndef NDEBUG
622 static raw_ostream &operator<<(raw_ostream &OS, const Polynomial &S) {
623  S.print(OS);
624  return OS;
625 }
626 #endif
627 
628 /// VectorInfo stores abstract the following information for each vector
629 /// element:
630 ///
631 /// 1) The the memory address loaded into the element as Polynomial
632 /// 2) a set of load instruction necessary to construct the vector,
633 /// 3) a set of all other instructions that are necessary to create the vector and
634 /// 4) a pointer value that can be used as relative base for all elements.
635 struct VectorInfo {
636 private:
637  VectorInfo(const VectorInfo &c) : VTy(c.VTy) {
639  "Copying VectorInfo is neither implemented nor necessary,");
640  }
641 
642 public:
643  /// Information of a Vector Element
644  struct ElementInfo {
645  /// Offset Polynomial.
646  Polynomial Ofs;
647 
648  /// The Load Instruction used to Load the entry. LI is null if the pointer
649  /// of the load instruction does not point on to the entry
650  LoadInst *LI;
651 
652  ElementInfo(Polynomial Offset = Polynomial(), LoadInst *LI = nullptr)
653  : Ofs(Offset), LI(LI) {}
654  };
655 
656  /// Basic-block the load instructions are within
657  BasicBlock *BB = nullptr;
658 
659  /// Pointer value of all participation load instructions
660  Value *PV = nullptr;
661 
662  /// Participating load instructions
663  std::set<LoadInst *> LIs;
664 
665  /// Participating instructions
666  std::set<Instruction *> Is;
667 
668  /// Final shuffle-vector instruction
669  ShuffleVectorInst *SVI = nullptr;
670 
671  /// Information of the offset for each vector element
672  ElementInfo *EI;
673 
674  /// Vector Type
675  FixedVectorType *const VTy;
676 
677  VectorInfo(FixedVectorType *VTy) : VTy(VTy) {
678  EI = new ElementInfo[VTy->getNumElements()];
679  }
680 
681  virtual ~VectorInfo() { delete[] EI; }
682 
683  unsigned getDimension() const { return VTy->getNumElements(); }
684 
685  /// Test if the VectorInfo can be part of an interleaved load with the
686  /// specified factor.
687  ///
688  /// \param Factor of the interleave
689  /// \param DL Targets Datalayout
690  ///
691  /// \returns true if this is possible and false if not
692  bool isInterleaved(unsigned Factor, const DataLayout &DL) const {
693  unsigned Size = DL.getTypeAllocSize(VTy->getElementType());
694  for (unsigned i = 1; i < getDimension(); i++) {
695  if (!EI[i].Ofs.isProvenEqualTo(EI[0].Ofs + i * Factor * Size)) {
696  return false;
697  }
698  }
699  return true;
700  }
701 
702  /// Recursively computes the vector information stored in V.
703  ///
704  /// This function delegates the work to specialized implementations
705  ///
706  /// \param V Value to operate on
707  /// \param Result Result of the computation
708  ///
709  /// \returns false if no sensible information can be gathered.
710  static bool compute(Value *V, VectorInfo &Result, const DataLayout &DL) {
711  ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V);
712  if (SVI)
713  return computeFromSVI(SVI, Result, DL);
714  LoadInst *LI = dyn_cast<LoadInst>(V);
715  if (LI)
716  return computeFromLI(LI, Result, DL);
717  BitCastInst *BCI = dyn_cast<BitCastInst>(V);
718  if (BCI)
719  return computeFromBCI(BCI, Result, DL);
720  return false;
721  }
722 
723  /// BitCastInst specialization to compute the vector information.
724  ///
725  /// \param BCI BitCastInst to operate on
726  /// \param Result Result of the computation
727  ///
728  /// \returns false if no sensible information can be gathered.
729  static bool computeFromBCI(BitCastInst *BCI, VectorInfo &Result,
730  const DataLayout &DL) {
731  Instruction *Op = dyn_cast<Instruction>(BCI->getOperand(0));
732 
733  if (!Op)
734  return false;
735 
736  FixedVectorType *VTy = dyn_cast<FixedVectorType>(Op->getType());
737  if (!VTy)
738  return false;
739 
740  // We can only cast from large to smaller vectors
741  if (Result.VTy->getNumElements() % VTy->getNumElements())
742  return false;
743 
744  unsigned Factor = Result.VTy->getNumElements() / VTy->getNumElements();
745  unsigned NewSize = DL.getTypeAllocSize(Result.VTy->getElementType());
746  unsigned OldSize = DL.getTypeAllocSize(VTy->getElementType());
747 
748  if (NewSize * Factor != OldSize)
749  return false;
750 
751  VectorInfo Old(VTy);
752  if (!compute(Op, Old, DL))
753  return false;
754 
755  for (unsigned i = 0; i < Result.VTy->getNumElements(); i += Factor) {
756  for (unsigned j = 0; j < Factor; j++) {
757  Result.EI[i + j] =
758  ElementInfo(Old.EI[i / Factor].Ofs + j * NewSize,
759  j == 0 ? Old.EI[i / Factor].LI : nullptr);
760  }
761  }
762 
763  Result.BB = Old.BB;
764  Result.PV = Old.PV;
765  Result.LIs.insert(Old.LIs.begin(), Old.LIs.end());
766  Result.Is.insert(Old.Is.begin(), Old.Is.end());
767  Result.Is.insert(BCI);
768  Result.SVI = nullptr;
769 
770  return true;
771  }
772 
773  /// ShuffleVectorInst specialization to compute vector information.
774  ///
775  /// \param SVI ShuffleVectorInst to operate on
776  /// \param Result Result of the computation
777  ///
778  /// Compute the left and the right side vector information and merge them by
779  /// applying the shuffle operation. This function also ensures that the left
780  /// and right side have compatible loads. This means that all loads are with
781  /// in the same basic block and are based on the same pointer.
782  ///
783  /// \returns false if no sensible information can be gathered.
784  static bool computeFromSVI(ShuffleVectorInst *SVI, VectorInfo &Result,
785  const DataLayout &DL) {
786  FixedVectorType *ArgTy =
787  cast<FixedVectorType>(SVI->getOperand(0)->getType());
788 
789  // Compute the left hand vector information.
790  VectorInfo LHS(ArgTy);
791  if (!compute(SVI->getOperand(0), LHS, DL))
792  LHS.BB = nullptr;
793 
794  // Compute the right hand vector information.
795  VectorInfo RHS(ArgTy);
796  if (!compute(SVI->getOperand(1), RHS, DL))
797  RHS.BB = nullptr;
798 
799  // Neither operand produced sensible results?
800  if (!LHS.BB && !RHS.BB)
801  return false;
802  // Only RHS produced sensible results?
803  else if (!LHS.BB) {
804  Result.BB = RHS.BB;
805  Result.PV = RHS.PV;
806  }
807  // Only LHS produced sensible results?
808  else if (!RHS.BB) {
809  Result.BB = LHS.BB;
810  Result.PV = LHS.PV;
811  }
812  // Both operands produced sensible results?
813  else if ((LHS.BB == RHS.BB) && (LHS.PV == RHS.PV)) {
814  Result.BB = LHS.BB;
815  Result.PV = LHS.PV;
816  }
817  // Both operands produced sensible results but they are incompatible.
818  else {
819  return false;
820  }
821 
822  // Merge and apply the operation on the offset information.
823  if (LHS.BB) {
824  Result.LIs.insert(LHS.LIs.begin(), LHS.LIs.end());
825  Result.Is.insert(LHS.Is.begin(), LHS.Is.end());
826  }
827  if (RHS.BB) {
828  Result.LIs.insert(RHS.LIs.begin(), RHS.LIs.end());
829  Result.Is.insert(RHS.Is.begin(), RHS.Is.end());
830  }
831  Result.Is.insert(SVI);
832  Result.SVI = SVI;
833 
834  int j = 0;
835  for (int i : SVI->getShuffleMask()) {
836  assert((i < 2 * (signed)ArgTy->getNumElements()) &&
837  "Invalid ShuffleVectorInst (index out of bounds)");
838 
839  if (i < 0)
840  Result.EI[j] = ElementInfo();
841  else if (i < (signed)ArgTy->getNumElements()) {
842  if (LHS.BB)
843  Result.EI[j] = LHS.EI[i];
844  else
845  Result.EI[j] = ElementInfo();
846  } else {
847  if (RHS.BB)
848  Result.EI[j] = RHS.EI[i - ArgTy->getNumElements()];
849  else
850  Result.EI[j] = ElementInfo();
851  }
852  j++;
853  }
854 
855  return true;
856  }
857 
858  /// LoadInst specialization to compute vector information.
859  ///
860  /// This function also acts as abort condition to the recursion.
861  ///
862  /// \param LI LoadInst to operate on
863  /// \param Result Result of the computation
864  ///
865  /// \returns false if no sensible information can be gathered.
866  static bool computeFromLI(LoadInst *LI, VectorInfo &Result,
867  const DataLayout &DL) {
868  Value *BasePtr;
869  Polynomial Offset;
870 
871  if (LI->isVolatile())
872  return false;
873 
874  if (LI->isAtomic())
875  return false;
876 
877  // Get the base polynomial
878  computePolynomialFromPointer(*LI->getPointerOperand(), Offset, BasePtr, DL);
879 
880  Result.BB = LI->getParent();
881  Result.PV = BasePtr;
882  Result.LIs.insert(LI);
883  Result.Is.insert(LI);
884 
885  for (unsigned i = 0; i < Result.getDimension(); i++) {
886  Value *Idx[2] = {
889  };
890  int64_t Ofs = DL.getIndexedOffsetInType(Result.VTy, makeArrayRef(Idx, 2));
891  Result.EI[i] = ElementInfo(Offset + Ofs, i == 0 ? LI : nullptr);
892  }
893 
894  return true;
895  }
896 
897  /// Recursively compute polynomial of a value.
898  ///
899  /// \param BO Input binary operation
900  /// \param Result Result polynomial
901  static void computePolynomialBinOp(BinaryOperator &BO, Polynomial &Result) {
902  Value *LHS = BO.getOperand(0);
903  Value *RHS = BO.getOperand(1);
904 
905  // Find the RHS Constant if any
906  ConstantInt *C = dyn_cast<ConstantInt>(RHS);
907  if ((!C) && BO.isCommutative()) {
908  C = dyn_cast<ConstantInt>(LHS);
909  if (C)
910  std::swap(LHS, RHS);
911  }
912 
913  switch (BO.getOpcode()) {
914  case Instruction::Add:
915  if (!C)
916  break;
917 
918  computePolynomial(*LHS, Result);
919  Result.add(C->getValue());
920  return;
921 
922  case Instruction::LShr:
923  if (!C)
924  break;
925 
926  computePolynomial(*LHS, Result);
927  Result.lshr(C->getValue());
928  return;
929 
930  default:
931  break;
932  }
933 
934  Result = Polynomial(&BO);
935  }
936 
937  /// Recursively compute polynomial of a value
938  ///
939  /// \param V input value
940  /// \param Result result polynomial
941  static void computePolynomial(Value &V, Polynomial &Result) {
942  if (auto *BO = dyn_cast<BinaryOperator>(&V))
943  computePolynomialBinOp(*BO, Result);
944  else
945  Result = Polynomial(&V);
946  }
947 
948  /// Compute the Polynomial representation of a Pointer type.
949  ///
950  /// \param Ptr input pointer value
951  /// \param Result result polynomial
952  /// \param BasePtr pointer the polynomial is based on
953  /// \param DL Datalayout of the target machine
954  static void computePolynomialFromPointer(Value &Ptr, Polynomial &Result,
955  Value *&BasePtr,
956  const DataLayout &DL) {
957  // Not a pointer type? Return an undefined polynomial
958  PointerType *PtrTy = dyn_cast<PointerType>(Ptr.getType());
959  if (!PtrTy) {
960  Result = Polynomial();
961  BasePtr = nullptr;
962  return;
963  }
964  unsigned PointerBits =
965  DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace());
966 
967  /// Skip pointer casts. Return Zero polynomial otherwise
968  if (isa<CastInst>(&Ptr)) {
969  CastInst &CI = *cast<CastInst>(&Ptr);
970  switch (CI.getOpcode()) {
971  case Instruction::BitCast:
972  computePolynomialFromPointer(*CI.getOperand(0), Result, BasePtr, DL);
973  break;
974  default:
975  BasePtr = &Ptr;
976  Polynomial(PointerBits, 0);
977  break;
978  }
979  }
980  /// Resolve GetElementPtrInst.
981  else if (isa<GetElementPtrInst>(&Ptr)) {
982  GetElementPtrInst &GEP = *cast<GetElementPtrInst>(&Ptr);
983 
984  APInt BaseOffset(PointerBits, 0);
985 
986  // Check if we can compute the Offset with accumulateConstantOffset
987  if (GEP.accumulateConstantOffset(DL, BaseOffset)) {
988  Result = Polynomial(BaseOffset);
989  BasePtr = GEP.getPointerOperand();
990  return;
991  } else {
992  // Otherwise we allow that the last index operand of the GEP is
993  // non-constant.
994  unsigned idxOperand, e;
995  SmallVector<Value *, 4> Indices;
996  for (idxOperand = 1, e = GEP.getNumOperands(); idxOperand < e;
997  idxOperand++) {
998  ConstantInt *IDX = dyn_cast<ConstantInt>(GEP.getOperand(idxOperand));
999  if (!IDX)
1000  break;
1001  Indices.push_back(IDX);
1002  }
1003 
1004  // It must also be the last operand.
1005  if (idxOperand + 1 != e) {
1006  Result = Polynomial();
1007  BasePtr = nullptr;
1008  return;
1009  }
1010 
1011  // Compute the polynomial of the index operand.
1012  computePolynomial(*GEP.getOperand(idxOperand), Result);
1013 
1014  // Compute base offset from zero based index, excluding the last
1015  // variable operand.
1016  BaseOffset =
1017  DL.getIndexedOffsetInType(GEP.getSourceElementType(), Indices);
1018 
1019  // Apply the operations of GEP to the polynomial.
1020  unsigned ResultSize = DL.getTypeAllocSize(GEP.getResultElementType());
1021  Result.sextOrTrunc(PointerBits);
1022  Result.mul(APInt(PointerBits, ResultSize));
1023  Result.add(BaseOffset);
1024  BasePtr = GEP.getPointerOperand();
1025  }
1026  }
1027  // All other instructions are handled by using the value as base pointer and
1028  // a zero polynomial.
1029  else {
1030  BasePtr = &Ptr;
1031  Polynomial(DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace()), 0);
1032  }
1033  }
1034 
1035 #ifndef NDEBUG
1036  void print(raw_ostream &OS) const {
1037  if (PV)
1038  OS << *PV;
1039  else
1040  OS << "(none)";
1041  OS << " + ";
1042  for (unsigned i = 0; i < getDimension(); i++)
1043  OS << ((i == 0) ? "[" : ", ") << EI[i].Ofs;
1044  OS << "]";
1045  }
1046 #endif
1047 };
1048 
1049 } // anonymous namespace
1050 
1051 bool InterleavedLoadCombineImpl::findPattern(
1052  std::list<VectorInfo> &Candidates, std::list<VectorInfo> &InterleavedLoad,
1053  unsigned Factor, const DataLayout &DL) {
1054  for (auto C0 = Candidates.begin(), E0 = Candidates.end(); C0 != E0; ++C0) {
1055  unsigned i;
1056  // Try to find an interleaved load using the front of Worklist as first line
1057  unsigned Size = DL.getTypeAllocSize(C0->VTy->getElementType());
1058 
1059  // List containing iterators pointing to the VectorInfos of the candidates
1060  std::vector<std::list<VectorInfo>::iterator> Res(Factor, Candidates.end());
1061 
1062  for (auto C = Candidates.begin(), E = Candidates.end(); C != E; C++) {
1063  if (C->VTy != C0->VTy)
1064  continue;
1065  if (C->BB != C0->BB)
1066  continue;
1067  if (C->PV != C0->PV)
1068  continue;
1069 
1070  // Check the current value matches any of factor - 1 remaining lines
1071  for (i = 1; i < Factor; i++) {
1072  if (C->EI[0].Ofs.isProvenEqualTo(C0->EI[0].Ofs + i * Size)) {
1073  Res[i] = C;
1074  }
1075  }
1076 
1077  for (i = 1; i < Factor; i++) {
1078  if (Res[i] == Candidates.end())
1079  break;
1080  }
1081  if (i == Factor) {
1082  Res[0] = C0;
1083  break;
1084  }
1085  }
1086 
1087  if (Res[0] != Candidates.end()) {
1088  // Move the result into the output
1089  for (unsigned i = 0; i < Factor; i++) {
1090  InterleavedLoad.splice(InterleavedLoad.end(), Candidates, Res[i]);
1091  }
1092 
1093  return true;
1094  }
1095  }
1096  return false;
1097 }
1098 
1099 LoadInst *
1100 InterleavedLoadCombineImpl::findFirstLoad(const std::set<LoadInst *> &LIs) {
1101  assert(!LIs.empty() && "No load instructions given.");
1102 
1103  // All LIs are within the same BB. Select the first for a reference.
1104  BasicBlock *BB = (*LIs.begin())->getParent();
1106  *BB, [&LIs](Instruction &I) -> bool { return is_contained(LIs, &I); });
1107  assert(FLI != BB->end());
1108 
1109  return cast<LoadInst>(FLI);
1110 }
1111 
1112 bool InterleavedLoadCombineImpl::combine(std::list<VectorInfo> &InterleavedLoad,
1114  LLVM_DEBUG(dbgs() << "Checking interleaved load\n");
1115 
1116  // The insertion point is the LoadInst which loads the first values. The
1117  // following tests are used to proof that the combined load can be inserted
1118  // just before InsertionPoint.
1119  LoadInst *InsertionPoint = InterleavedLoad.front().EI[0].LI;
1120 
1121  // Test if the offset is computed
1122  if (!InsertionPoint)
1123  return false;
1124 
1125  std::set<LoadInst *> LIs;
1126  std::set<Instruction *> Is;
1127  std::set<Instruction *> SVIs;
1128 
1129  InstructionCost InterleavedCost;
1132 
1133  // Get the interleave factor
1134  unsigned Factor = InterleavedLoad.size();
1135 
1136  // Merge all input sets used in analysis
1137  for (auto &VI : InterleavedLoad) {
1138  // Generate a set of all load instructions to be combined
1139  LIs.insert(VI.LIs.begin(), VI.LIs.end());
1140 
1141  // Generate a set of all instructions taking part in load
1142  // interleaved. This list excludes the instructions necessary for the
1143  // polynomial construction.
1144  Is.insert(VI.Is.begin(), VI.Is.end());
1145 
1146  // Generate the set of the final ShuffleVectorInst.
1147  SVIs.insert(VI.SVI);
1148  }
1149 
1150  // There is nothing to combine.
1151  if (LIs.size() < 2)
1152  return false;
1153 
1154  // Test if all participating instruction will be dead after the
1155  // transformation. If intermediate results are used, no performance gain can
1156  // be expected. Also sum the cost of the Instructions beeing left dead.
1157  for (auto &I : Is) {
1158  // Compute the old cost
1160 
1161  // The final SVIs are allowed not to be dead, all uses will be replaced
1162  if (SVIs.find(I) != SVIs.end())
1163  continue;
1164 
1165  // If there are users outside the set to be eliminated, we abort the
1166  // transformation. No gain can be expected.
1167  for (auto *U : I->users()) {
1168  if (Is.find(dyn_cast<Instruction>(U)) == Is.end())
1169  return false;
1170  }
1171  }
1172 
1173  // We need to have a valid cost in order to proceed.
1174  if (!InstructionCost.isValid())
1175  return false;
1176 
1177  // We know that all LoadInst are within the same BB. This guarantees that
1178  // either everything or nothing is loaded.
1179  LoadInst *First = findFirstLoad(LIs);
1180 
1181  // To be safe that the loads can be combined, iterate over all loads and test
1182  // that the corresponding defining access dominates first LI. This guarantees
1183  // that there are no aliasing stores in between the loads.
1184  auto FMA = MSSA.getMemoryAccess(First);
1185  for (auto LI : LIs) {
1186  auto MADef = MSSA.getMemoryAccess(LI)->getDefiningAccess();
1187  if (!MSSA.dominates(MADef, FMA))
1188  return false;
1189  }
1190  assert(!LIs.empty() && "There are no LoadInst to combine");
1191 
1192  // It is necessary that insertion point dominates all final ShuffleVectorInst.
1193  for (auto &VI : InterleavedLoad) {
1194  if (!DT.dominates(InsertionPoint, VI.SVI))
1195  return false;
1196  }
1197 
1198  // All checks are done. Add instructions detectable by InterleavedAccessPass
1199  // The old instruction will are left dead.
1200  IRBuilder<> Builder(InsertionPoint);
1201  Type *ETy = InterleavedLoad.front().SVI->getType()->getElementType();
1202  unsigned ElementsPerSVI =
1203  cast<FixedVectorType>(InterleavedLoad.front().SVI->getType())
1204  ->getNumElements();
1205  FixedVectorType *ILTy = FixedVectorType::get(ETy, Factor * ElementsPerSVI);
1206 
1207  auto Indices = llvm::to_vector<4>(llvm::seq<unsigned>(0, Factor));
1208  InterleavedCost = TTI.getInterleavedMemoryOpCost(
1209  Instruction::Load, ILTy, Factor, Indices, InsertionPoint->getAlign(),
1210  InsertionPoint->getPointerAddressSpace(), CostKind);
1211 
1212  if (InterleavedCost >= InstructionCost) {
1213  return false;
1214  }
1215 
1216  // Create a pointer cast for the wide load.
1217  auto CI = Builder.CreatePointerCast(InsertionPoint->getOperand(0),
1218  ILTy->getPointerTo(),
1219  "interleaved.wide.ptrcast");
1220 
1221  // Create the wide load and update the MemorySSA.
1222  auto LI = Builder.CreateAlignedLoad(ILTy, CI, InsertionPoint->getAlign(),
1223  "interleaved.wide.load");
1224  auto MSSAU = MemorySSAUpdater(&MSSA);
1225  MemoryUse *MSSALoad = cast<MemoryUse>(MSSAU.createMemoryAccessBefore(
1226  LI, nullptr, MSSA.getMemoryAccess(InsertionPoint)));
1227  MSSAU.insertUse(MSSALoad);
1228 
1229  // Create the final SVIs and replace all uses.
1230  int i = 0;
1231  for (auto &VI : InterleavedLoad) {
1233  for (unsigned j = 0; j < ElementsPerSVI; j++)
1234  Mask.push_back(i + j * Factor);
1235 
1236  Builder.SetInsertPoint(VI.SVI);
1237  auto SVI = Builder.CreateShuffleVector(LI, Mask, "interleaved.shuffle");
1238  VI.SVI->replaceAllUsesWith(SVI);
1239  i++;
1240  }
1241 
1242  NumInterleavedLoadCombine++;
1243  ORE.emit([&]() {
1244  return OptimizationRemark(DEBUG_TYPE, "Combined Interleaved Load", LI)
1245  << "Load interleaved combined with factor "
1246  << ore::NV("Factor", Factor);
1247  });
1248 
1249  return true;
1250 }
1251 
1254  bool changed = false;
1255  unsigned MaxFactor = TLI.getMaxSupportedInterleaveFactor();
1256 
1257  auto &DL = F.getParent()->getDataLayout();
1258 
1259  // Start with the highest factor to avoid combining and recombining.
1260  for (unsigned Factor = MaxFactor; Factor >= 2; Factor--) {
1261  std::list<VectorInfo> Candidates;
1262 
1263  for (BasicBlock &BB : F) {
1264  for (Instruction &I : BB) {
1265  if (auto SVI = dyn_cast<ShuffleVectorInst>(&I)) {
1266  // We don't support scalable vectors in this pass.
1267  if (isa<ScalableVectorType>(SVI->getType()))
1268  continue;
1269 
1270  Candidates.emplace_back(cast<FixedVectorType>(SVI->getType()));
1271 
1272  if (!VectorInfo::computeFromSVI(SVI, Candidates.back(), DL)) {
1273  Candidates.pop_back();
1274  continue;
1275  }
1276 
1277  if (!Candidates.back().isInterleaved(Factor, DL)) {
1278  Candidates.pop_back();
1279  }
1280  }
1281  }
1282  }
1283 
1284  std::list<VectorInfo> InterleavedLoad;
1285  while (findPattern(Candidates, InterleavedLoad, Factor, DL)) {
1286  if (combine(InterleavedLoad, ORE)) {
1287  changed = true;
1288  } else {
1289  // Remove the first element of the Interleaved Load but put the others
1290  // back on the list and continue searching
1291  Candidates.splice(Candidates.begin(), InterleavedLoad,
1292  std::next(InterleavedLoad.begin()),
1293  InterleavedLoad.end());
1294  }
1295  InterleavedLoad.clear();
1296  }
1297  }
1298 
1299  return changed;
1300 }
1301 
1302 namespace {
1303 /// This pass combines interleaved loads into a pattern detectable by
1304 /// InterleavedAccessPass.
1305 struct InterleavedLoadCombine : public FunctionPass {
1306  static char ID;
1307 
1308  InterleavedLoadCombine() : FunctionPass(ID) {
1310  }
1311 
1312  StringRef getPassName() const override {
1313  return "Interleaved Load Combine Pass";
1314  }
1315 
1316  bool runOnFunction(Function &F) override {
1317  if (DisableInterleavedLoadCombine)
1318  return false;
1319 
1320  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
1321  if (!TPC)
1322  return false;
1323 
1324  LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName()
1325  << "\n");
1326 
1327  return InterleavedLoadCombineImpl(
1328  F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
1329  getAnalysis<MemorySSAWrapperPass>().getMSSA(),
1330  TPC->getTM<TargetMachine>())
1331  .run();
1332  }
1333 
1334  void getAnalysisUsage(AnalysisUsage &AU) const override {
1338  }
1339 
1340 private:
1341 };
1342 } // anonymous namespace
1343 
1345 
1347  InterleavedLoadCombine, DEBUG_TYPE,
1348  "Combine interleaved loads into wide loads and shufflevector instructions",
1349  false, false)
1353  InterleavedLoadCombine, DEBUG_TYPE,
1354  "Combine interleaved loads into wide loads and shufflevector instructions",
1356 
1357 FunctionPass *
1359  auto P = new InterleavedLoadCombine();
1360  return P;
1361 }
llvm::Check::Size
@ Size
Definition: FileCheck.h:76
i
i
Definition: README.txt:29
llvm::InstructionCost
Definition: InstructionCost.h:29
llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition: TargetTransformInfo.h:210
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:17
llvm::initializeInterleavedLoadCombinePass
void initializeInterleavedLoadCombinePass(PassRegistry &)
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:113
print
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
Definition: ArchiveWriter.cpp:152
llvm::BasicBlock::iterator
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:87
llvm::ShuffleVectorInst::getShuffleMask
static void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
Definition: Instructions.cpp:2115
MemorySSAUpdater.h
llvm::Function
Definition: Function.h:60
P
This currently compiles esp xmm0 movsd esp eax eax esp ret We should use not the dag combiner This is because dagcombine2 needs to be able to see through the X86ISD::Wrapper which DAGCombine can t really do The code for turning x load into a single vector load is target independent and should be moved to the dag combiner The code for turning x load into a vector load can only handle a direct load from a global or a direct load from the stack It should be generalized to handle any load from P
Definition: README-SSE.txt:411
Pass.h
llvm::BitCastInst
This class represents a no-op cast from one type to another.
Definition: Instructions.h:5225
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1185
Statistic.h
ErrorHandling.h
llvm::TargetTransformInfo
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Definition: TargetTransformInfo.h:167
llvm::Type::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition: DerivedTypes.h:729
llvm::IRBuilder<>
OptimizationRemarkEmitter.h
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
llvm::ISD::FMA
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:482
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:139
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
Module.h
llvm::SPII::Load
@ Load
Definition: SparcInstrInfo.h:32
llvm::ore::NV
DiagnosticInfoOptimizationBase::Argument NV
Definition: OptimizationRemarkEmitter.h:136
llvm::VectorType::getElementType
Type * getElementType() const
Definition: DerivedTypes.h:422
RHS
Value * RHS
Definition: X86PartialReduction.cpp:76
llvm::LoadInst::getPointerOperand
Value * getPointerOperand()
Definition: Instructions.h:268
llvm::MemoryUse
Represents read-only accesses to memory.
Definition: MemorySSA.h:326
and
We currently generate a but we really shouldn eax ecx xorl edx divl ecx eax divl ecx movl eax ret A similar code sequence works for division We currently compile i32 v2 eax eax jo LBB1_2 and
Definition: README.txt:1271
llvm::FixedVectorType
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:525
llvm::LoadInst::getAlign
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:224
llvm::Type::getInt32Ty
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:239
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
F
#define F(x, y, z)
Definition: MD5.cpp:55
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:55
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
llvm::BitmaskEnumDetail::Mask
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:80
llvm::FixedVectorType::getNumElements
unsigned getNumElements() const
Definition: DerivedTypes.h:568
LHS
Value * LHS
Definition: X86PartialReduction.cpp:75
TargetLowering.h
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
llvm::MemorySSAWrapperPass
Legacy analysis pass which computes MemorySSA.
Definition: MemorySSA.h:998
TargetMachine.h
llvm::PassRegistry::getPassRegistry
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Definition: PassRegistry.cpp:31
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
llvm::TargetLowering
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
Definition: TargetLowering.h:3362
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
b
the resulting code requires compare and branches when and if the revised code is with conditional branches instead of More there is a byte word extend before each where there should be only and the condition codes are not remembered when the same two values are compared twice More LSR enhancements i8 and i32 load store addressing modes are identical int b
Definition: README.txt:418
false
Definition: StackSlotColoring.cpp:141
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
First
into llvm powi allowing the code generator to produce balanced multiplication trees First
Definition: README.txt:54
llvm::IntegerType
Class to represent integer types.
Definition: DerivedTypes.h:40
llvm::BinaryOperator::getOpcode
BinaryOps getOpcode() const
Definition: InstrTypes.h:392
llvm::Instruction
Definition: Instruction.h:42
llvm::operator-
APInt operator-(APInt)
Definition: APInt.h:2079
llvm::DominatorTreeWrapperPass
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:302
llvm::STATISTIC
STATISTIC(NumFunctions, "Total number of functions")
llvm::raw_ostream
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:54
llvm::operator<<
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
Definition: APFixedPoint.h:230
llvm::ConstantInt::get
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:919
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:684
c
the resulting code requires compare and branches when and if the revised code is with conditional branches instead of More there is a byte word extend before each where there should be only and the condition codes are not remembered when the same two values are compared twice More LSR enhancements i8 and i32 load store addressing modes are identical int int c
Definition: README.txt:418
llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
DEBUG_TYPE
#define DEBUG_TYPE
Definition: InterleavedLoadCombinePass.cpp:49
llvm::Instruction::isCommutative
bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
Definition: Instruction.cpp:770
llvm::dxil::PointerTypeAnalysis::run
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
Definition: PointerTypeAnalysis.cpp:101
into
Clang compiles this into
Definition: README.txt:504
Passes.h
Combine
Hexagon Vector Combine
Definition: HexagonVectorCombine.cpp:1527
llvm::cl::opt< bool >
llvm::LoadInst::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:274
llvm::instructions
inst_range instructions(Function *F)
Definition: InstIterator.h:133
VI
@ VI
Definition: SIInstrInfo.cpp:7834
uint64_t
llvm::MemorySSAUpdater
Definition: MemorySSAUpdater.h:54
llvm::TargetTransformInfo::getInterleavedMemoryOpCost
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
Definition: TargetTransformInfo.cpp:891
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
llvm::ARM_AM::add
@ add
Definition: ARMAddressingModes.h:39
llvm::numbers::e
constexpr double e
Definition: MathExtras.h:57
llvm::MemorySSA
Encapsulates MemorySSA, including all data associated with memory accesses.
Definition: MemorySSA.h:714
I
#define I(x, y, z)
Definition: MD5.cpp:58
llvm::GetElementPtrInst
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:929
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:432
llvm::PointerType
Class to represent pointers.
Definition: DerivedTypes.h:632
llvm::is_contained
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.
Definition: STLExtras.h:1670
TargetPassConfig.h
IRBuilder.h
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
std::swap
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:853
llvm::OptimizationRemarkEmitter::emit
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Definition: OptimizationRemarkEmitter.cpp:77
llvm::TTI
TargetTransformInfo TTI
Definition: TargetTransformInfo.h:162
llvm::operator+
APInt operator+(APInt a, const APInt &b)
Definition: APInt.h:2084
Builder
assume Assume Builder
Definition: AssumeBundleBuilder.cpp:651
llvm::APInt
Class for arbitrary precision integers.
Definition: APInt.h:75
llvm::BinaryOperator
Definition: InstrTypes.h:188
llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition: OptimizationRemarkEmitter.h:33
Mul
BinaryOperator * Mul
Definition: X86PartialReduction.cpp:70
DataLayout.h
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:58
llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:214
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:143
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
CostKind
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
getParent
static const Function * getParent(const Value *V)
Definition: BasicAliasAnalysis.cpp:868
llvm::Value::getContext
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:991
TargetSubtargetInfo.h
DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: AArch64SLSHardening.cpp:76
S
add sub stmia L5 ldr r0 bl L_printf $stub Instead of a and a wouldn t it be better to do three moves *Return an aggregate type is even return S
Definition: README.txt:210
llvm::CastInst
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:429
llvm::InstructionCost::isValid
bool isValid() const
Definition: InstructionCost.h:79
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:176
llvm::find_if
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1632
runOnFunction
static bool runOnFunction(Function &F, bool PostInlining)
Definition: EntryExitInstrumenter.cpp:69
j
return j(j<< 16)
llvm::Instruction::isAtomic
bool isAtomic() const
Return true if this instruction has an AtomicOrdering of unordered or higher.
Definition: Instruction.cpp:616
llvm::AMDGPU::SendMsg::Op
Op
Definition: SIDefines.h:341
Function.h
llvm::BitWidth
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:147
llvm::Type::getPointerTo
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
Definition: Type.cpp:774
llvm::TargetTransformInfo::getInstructionCost
InstructionCost getInstructionCost(const Instruction *I, enum TargetCostKind kind) const
Query the cost of a specified instruction.
Definition: TargetTransformInfo.h:224
llvm::MCID::Add
@ Add
Definition: MCInstrDesc.h:185
llvm::makeArrayRef
ArrayRef< T > makeArrayRef(const T &OneElt)
Construct an ArrayRef from a single element.
Definition: ArrayRef.h:475
llvm::ShuffleVectorInst
This instruction constructs a fixed permutation of two input vectors.
Definition: Instructions.h:2008
llvm::OptimizationRemark
Diagnostic information for applied optimization remarks.
Definition: DiagnosticInfo.h:690
MemorySSA.h
Instructions.h
llvm::LoadInst::isVolatile
bool isVolatile() const
Return true if this is a load from a volatile memory location.
Definition: Instructions.h:213
Dominators.h
llvm::CastInst::getOpcode
Instruction::CastOps getOpcode() const
Return the opcode of this CastInst.
Definition: InstrTypes.h:676
llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:91
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(InterleavedLoadCombine, DEBUG_TYPE, "Combine interleaved loads into wide loads and shufflevector instructions", false, false) INITIALIZE_PASS_END(InterleavedLoadCombine
llvm::IntegerType::getBitWidth
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
Definition: DerivedTypes.h:72
llvm::max
Align max(MaybeAlign Lhs, Align Rhs)
Definition: Alignment.h:340
TargetTransformInfo.h
llvm::Pass::getAnalysisUsage
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:97
TM
const char LLVMTargetMachineRef TM
Definition: PassBuilderBindings.cpp:47
llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:308
BB
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM BB
Definition: README.txt:39
GEP
Hexagon Common GEP
Definition: HexagonCommonGEP.cpp:172
llvm::createInterleavedLoadCombinePass
FunctionPass * createInterleavedLoadCombinePass()
InterleavedLoadCombines Pass - This pass identifies interleaved loads and combines them into wide loa...
Definition: InterleavedLoadCombinePass.cpp:1358
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
lshr
Vector Shift Left don t map to llvm shl and lshr
Definition: README_P9.txt:118
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
llvm::cl::desc
Definition: CommandLine.h:405
raw_ostream.h
n
The same transformation can work with an even modulo with the addition of a and shrink the compare RHS by the same amount Unless the target supports that transformation probably isn t worthwhile The transformation can also easily be made to work with non zero equality for n
Definition: README.txt:685
combine
vector combine
Definition: VectorCombine.cpp:1548
InitializePasses.h
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
Debug.h
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:37