LLVM  14.0.0git
SVEIntrinsicOpts.cpp
Go to the documentation of this file.
1 //===----- SVEIntrinsicOpts - SVE ACLE Intrinsics Opts --------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // Performs general IR level optimizations on SVE intrinsics.
11 //
12 // This pass performs the following optimizations:
13 //
14 // - removes unnecessary ptrue intrinsics (llvm.aarch64.sve.ptrue), e.g:
15 // %1 = @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
16 // %2 = @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
17 // ; (%1 can be replaced with a reinterpret of %2)
18 //
19 // - optimizes ptest intrinsics where the operands are being needlessly
20 // converted to and from svbool_t.
21 //
22 //===----------------------------------------------------------------------===//
23 
24 #include "AArch64.h"
25 #include "Utils/AArch64BaseInfo.h"
27 #include "llvm/ADT/SetVector.h"
28 #include "llvm/IR/Constants.h"
29 #include "llvm/IR/Dominators.h"
30 #include "llvm/IR/IRBuilder.h"
31 #include "llvm/IR/Instructions.h"
32 #include "llvm/IR/IntrinsicInst.h"
33 #include "llvm/IR/IntrinsicsAArch64.h"
34 #include "llvm/IR/LLVMContext.h"
35 #include "llvm/IR/PatternMatch.h"
36 #include "llvm/InitializePasses.h"
37 #include "llvm/Support/Debug.h"
38 
39 using namespace llvm;
40 using namespace llvm::PatternMatch;
41 
42 #define DEBUG_TYPE "aarch64-sve-intrinsic-opts"
43 
44 namespace llvm {
46 }
47 
48 namespace {
49 struct SVEIntrinsicOpts : public ModulePass {
50  static char ID; // Pass identification, replacement for typeid
51  SVEIntrinsicOpts() : ModulePass(ID) {
53  }
54 
55  bool runOnModule(Module &M) override;
56  void getAnalysisUsage(AnalysisUsage &AU) const override;
57 
58 private:
59  bool coalescePTrueIntrinsicCalls(BasicBlock &BB,
61  bool optimizePTrueIntrinsicCalls(SmallSetVector<Function *, 4> &Functions);
62 
63  /// Operates at the function-scope. I.e., optimizations are applied local to
64  /// the functions themselves.
65  bool optimizeFunctions(SmallSetVector<Function *, 4> &Functions);
66 };
67 } // end anonymous namespace
68 
69 void SVEIntrinsicOpts::getAnalysisUsage(AnalysisUsage &AU) const {
71  AU.setPreservesCFG();
72 }
73 
74 char SVEIntrinsicOpts::ID = 0;
75 static const char *name = "SVE intrinsics optimizations";
76 INITIALIZE_PASS_BEGIN(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)
78 INITIALIZE_PASS_END(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)
79 
81  return new SVEIntrinsicOpts();
82 }
83 
84 /// Checks if a ptrue intrinsic call is promoted. The act of promoting a
85 /// ptrue will introduce zeroing. For example:
86 ///
87 /// %1 = <vscale x 4 x i1> call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
88 /// %2 = <vscale x 16 x i1> call @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
89 /// %3 = <vscale x 8 x i1> call @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %2)
90 ///
91 /// %1 is promoted, because it is converted:
92 ///
93 /// <vscale x 4 x i1> => <vscale x 16 x i1> => <vscale x 8 x i1>
94 ///
95 /// via a sequence of the SVE reinterpret intrinsics convert.{to,from}.svbool.
96 static bool isPTruePromoted(IntrinsicInst *PTrue) {
97  // Find all users of this intrinsic that are calls to convert-to-svbool
98  // reinterpret intrinsics.
99  SmallVector<IntrinsicInst *, 4> ConvertToUses;
100  for (User *User : PTrue->users()) {
101  if (match(User, m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>())) {
102  ConvertToUses.push_back(cast<IntrinsicInst>(User));
103  }
104  }
105 
106  // If no such calls were found, this is ptrue is not promoted.
107  if (ConvertToUses.empty())
108  return false;
109 
110  // Otherwise, try to find users of the convert-to-svbool intrinsics that are
111  // calls to the convert-from-svbool intrinsic, and would result in some lanes
112  // being zeroed.
113  const auto *PTrueVTy = cast<ScalableVectorType>(PTrue->getType());
114  for (IntrinsicInst *ConvertToUse : ConvertToUses) {
115  for (User *User : ConvertToUse->users()) {
116  auto *IntrUser = dyn_cast<IntrinsicInst>(User);
117  if (IntrUser && IntrUser->getIntrinsicID() ==
118  Intrinsic::aarch64_sve_convert_from_svbool) {
119  const auto *IntrUserVTy = cast<ScalableVectorType>(IntrUser->getType());
120 
121  // Would some lanes become zeroed by the conversion?
122  if (IntrUserVTy->getElementCount().getKnownMinValue() >
123  PTrueVTy->getElementCount().getKnownMinValue())
124  // This is a promoted ptrue.
125  return true;
126  }
127  }
128  }
129 
130  // If no matching calls were found, this is not a promoted ptrue.
131  return false;
132 }
133 
134 /// Attempts to coalesce ptrues in a basic block.
135 bool SVEIntrinsicOpts::coalescePTrueIntrinsicCalls(
137  if (PTrues.size() <= 1)
138  return false;
139 
140  // Find the ptrue with the most lanes.
141  auto *MostEncompassingPTrue = *std::max_element(
142  PTrues.begin(), PTrues.end(), [](auto *PTrue1, auto *PTrue2) {
143  auto *PTrue1VTy = cast<ScalableVectorType>(PTrue1->getType());
144  auto *PTrue2VTy = cast<ScalableVectorType>(PTrue2->getType());
145  return PTrue1VTy->getElementCount().getKnownMinValue() <
146  PTrue2VTy->getElementCount().getKnownMinValue();
147  });
148 
149  // Remove the most encompassing ptrue, as well as any promoted ptrues, leaving
150  // behind only the ptrues to be coalesced.
151  PTrues.remove(MostEncompassingPTrue);
152  PTrues.remove_if([](auto *PTrue) { return isPTruePromoted(PTrue); });
153 
154  // Hoist MostEncompassingPTrue to the start of the basic block. It is always
155  // safe to do this, since ptrue intrinsic calls are guaranteed to have no
156  // predecessors.
157  MostEncompassingPTrue->moveBefore(BB, BB.getFirstInsertionPt());
158 
159  LLVMContext &Ctx = BB.getContext();
160  IRBuilder<> Builder(Ctx);
161  Builder.SetInsertPoint(&BB, ++MostEncompassingPTrue->getIterator());
162 
163  auto *MostEncompassingPTrueVTy =
164  cast<VectorType>(MostEncompassingPTrue->getType());
165  auto *ConvertToSVBool = Builder.CreateIntrinsic(
166  Intrinsic::aarch64_sve_convert_to_svbool, {MostEncompassingPTrueVTy},
167  {MostEncompassingPTrue});
168 
169  bool ConvertFromCreated = false;
170  for (auto *PTrue : PTrues) {
171  auto *PTrueVTy = cast<VectorType>(PTrue->getType());
172 
173  // Only create the converts if the types are not already the same, otherwise
174  // just use the most encompassing ptrue.
175  if (MostEncompassingPTrueVTy != PTrueVTy) {
176  ConvertFromCreated = true;
177 
178  Builder.SetInsertPoint(&BB, ++ConvertToSVBool->getIterator());
179  auto *ConvertFromSVBool =
180  Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
181  {PTrueVTy}, {ConvertToSVBool});
182  PTrue->replaceAllUsesWith(ConvertFromSVBool);
183  } else
184  PTrue->replaceAllUsesWith(MostEncompassingPTrue);
185 
186  PTrue->eraseFromParent();
187  }
188 
189  // We never used the ConvertTo so remove it
190  if (!ConvertFromCreated)
191  ConvertToSVBool->eraseFromParent();
192 
193  return true;
194 }
195 
196 /// The goal of this function is to remove redundant calls to the SVE ptrue
197 /// intrinsic in each basic block within the given functions.
198 ///
199 /// SVE ptrues have two representations in LLVM IR:
200 /// - a logical representation -- an arbitrary-width scalable vector of i1s,
201 /// i.e. <vscale x N x i1>.
202 /// - a physical representation (svbool, <vscale x 16 x i1>) -- a 16-element
203 /// scalable vector of i1s, i.e. <vscale x 16 x i1>.
204 ///
205 /// The SVE ptrue intrinsic is used to create a logical representation of an SVE
206 /// predicate. Suppose that we have two SVE ptrue intrinsic calls: P1 and P2. If
207 /// P1 creates a logical SVE predicate that is at least as wide as the logical
208 /// SVE predicate created by P2, then all of the bits that are true in the
209 /// physical representation of P2 are necessarily also true in the physical
210 /// representation of P1. P1 'encompasses' P2, therefore, the intrinsic call to
211 /// P2 is redundant and can be replaced by an SVE reinterpret of P1 via
212 /// convert.{to,from}.svbool.
213 ///
214 /// Currently, this pass only coalesces calls to SVE ptrue intrinsics
215 /// if they match the following conditions:
216 ///
217 /// - the call to the intrinsic uses either the SV_ALL or SV_POW2 patterns.
218 /// SV_ALL indicates that all bits of the predicate vector are to be set to
219 /// true. SV_POW2 indicates that all bits of the predicate vector up to the
220 /// largest power-of-two are to be set to true.
221 /// - the result of the call to the intrinsic is not promoted to a wider
222 /// predicate. In this case, keeping the extra ptrue leads to better codegen
223 /// -- coalescing here would create an irreducible chain of SVE reinterprets
224 /// via convert.{to,from}.svbool.
225 ///
226 /// EXAMPLE:
227 ///
228 /// %1 = <vscale x 8 x i1> ptrue(i32 SV_ALL)
229 /// ; Logical: <1, 1, 1, 1, 1, 1, 1, 1>
230 /// ; Physical: <1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0>
231 /// ...
232 ///
233 /// %2 = <vscale x 4 x i1> ptrue(i32 SV_ALL)
234 /// ; Logical: <1, 1, 1, 1>
235 /// ; Physical: <1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0>
236 /// ...
237 ///
238 /// Here, %2 can be replaced by an SVE reinterpret of %1, giving, for instance:
239 ///
240 /// %1 = <vscale x 8 x i1> ptrue(i32 i31)
241 /// %2 = <vscale x 16 x i1> convert.to.svbool(<vscale x 8 x i1> %1)
242 /// %3 = <vscale x 4 x i1> convert.from.svbool(<vscale x 16 x i1> %2)
243 ///
244 bool SVEIntrinsicOpts::optimizePTrueIntrinsicCalls(
245  SmallSetVector<Function *, 4> &Functions) {
246  bool Changed = false;
247 
248  for (auto *F : Functions) {
249  for (auto &BB : *F) {
252 
253  // For each basic block, collect the used ptrues and try to coalesce them.
254  for (Instruction &I : BB) {
255  if (I.use_empty())
256  continue;
257 
258  auto *IntrI = dyn_cast<IntrinsicInst>(&I);
259  if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
260  continue;
261 
262  const auto PTruePattern =
263  cast<ConstantInt>(IntrI->getOperand(0))->getZExtValue();
264 
265  if (PTruePattern == AArch64SVEPredPattern::all)
266  SVAllPTrues.insert(IntrI);
267  if (PTruePattern == AArch64SVEPredPattern::pow2)
268  SVPow2PTrues.insert(IntrI);
269  }
270 
271  Changed |= coalescePTrueIntrinsicCalls(BB, SVAllPTrues);
272  Changed |= coalescePTrueIntrinsicCalls(BB, SVPow2PTrues);
273  }
274  }
275 
276  return Changed;
277 }
278 
279 bool SVEIntrinsicOpts::optimizeFunctions(
280  SmallSetVector<Function *, 4> &Functions) {
281  bool Changed = false;
282 
283  Changed |= optimizePTrueIntrinsicCalls(Functions);
284 
285  return Changed;
286 }
287 
288 bool SVEIntrinsicOpts::runOnModule(Module &M) {
289  bool Changed = false;
291 
292  // Check for SVE intrinsic declarations first so that we only iterate over
293  // relevant functions. Where an appropriate declaration is found, store the
294  // function(s) where it is used so we can target these only.
295  for (auto &F : M.getFunctionList()) {
296  if (!F.isDeclaration())
297  continue;
298 
299  switch (F.getIntrinsicID()) {
300  case Intrinsic::aarch64_sve_ptrue:
301  for (User *U : F.users())
302  Functions.insert(cast<Instruction>(U)->getFunction());
303  break;
304  default:
305  break;
306  }
307  }
308 
309  if (!Functions.empty())
310  Changed |= optimizeFunctions(Functions);
311 
312  return Changed;
313 }
isPTruePromoted
static bool isPTruePromoted(IntrinsicInst *PTrue)
Checks if a ptrue intrinsic call is promoted.
Definition: SVEIntrinsicOpts.cpp:96
llvm
---------------------— PointerInfo ------------------------------------—
Definition: AllocatorList.h:23
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
AArch64.h
llvm::ModulePass
ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...
Definition: Pass.h:238
IntrinsicInst.h
getFunction
static Function * getFunction(Constant *C)
Definition: Evaluator.cpp:255
llvm::SetVector< T, SmallVector< T, N >, SmallDenseSet< T, N > >::size
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:77
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1168
llvm::IRBuilder<>
AArch64BaseInfo.h
llvm::createSVEIntrinsicOptsPass
ModulePass * createSVEIntrinsicOptsPass()
Definition: SVEIntrinsicOpts.cpp:80
DEBUG_TYPE
#define DEBUG_TYPE
Definition: SVEIntrinsicOpts.cpp:42
llvm::SetVector< T, SmallVector< T, N >, SmallDenseSet< T, N > >::remove_if
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition: SetVector.h:199
F
#define F(x, y, z)
Definition: MD5.cpp:56
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
llvm::SetVector< T, SmallVector< T, N >, SmallDenseSet< T, N > >::remove
bool remove(const value_type &X)
Remove an item from the set vector.
Definition: SetVector.h:157
llvm::SetVector< T, SmallVector< T, N >, SmallDenseSet< T, N > >::begin
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition: SetVector.h:82
llvm::PassRegistry::getPassRegistry
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Definition: PassRegistry.cpp:31
Constants.h
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
llvm::User
Definition: User.h:44
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
false
Definition: StackSlotColoring.cpp:142
llvm::Instruction
Definition: Instruction.h:45
llvm::DominatorTreeWrapperPass
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:281
llvm::PassRegistry
PassRegistry - This class manages the registration and intitialization of the pass subsystem as appli...
Definition: PassRegistry.h:38
PatternMatch.h
llvm::SetVector< T, SmallVector< T, N >, SmallDenseSet< T, N > >::empty
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:72
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
llvm::LegalityPredicates::all
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
Definition: LegalizerInfo.h:220
llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
I
#define I(x, y, z)
Definition: MD5.cpp:59
IRBuilder.h
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
Builder
assume Assume Builder
Definition: AssumeBundleBuilder.cpp:651
llvm::SetVector< T, SmallVector< T, N >, SmallDenseSet< T, N > >::insert
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:141
llvm::AnalysisUsage::setPreservesCFG
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:253
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
name
static const char * name
Definition: SVEIntrinsicOpts.cpp:75
llvm::initializeSVEIntrinsicOptsPass
void initializeSVEIntrinsicOptsPass(PassRegistry &)
llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:45
llvm::SetVector< T, SmallVector< T, N >, SmallDenseSet< T, N > >::end
iterator end()
Get an iterator to the end of the SetVector.
Definition: SetVector.h:92
Instructions.h
PostOrderIterator.h
INITIALIZE_PASS_BEGIN
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:51
Dominators.h
llvm::PatternMatch
Definition: PatternMatch.h:47
llvm::SmallSetVector
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:307
BB
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM BB
Definition: README.txt:39
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
LLVMContext.h
InitializePasses.h
Debug.h
llvm::Value::users
iterator_range< user_iterator > users()
Definition: Value.h:422
SetVector.h
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38