//===- AMDGPUUnifyDivergentExitNodes.cpp ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is a variant of the UnifyFunctionExitNodes pass. Rather than ensuring
// there is at most one ret and one unreachable instruction, it ensures there
// is at most one divergent exiting block.
//
// StructurizeCFG can't deal with multi-exit regions formed by branches to
// multiple return nodes. It is not desirable to structurize regions with
// uniform branches, so unifying those to the same return block as divergent
// branches inhibits use of scalar branching. It still can't deal with the case
// where one branch goes to return, and one unreachable. Replace unreachable in
// this case with a return.
//
//===----------------------------------------------------------------------===//
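
// Illustrative example (block and value names are hypothetical): a function
// with two divergent exits, e.g.
//
//   bb:
//     br i1 %divergent.cond, label %ret0, label %ret1
//   ret0:
//     ret void
//   ret1:
//     ret void
//
// is rewritten so that only one divergent exiting block remains:
//
//   bb:
//     br i1 %divergent.cond, label %ret0, label %ret1
//   ret0:
//     br label %UnifiedReturnBlock
//   ret1:
//     br label %UnifiedReturnBlock
//   UnifiedReturnBlock:
//     ret void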

#include "AMDGPU.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Type.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/Local.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-unify-divergent-exit-nodes"

namespace {

class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
public:
  static char ID; // Pass identification, replacement for typeid

  AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) {
    initializeAMDGPUUnifyDivergentExitNodesPass(*PassRegistry::getPassRegistry());
  }

  // We can preserve non-critical-edgeness when we unify function exit nodes
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnFunction(Function &F) override;
};

} // end anonymous namespace

char AMDGPUUnifyDivergentExitNodes::ID = 0;

char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID;

INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
                      "Unify divergent function exit nodes", false, false)
INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
                    "Unify divergent function exit nodes", false, false)

void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const {
  // TODO: Preserve dominator tree.
  AU.addRequired<PostDominatorTreeWrapperPass>();

  AU.addRequired<LegacyDivergenceAnalysis>();

  // No divergent values are changed, only blocks and branch edges.
  AU.addPreserved<LegacyDivergenceAnalysis>();

  // We preserve the non-critical-edgeness property
  AU.addPreservedID(BreakCriticalEdgesID);

  // This is a cluster of orthogonal Transforms
  AU.addPreservedID(LowerSwitchID);
  FunctionPass::getAnalysisUsage(AU);

  AU.addRequired<TargetTransformInfoWrapperPass>();
}

/// \returns true if \p BB is reachable through only uniform branches.
/// XXX - Is there a more efficient way to find this?
static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
                               BasicBlock &BB) {
  SmallVector<BasicBlock *, 8> Stack;
  SmallPtrSet<BasicBlock *, 8> Visited;

  for (BasicBlock *Pred : predecessors(&BB))
    Stack.push_back(Pred);

  while (!Stack.empty()) {
    BasicBlock *Top = Stack.pop_back_val();
    if (!DA.isUniform(Top->getTerminator()))
      return false;

    for (BasicBlock *Pred : predecessors(Top)) {
      if (Visited.insert(Pred).second)
        Stack.push_back(Pred);
    }
  }

  return true;
}
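
// Illustrative sketch (block and value names are hypothetical): in the CFG
// below, %merge is uniformly reached, since the only branch on any path to it
// is uniform. If %cond were divergent, isUniformlyReached would return false
// for %merge.
//
//   entry:
//     br i1 %cond, label %a, label %b   ; uniform branch
//   a:
//     br label %merge
//   b:
//     br label %merge
//   merge:
//     ret void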

static void removeDoneExport(Function &F) {
  ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext());
  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&I)) {
        if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
          Intrin->setArgOperand(6, BoolFalse); // done
        } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
          Intrin->setArgOperand(4, BoolFalse); // done
        }
      }
    }
  }
}
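
// For reference, the operand positions touched above correspond to the export
// intrinsic signatures (sketched; overload suffixes elided):
//
//   llvm.amdgcn.exp.*(i32 tgt, i32 en, ty, ty, ty, ty, i1 done, i1 vm)
//                                                      ^-- operand 6
//   llvm.amdgcn.exp.compr.*(i32 tgt, i32 en, ty, ty, i1 done, i1 vm)
//                                                    ^-- operand 4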

static BasicBlock *unifyReturnBlockSet(Function &F,
                                       ArrayRef<BasicBlock *> ReturningBlocks,
                                       bool InsertExport,
                                       const TargetTransformInfo &TTI,
                                       StringRef Name) {
  // Otherwise, we need to insert a new basic block into the function, add PHI
  // nodes (if the function returns values), and convert all of the return
  // instructions into unconditional branches.
  BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
  IRBuilder<> B(NewRetBlock);

  if (InsertExport) {
    // Ensure that there's only one "done" export in the shader by removing the
    // "done" bit set on the original final export. More than one "done" export
    // can lead to undefined behavior.
    removeDoneExport(F);

    Value *Undef = UndefValue::get(B.getFloatTy());
    B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() },
                      {
                        B.getInt32(9), // target, SQ_EXP_NULL
                        B.getInt32(0), // enabled channels
                        Undef, Undef, Undef, Undef, // values
                        B.getTrue(), // done
                        B.getTrue(), // valid mask
                      });
  }

  PHINode *PN = nullptr;
  if (F.getReturnType()->isVoidTy()) {
    B.CreateRetVoid();
  } else {
    // If the function doesn't return void... add a PHI node to the block...
    PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(),
                     "UnifiedRetVal");
    assert(!InsertExport);
    B.CreateRet(PN);
  }

  // Loop over all of the blocks, replacing the return instruction with an
  // unconditional branch.
  for (BasicBlock *BB : ReturningBlocks) {
    // Add an incoming element to the PHI node for every return instruction
    // that is merging into this new block...
    if (PN)
      PN->addIncoming(BB->getTerminator()->getOperand(0), BB);

    // Remove and delete the return inst.
    BB->getTerminator()->eraseFromParent();
    BranchInst::Create(NewRetBlock, BB);
  }

  for (BasicBlock *BB : ReturningBlocks) {
    // Cleanup possible branch to unconditional branch to the return.
    simplifyCFG(BB, TTI, SimplifyCFGOptions().bonusInstThreshold(2));
  }

  return NewRetBlock;
}
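
// Illustrative result (value names are hypothetical): for a function returning
// float from two blocks %ret0 and %ret1, the unified block built above is
//
//   UnifiedReturnBlock:
//     %UnifiedRetVal = phi float [ %v0, %ret0 ], [ %v1, %ret1 ]
//     ret float %UnifiedRetVal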

bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
  auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();

  // If there's only one exit, we don't need to do anything, unless this is a
  // pixel shader and that exit is an infinite loop, since we still have to
  // insert an export in that case.
  if (PDT.root_size() <= 1 && F.getCallingConv() != CallingConv::AMDGPU_PS)
    return false;

  LegacyDivergenceAnalysis &DA = getAnalysis<LegacyDivergenceAnalysis>();

  // Loop over all of the blocks in a function, tracking all of the blocks that
  // return.
  SmallVector<BasicBlock *, 4> ReturningBlocks;
  SmallVector<BasicBlock *, 4> UniformlyReachedRetBlocks;
  SmallVector<BasicBlock *, 4> UnreachableBlocks;

  // Dummy return block for infinite loop.
  BasicBlock *DummyReturnBB = nullptr;

  bool InsertExport = false;

  bool Changed = false;
  for (BasicBlock *BB : PDT.roots()) {
    if (isa<ReturnInst>(BB->getTerminator())) {
      if (!isUniformlyReached(DA, *BB))
        ReturningBlocks.push_back(BB);
      else
        UniformlyReachedRetBlocks.push_back(BB);
    } else if (isa<UnreachableInst>(BB->getTerminator())) {
      if (!isUniformlyReached(DA, *BB))
        UnreachableBlocks.push_back(BB);
    } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {

      ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext());
      if (DummyReturnBB == nullptr) {
        DummyReturnBB = BasicBlock::Create(F.getContext(),
                                           "DummyReturnBlock", &F);
        Type *RetTy = F.getReturnType();
        Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);

        // For pixel shaders, the producer guarantees that an export is
        // executed before each return instruction. However, if there is an
        // infinite loop and we insert a return ourselves, we need to uphold
        // that guarantee by inserting a null export. This can happen e.g. in
        // an infinite loop with kill instructions, which is supposed to
        // terminate. However, we don't need to do this if there is a non-void
        // return value, since then there is an epilog afterwards which will
        // still export.
        //
        // Note: In the case where only some threads enter the infinite loop,
        // this can result in the null export happening redundantly after the
        // original exports. However, the last "real" export happens after all
        // the threads that didn't enter an infinite loop converged, which
        // means that the only extra threads to execute the null export are
        // threads that entered the infinite loop, and they could only have
        // exited through being killed, which sets their exec bit to 0.
        // Therefore, unless there's an actual infinite loop, which can have
        // invalid results, or there's a kill after the last export, which we
        // assume the frontend won't do, this export will have the same exec
        // mask as the last "real" export, and therefore the valid mask will be
        // overwritten with the same value and will still be correct. Also,
        // even though this forces an extra unnecessary export wait, we assume
        // that this happens rarely enough in practice that we don't have to
        // worry about performance.
        if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
            RetTy->isVoidTy()) {
          InsertExport = true;
        }

        ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
        ReturningBlocks.push_back(DummyReturnBB);
      }

      if (BI->isUnconditional()) {
        BasicBlock *LoopHeaderBB = BI->getSuccessor(0);
        BI->eraseFromParent(); // Delete the unconditional branch.
        // Add a new conditional branch with a dummy edge to the return block.
        BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB);
      } else { // Conditional branch.
        // Create a new transition block to hold the conditional branch.
        BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock");

        // Create a branch that will always branch to the transition block and
        // references DummyReturnBB.
        BB->getTerminator()->eraseFromParent();
        BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB);
      }
      Changed = true;
    }
  }
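
  // Illustrative sketch (label names follow the code above): the loop above
  // turns an infinite loop such as
  //
  //   loop:
  //     br label %loop
  //
  // into a conditional branch with a dummy edge to the new return block:
  //
  //   loop:
  //     br i1 true, label %loop, label %DummyReturnBlock
  //   DummyReturnBlock:
  //     ret void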

  if (!UnreachableBlocks.empty()) {
    BasicBlock *UnreachableBlock = nullptr;

    if (UnreachableBlocks.size() == 1) {
      UnreachableBlock = UnreachableBlocks.front();
    } else {
      UnreachableBlock = BasicBlock::Create(F.getContext(),
                                            "UnifiedUnreachableBlock", &F);
      new UnreachableInst(F.getContext(), UnreachableBlock);

      for (BasicBlock *BB : UnreachableBlocks) {
        // Remove and delete the unreachable inst.
        BB->getTerminator()->eraseFromParent();
        BranchInst::Create(UnreachableBlock, BB);
      }
      Changed = true;
    }

    if (!ReturningBlocks.empty()) {
      // Don't create a new unreachable inst if we have a return. The
      // structurizer/annotator can't handle the multiple exits.

      Type *RetTy = F.getReturnType();
      Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
      // Remove and delete the unreachable inst.
      UnreachableBlock->getTerminator()->eraseFromParent();

      Function *UnreachableIntrin =
          Intrinsic::getDeclaration(F.getParent(), Intrinsic::amdgcn_unreachable);

      // Insert a call to an intrinsic tracking that this is an unreachable
      // point, in case we want to kill the active lanes or something later.
      CallInst::Create(UnreachableIntrin, {}, "", UnreachableBlock);

      // Don't create a scalar trap. We would only want to trap if this code
      // was really reached, but a scalar trap would happen even if no lanes
      // actually reached here.
      ReturnInst::Create(F.getContext(), RetVal, UnreachableBlock);
      ReturningBlocks.push_back(UnreachableBlock);
      Changed = true;
    }
  }

  // Now handle return blocks.
  if (ReturningBlocks.empty())
    return Changed; // No blocks return

  if (ReturningBlocks.size() == 1 && !InsertExport)
    return Changed; // Already has a single return block

  const TargetTransformInfo &TTI =
      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);

  // Unify returning blocks. If we are going to insert the export it is also
  // necessary to include blocks that are uniformly reached, because in addition
  // to inserting the export the "done" bits on existing exports will be cleared
  // and we do not want to end up with the normal export in a non-unified,
  // uniformly reached block with the "done" bit cleared.
  auto BlocksToUnify = std::move(ReturningBlocks);
  if (InsertExport) {
    BlocksToUnify.insert(BlocksToUnify.end(), UniformlyReachedRetBlocks.begin(),
                         UniformlyReachedRetBlocks.end());
  }

  unifyReturnBlockSet(F, BlocksToUnify, InsertExport, TTI,
                      "UnifiedReturnBlock");
  return true;
}