//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
//
//===----------------------------------------------------------------------===//
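//
// Conceptual sketch of the rewrite (the IR is built in optimizeAtomic below):
// for a uniform value V and N active lanes, instead of every lane issuing
//
//   %old = atomicrmw add ptr %p, i32 %V
//
// a single lane performs one atomic add of V * N, the old value is broadcast
// with readfirstlane, and lane i (its index among the active lanes, computed
// with mbcnt) reconstructs the result it would have observed as %old + V * i.
// Divergent values are handled the same way, except the per-lane offsets come
// from a DPP-based exclusive scan instead of a simple multiply.
//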

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;
using namespace llvm::AMDGPU;

namespace {

struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizer : public FunctionPass,
                              public InstVisitor<AMDGPUAtomicOptimizer> {
private:
  SmallVector<ReplacementInfo, 8> ToReplace;
  const LegacyDivergenceAnalysis *DA;
  const DataLayout *DL;
  DominatorTree *DT;
  const GCNSubtarget *ST;
  bool IsPixelShader;

  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                        Value *const Identity) const;
  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                   Value *const Identity) const;
  Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;

public:
  static char ID;

  AMDGPUAtomicOptimizer() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.addRequired<TargetPassConfig>();
  }

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  DA = &getAnalysis<LegacyDivergenceAnalysis>();
  DL = &F.getParent()->getDataLayout();
  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DT = DTW ? &DTW->getDomTree() : nullptr;
  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  ST = &TM.getSubtarget<GCNSubtarget>(F);
  IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }

  ToReplace.clear();

  return Changed;
}

void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  AtomicRMWInst::BinOp Op = I.getOperation();

  switch (Op) {
  default:
    return;
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    break;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (DA->isDivergentUse(&I.getOperandUse(PtrIdx))) {
    return;
  }

  const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (DA->isDivergentUse(&I.getOperandUse(Idx))) {
      return;
    }
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

// Use the builder to create the non-atomic counterpart of the specified
// atomicrmw binary op.
static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                  Value *LHS, Value *RHS) {
  CmpInst::Predicate Pred;

  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);

  case AtomicRMWInst::Max:
    Pred = CmpInst::ICMP_SGT;
    break;
  case AtomicRMWInst::Min:
    Pred = CmpInst::ICMP_SLT;
    break;
  case AtomicRMWInst::UMax:
    Pred = CmpInst::ICMP_UGT;
    break;
  case AtomicRMWInst::UMin:
    Pred = CmpInst::ICMP_ULT;
    break;
  }
  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
  return B.CreateSelect(Cond, LHS, RHS);
}

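// For example, buildNonAtomicBinOp(B, AtomicRMWInst::UMax, X, Y) emits an
// "icmp ugt X, Y" followed by a select of X or Y, i.e. the scalar unsigned
// maximum of the two operands, matching what the corresponding atomicrmw
// would have computed at the memory location.
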
// Use the builder to create a reduction of V across the wavefront, with all
// lanes active, returning the same result in all lanes.
Value *AMDGPUAtomicOptimizer::buildReduction(IRBuilder<> &B,
                                             AtomicRMWInst::BinOp Op, Value *V,
                                             Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  // Reduce within each row of 16 lanes.
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }

  // Reduce within each pair of rows (i.e. 32 lanes).
  assert(ST->hasPermLaneX16());
  V = buildNonAtomicBinOp(
      B, Op, V,
      B.CreateIntrinsic(
          Intrinsic::amdgcn_permlanex16, {},
          {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}));

  if (ST->isWave32())
    return V;

  if (ST->hasPermLane64()) {
    // Reduce across the upper and lower 32 lanes.
    return buildNonAtomicBinOp(
        B, Op, V, B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V));
  }

  // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
  // combine them with a scalar operation.
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
  Value *const Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *const Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
}

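// Illustrative shape of the reduction above for a wave64 add: the four
// row_xmask butterfly steps (XOR offsets 1, 2, 4, 8) leave every lane holding
// the sum of its 16-lane row, permlanex16 combines the two rows of each
// 32-lane half, and the final permlane64 (or the readlane of lanes 0 and 32)
// combines the two halves, so all lanes end up with the full 64-lane sum.
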
// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                        Value *V, Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
  if (ST->hasDPPBroadcasts()) {
    // GFX9 has DPP row broadcast operations.
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
                      B.getInt32(0xf), B.getFalse()}));
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()}));
  } else {
    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.

    // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
    // 48..63).
    assert(ST->hasPermLaneX16());
    Value *const PermX = B.CreateIntrinsic(
        Intrinsic::amdgcn_permlanex16, {},
        {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
                      B.getInt32(0xa), B.getInt32(0xf), B.getFalse()}));
    if (!ST->isWave32()) {
      // Combine lane 31 into lanes 32..63.
      Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                                              {V, B.getInt32(31)});
      V = buildNonAtomicBinOp(
          B, Op, V,
          B.CreateCall(UpdateDPP,
                       {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                        B.getInt32(0xc), B.getInt32(0xf), B.getFalse()}));
    }
  }
  return V;
}

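// Illustrative trace of the scan above for an add with every lane's input
// equal to 1: after the row_shr steps (shifts of 1, 2, 4 and 8 lanes within
// each 16-lane row, filling with the identity) lane i of a row holds i + 1,
// i.e. the inclusive prefix sum of its row; the broadcast (or
// permlanex16/readlane) steps then add the running total of the preceding
// rows, so lane i of the wave ends up holding i + 1.
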
// Use the builder to create a shift right of V across the wavefront, with all
// lanes active, to turn an inclusive scan into an exclusive scan.
Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V,
                                              Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);

  if (ST->hasDPPWavefrontShifts()) {
    // GFX9 has DPP wavefront shift operations.
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
  } else {
    Function *ReadLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
    Function *WriteLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});

    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.
    Value *Old = V;
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

    // Copy the old lane 15 to the new lane 16.
    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
                                 B.getInt32(16), V});

    if (!ST->isWave32()) {
      // Copy the old lane 31 to the new lane 32.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});

      // Copy the old lane 47 to the new lane 48.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
    }
  }

  return V;
}

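// For example, if the inclusive scan of per-lane inputs {a, b, c, d, ...} is
// {a, a+b, a+b+c, a+b+c+d, ...}, shifting it right by one lane and filling
// lane 0 with the identity yields the exclusive scan
// {identity, a, a+b, a+b+c, ...}, which is exactly each lane's offset into
// the combined atomic result.
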
static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
                                         unsigned BitWidth) {
  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::UMax:
    return APInt::getMinValue(BitWidth);
  case AtomicRMWInst::And:
  case AtomicRMWInst::UMin:
    return APInt::getMaxValue(BitWidth);
  case AtomicRMWInst::Max:
    return APInt::getSignedMinValue(BitWidth);
  case AtomicRMWInst::Min:
    return APInt::getSignedMaxValue(BitWidth);
  }
}

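// Sanity check on the identities above: x + 0 == x, x - 0 == x, x | 0 == x,
// x ^ 0 == x, umax(x, 0) == x, x & ~0 == x, umin(x, UINT_MAX) == x,
// smax(x, INT_MIN) == x and smin(x, INT_MAX) == x, so lanes that are inactive
// (or padded with the identity) cannot perturb the scan or reduction.
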
static Value *buildMul(IRBuilder<> &B, Value *LHS, Value *RHS) {
  const ConstantInt *CI = dyn_cast<ConstantInt>(LHS);
  return (CI && CI->isOne()) ? RHS : B.CreateMul(LHS, RHS);
}

void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
                                           AtomicRMWInst::BinOp Op,
                                           unsigned ValIdx,
                                           bool ValDivergent) const {
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  // If we are in a pixel shader, because of how we have to mask out helper
  // lane invocations, we need to record the entry and exit BB's.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane
  // is live to do this.
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
  auto *const VecTy = FixedVectorType::get(B.getInt32Ty(), 2);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *const V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
  CallInst *const Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
  Value *Mbcnt;
  if (ST->isWave32()) {
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {Ballot, B.getInt32(0)});
  } else {
    Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
    Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
    Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {ExtractLo, B.getInt32(0)});
    Mbcnt =
        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
  }
  Mbcnt = B.CreateIntCast(Mbcnt, Ty, false);

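  // Worked example (illustrative): in a wave32 where the ballot comes back as
  // 0b...01011, the active lanes are 0, 1 and 3, and mbcnt gives them the
  // values 0, 1 and 2 respectively, i.e. how many active lanes sit below each
  // one.
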
  Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));

  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  const bool NeedResult = !I.use_empty();

  // If we have a divergent value in each lane, we need to combine the value
  // using DPP.
  if (ValDivergent) {
    // First we need to set all inactive invocations to the identity value, so
    // that they can correctly contribute to the final result.
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});

    const AtomicRMWInst::BinOp ScanOp =
        Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
    if (!NeedResult && ST->hasPermLaneX16()) {
      // On GFX10 the permlanex16 instruction helps us build a reduction
      // without too many readlanes and writelanes, which are generally bad
      // for performance.
      NewV = buildReduction(B, ScanOp, NewV, Identity);
    } else {
      NewV = buildScan(B, ScanOp, NewV, Identity);
      if (NeedResult)
        ExclScan = buildShiftRight(B, NewV, Identity);

      // Read the value from the last lane, which has accumulated the values
      // of each active lane in the wavefront. This will be our new value
      // which we will provide to the atomic operation.
      Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
      assert(TyBitWidth == 32);
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                               {NewV, LastLaneIdx});
    }

    // Finally mark the readlanes in the WWM section.
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
  } else {
    switch (Op) {
    default:
      llvm_unreachable("Unhandled atomic op");

    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      // The new value we will be contributing to the atomic operation is the
      // old value times the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, Ctpop);
      break;
    }

    case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
      // These operations with a uniform value are idempotent: doing the atomic
      // operation multiple times has the same effect as doing it once.
      NewV = V;
      break;

    case AtomicRMWInst::Xor:
      // The new value we will be contributing to the atomic operation is the
      // old value times the parity of the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
      break;
    }
  }

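  // At this point NewV holds the single value that one lane will feed to the
  // atomic. For example (illustrative), with five active lanes and a uniform
  // V: add/sub contribute 5 * V, xor contributes (5 & 1) * V == V, and the
  // idempotent operations (and/or/min/max) contribute V unchanged.
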
  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below us, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));

  // Store I's original basic block before we split the block.
  BasicBlock *const EntryBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  // entry --> single_lane -\
  //       \------------------> exit
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into single lane, replacing the
  // original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);

  if (NeedResult) {
    // Create a PHI node to get our new atomic result into the exit block.
    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(PoisonValue::get(Ty), EntryBB);
    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

    // We need to broadcast the value who was the lowest active lane (the
    // first lane) to all other lanes in the wavefront. We use an intrinsic
    // for this, but have to handle 64-bit broadcasts with two calls to this
    // intrinsic.
    Value *BroadcastI = nullptr;

    if (TyBitWidth == 64) {
      Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
      Value *const ExtractHi =
          B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty());
      CallInst *const ReadFirstLaneLo =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
      CallInst *const ReadFirstLaneHi =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
      Value *const PartialInsert = B.CreateInsertElement(
          PoisonValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
      Value *const Insert =
          B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
      BroadcastI = B.CreateBitCast(Insert, Ty);
    } else if (TyBitWidth == 32) {
      BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
    } else {
      llvm_unreachable("Unhandled atomic bit width");
    }

    // Now that we have the result of our single atomic operation, we need to
    // get our individual lane's slice into the result. We use the lane offset
    // we previously calculated combined with the atomic result value we got
    // from the first lane, to get our lane's index into the atomic result.
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      LaneOffset =
          B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
    } else {
      switch (Op) {
      default:
        llvm_unreachable("Unhandled atomic op");
      case AtomicRMWInst::Add:
      case AtomicRMWInst::Sub:
        LaneOffset = buildMul(B, V, Mbcnt);
        break;
      case AtomicRMWInst::And:
      case AtomicRMWInst::Or:
      case AtomicRMWInst::Max:
      case AtomicRMWInst::Min:
      case AtomicRMWInst::UMax:
      case AtomicRMWInst::UMin:
        LaneOffset = B.CreateSelect(Cond, Identity, V);
        break;
      case AtomicRMWInst::Xor:
        LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
        break;
      }
    }
    Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
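    // For example (illustrative), four active lanes performing an atomic add
    // with divergent values {1, 2, 3, 4}: the single clone adds their total
    // of 10, BroadcastI is the memory's prior contents, and the
    // exclusive-scan offsets {0, 1, 3, 6} give each lane the same result it
    // would have seen had the lanes executed their atomics one after another
    // in lane order.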
687 
688  if (IsPixelShader) {
689  // Need a final PHI to reconverge to above the helper lane branch mask.
690  B.SetInsertPoint(PixelExitBB->getFirstNonPHI());
691 
692  PHINode *const PHI = B.CreatePHI(Ty, 2);
693  PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB);
694  PHI->addIncoming(Result, I.getParent());
695  I.replaceAllUsesWith(PHI);
696  } else {
697  // Replace the original atomic instruction with the new one.
698  I.replaceAllUsesWith(Result);
699  }
700  }
701 
702  // And delete the original.
703  I.eraseFromParent();
704 }
705 
INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass() {
  return new AMDGPUAtomicOptimizer();
}