doxygen/AMDGPUAtomicOptimizer_8cpp_source.html

//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

//

/// \file

/// This pass optimizes atomic operations by using a single lane of a wavefront

/// to perform the atomic operation, thus reducing contention on that memory

/// location.

/// Atomic optimizer uses following strategies to compute scan and reduced

/// values

/// 1. DPP -

///   This is the most efficient implementation for scan. DPP uses Whole Wave

///   Mode (WWM)

/// 2. Iterative -

//    An alternative implementation iterates over all active lanes

///   of Wavefront using llvm.cttz and performs scan  using readlane & writelane

///   intrinsics

//===----------------------------------------------------------------------===//


#include "AMDGPU.h"

#include "GCNSubtarget.h"

#include "llvm/Analysis/DomTreeUpdater.h"

#include "llvm/Analysis/UniformityAnalysis.h"

#include "llvm/CodeGen/TargetPassConfig.h"

#include "llvm/IR/IRBuilder.h"

#include "llvm/IR/InstVisitor.h"

#include "llvm/IR/IntrinsicsAMDGPU.h"

#include "llvm/InitializePasses.h"

#include "llvm/Target/TargetMachine.h"

#include "llvm/Transforms/Utils/BasicBlockUtils.h"


#define DEBUG_TYPE "amdgpu-atomic-optimizer"


using namespace llvm;

using namespace llvm::AMDGPU;


namespace {


struct ReplacementInfo {

  Instruction *I;

  AtomicRMWInst::BinOp Op;

  unsigned ValIdx;

  bool ValDivergent;

};


class AMDGPUAtomicOptimizer : public FunctionPass {

public:

  static char ID;

  ScanOptions ScanImpl;

  AMDGPUAtomicOptimizer(ScanOptions ScanImpl)

      : FunctionPass(ID), ScanImpl(ScanImpl) {}


  bool runOnFunction(Function &F) override;


  void getAnalysisUsage(AnalysisUsage &AU) const override {

    AU.addPreserved<DominatorTreeWrapperPass>();

    AU.addRequired<UniformityInfoWrapperPass>();

    AU.addRequired<TargetPassConfig>();

  }

};


class AMDGPUAtomicOptimizerImpl

    : public InstVisitor<AMDGPUAtomicOptimizerImpl> {

private:

  SmallVector<ReplacementInfo, 8> ToReplace;

  const UniformityInfo *UA;

  const DataLayout *DL;

  DomTreeUpdater &DTU;

  const GCNSubtarget *ST;

  bool IsPixelShader;

  ScanOptions ScanImpl;


  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,

                        Value *const Identity) const;

  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,

                   Value *const Identity) const;

  Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;


  std::pair<Value *, Value *>

  buildScanIteratively(IRBuilder<> &B, AtomicRMWInst::BinOp Op,

                       Value *const Identity, Value *V, Instruction &I,

                       BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const;


  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,

                      bool ValDivergent) const;


public:

  AMDGPUAtomicOptimizerImpl() = delete;


  AMDGPUAtomicOptimizerImpl(const UniformityInfo *UA, const DataLayout *DL,

                            DomTreeUpdater &DTU, const GCNSubtarget *ST,

                            bool IsPixelShader, ScanOptions ScanImpl)

      : UA(UA), DL(DL), DTU(DTU), ST(ST), IsPixelShader(IsPixelShader),

        ScanImpl(ScanImpl) {}


  bool run(Function &F);


  void visitAtomicRMWInst(AtomicRMWInst &I);

  void visitIntrinsicInst(IntrinsicInst &I);

};


} // namespace


char AMDGPUAtomicOptimizer::ID = 0;


char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;


bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {

  if (skipFunction(F)) {

    return false;

  }


  const UniformityInfo *UA =

      &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();

  const DataLayout *DL = &F.getParent()->getDataLayout();


  DominatorTreeWrapperPass *const DTW =

      getAnalysisIfAvailable<DominatorTreeWrapperPass>();

  DomTreeUpdater DTU(DTW ? &DTW->getDomTree() : nullptr,

                     DomTreeUpdater::UpdateStrategy::Lazy);


  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();

  const TargetMachine &TM = TPC.getTM<TargetMachine>();

  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);


  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;


  return AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)

      .run(F);

}


PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,

                                                 FunctionAnalysisManager &AM) {


  const auto *UA = &AM.getResult<UniformityInfoAnalysis>(F);

  const DataLayout *DL = &F.getParent()->getDataLayout();


  DomTreeUpdater DTU(&AM.getResult<DominatorTreeAnalysis>(F),

                     DomTreeUpdater::UpdateStrategy::Lazy);

  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);


  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;


  bool IsChanged =

      AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)

          .run(F);


  if (!IsChanged) {

    return PreservedAnalyses::all();

  }


  PreservedAnalyses PA;

  PA.preserve<DominatorTreeAnalysis>();

  return PA;

}


bool AMDGPUAtomicOptimizerImpl::run(Function &F) {


  // Scan option None disables the Pass

  if (ScanImpl == ScanOptions::None) {

    return false;

  }


  visit(F);


  const bool Changed = !ToReplace.empty();


  for (ReplacementInfo &Info : ToReplace) {

    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);

  }


  ToReplace.clear();


  return Changed;

}


void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {

  // Early exit for unhandled address space atomic instructions.

  switch (I.getPointerAddressSpace()) {

  default:

    return;

  case AMDGPUAS::GLOBAL_ADDRESS:

  case AMDGPUAS::LOCAL_ADDRESS:

    break;

  }


  AtomicRMWInst::BinOp Op = I.getOperation();


  switch (Op) {

  default:

    return;

  case AtomicRMWInst::Add:

  case AtomicRMWInst::Sub:

  case AtomicRMWInst::And:

  case AtomicRMWInst::Or:

  case AtomicRMWInst::Xor:

  case AtomicRMWInst::Max:

  case AtomicRMWInst::Min:

  case AtomicRMWInst::UMax:

  case AtomicRMWInst::UMin:

  case AtomicRMWInst::FAdd:

  case AtomicRMWInst::FSub:

  case AtomicRMWInst::FMax:

  case AtomicRMWInst::FMin:

    break;

  }


  // Only 32 and 64 bit floating point atomic ops are supported.

  if (AtomicRMWInst::isFPOperation(Op) &&

      !(I.getType()->isFloatTy() || I.getType()->isDoubleTy())) {

    return;

  }


  const unsigned PtrIdx = 0;

  const unsigned ValIdx = 1;


  // If the pointer operand is divergent, then each lane is doing an atomic

  // operation on a different address, and we cannot optimize that.

  if (UA->isDivergentUse(I.getOperandUse(PtrIdx))) {

    return;

  }


  const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));


  // If the value operand is divergent, each lane is contributing a different

  // value to the atomic calculation. We can only optimize divergent values if

  // we have DPP available on our subtarget, and the atomic operation is 32

  // bits.

  if (ValDivergent &&

      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {

    return;

  }


  // If we get here, we can optimize the atomic using a single wavefront-wide

  // atomic operation to do the calculation for the entire wavefront, so

  // remember the instruction so we can come back to it.

  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};


  ToReplace.push_back(Info);

}


void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {

  AtomicRMWInst::BinOp Op;


  switch (I.getIntrinsicID()) {

  default:

    return;

  case Intrinsic::amdgcn_buffer_atomic_add:

  case Intrinsic::amdgcn_struct_buffer_atomic_add:

  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:

  case Intrinsic::amdgcn_raw_buffer_atomic_add:

  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:

    Op = AtomicRMWInst::Add;

    break;

  case Intrinsic::amdgcn_buffer_atomic_sub:

  case Intrinsic::amdgcn_struct_buffer_atomic_sub:

  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:

  case Intrinsic::amdgcn_raw_buffer_atomic_sub:

  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:

    Op = AtomicRMWInst::Sub;

    break;

  case Intrinsic::amdgcn_buffer_atomic_and:

  case Intrinsic::amdgcn_struct_buffer_atomic_and:

  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:

  case Intrinsic::amdgcn_raw_buffer_atomic_and:

  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:

    Op = AtomicRMWInst::And;

    break;

  case Intrinsic::amdgcn_buffer_atomic_or:

  case Intrinsic::amdgcn_struct_buffer_atomic_or:

  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:

  case Intrinsic::amdgcn_raw_buffer_atomic_or:

  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:

    Op = AtomicRMWInst::Or;

    break;

  case Intrinsic::amdgcn_buffer_atomic_xor:

  case Intrinsic::amdgcn_struct_buffer_atomic_xor:

  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:

  case Intrinsic::amdgcn_raw_buffer_atomic_xor:

  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:

    Op = AtomicRMWInst::Xor;

    break;

  case Intrinsic::amdgcn_buffer_atomic_smin:

  case Intrinsic::amdgcn_struct_buffer_atomic_smin:

  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:

  case Intrinsic::amdgcn_raw_buffer_atomic_smin:

  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:

    Op = AtomicRMWInst::Min;

    break;

  case Intrinsic::amdgcn_buffer_atomic_umin:

  case Intrinsic::amdgcn_struct_buffer_atomic_umin:

  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:

  case Intrinsic::amdgcn_raw_buffer_atomic_umin:

  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:

    Op = AtomicRMWInst::UMin;

    break;

  case Intrinsic::amdgcn_buffer_atomic_smax:

  case Intrinsic::amdgcn_struct_buffer_atomic_smax:

  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:

  case Intrinsic::amdgcn_raw_buffer_atomic_smax:

  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:

    Op = AtomicRMWInst::Max;

    break;

  case Intrinsic::amdgcn_buffer_atomic_umax:

  case Intrinsic::amdgcn_struct_buffer_atomic_umax:

  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:

  case Intrinsic::amdgcn_raw_buffer_atomic_umax:

  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:

    Op = AtomicRMWInst::UMax;

    break;

  }


  const unsigned ValIdx = 0;


  const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));


  // If the value operand is divergent, each lane is contributing a different

  // value to the atomic calculation. We can only optimize divergent values if

  // we have DPP available on our subtarget, and the atomic operation is 32

  // bits.

  if (ValDivergent &&

      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {

    return;

  }


  // If any of the other arguments to the intrinsic are divergent, we can't

  // optimize the operation.

  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {

    if (UA->isDivergentUse(I.getOperandUse(Idx))) {

      return;

    }

  }


  // If we get here, we can optimize the atomic using a single wavefront-wide

  // atomic operation to do the calculation for the entire wavefront, so

  // remember the instruction so we can come back to it.

  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};


  ToReplace.push_back(Info);

}


// Use the builder to create the non-atomic counterpart of the specified

// atomicrmw binary op.

static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,

                                  Value *LHS, Value *RHS) {

  CmpInst::Predicate Pred;


  switch (Op) {

  default:

    llvm_unreachable("Unhandled atomic op");

  case AtomicRMWInst::Add:

    return B.CreateBinOp(Instruction::Add, LHS, RHS);

  case AtomicRMWInst::FAdd:

    return B.CreateFAdd(LHS, RHS);

  case AtomicRMWInst::Sub:

    return B.CreateBinOp(Instruction::Sub, LHS, RHS);

  case AtomicRMWInst::FSub:

    return B.CreateFSub(LHS, RHS);

  case AtomicRMWInst::And:

    return B.CreateBinOp(Instruction::And, LHS, RHS);

  case AtomicRMWInst::Or:

    return B.CreateBinOp(Instruction::Or, LHS, RHS);

  case AtomicRMWInst::Xor:

    return B.CreateBinOp(Instruction::Xor, LHS, RHS);


  case AtomicRMWInst::Max:

    Pred = CmpInst::ICMP_SGT;

    break;

  case AtomicRMWInst::Min:

    Pred = CmpInst::ICMP_SLT;

    break;

  case AtomicRMWInst::UMax:

    Pred = CmpInst::ICMP_UGT;

    break;

  case AtomicRMWInst::UMin:

    Pred = CmpInst::ICMP_ULT;

    break;

  case AtomicRMWInst::FMax:

    return B.CreateMaxNum(LHS, RHS);

  case AtomicRMWInst::FMin:

    return B.CreateMinNum(LHS, RHS);

  }

  Value *Cond = B.CreateICmp(Pred, LHS, RHS);

  return B.CreateSelect(Cond, LHS, RHS);

}


// Use the builder to create a reduction of V across the wavefront, with all

// lanes active, returning the same result in all lanes.

Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,

                                                 AtomicRMWInst::BinOp Op,

                                                 Value *V,

                                                 Value *const Identity) const {

  Type *AtomicTy = V->getType();

  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());

  Module *M = B.GetInsertBlock()->getModule();

  Function *UpdateDPP =

      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);


  // Reduce within each row of 16 lanes.

  for (unsigned Idx = 0; Idx < 4; Idx++) {

    V = buildNonAtomicBinOp(

        B, Op, V,

        B.CreateCall(UpdateDPP,

                     {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),

                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));

  }


  // Reduce within each pair of rows (i.e. 32 lanes).

  assert(ST->hasPermLaneX16());

  V = B.CreateBitCast(V, IntNTy);

  Value *Permlanex16Call = B.CreateIntrinsic(

      Intrinsic::amdgcn_permlanex16, {},

      {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});

  V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),

                          B.CreateBitCast(Permlanex16Call, AtomicTy));

  if (ST->isWave32()) {

    return V;

  }


  if (ST->hasPermLane64()) {

    // Reduce across the upper and lower 32 lanes.

    V = B.CreateBitCast(V, IntNTy);

    Value *Permlane64Call =

        B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V);

    return buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),

                               B.CreateBitCast(Permlane64Call, AtomicTy));

  }


  // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and

  // combine them with a scalar operation.

  Function *ReadLane =

      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});

  V = B.CreateBitCast(V, IntNTy);

  Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});

  Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});

  return buildNonAtomicBinOp(B, Op, B.CreateBitCast(Lane0, AtomicTy),

                             B.CreateBitCast(Lane32, AtomicTy));

}


// Use the builder to create an inclusive scan of V across the wavefront, with

// all lanes active.

Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,

                                            AtomicRMWInst::BinOp Op, Value *V,

                                            Value *Identity) const {

  Type *AtomicTy = V->getType();

  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());


  Module *M = B.GetInsertBlock()->getModule();

  Function *UpdateDPP =

      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);


  for (unsigned Idx = 0; Idx < 4; Idx++) {

    V = buildNonAtomicBinOp(

        B, Op, V,

        B.CreateCall(UpdateDPP,

                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),

                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));

  }

  if (ST->hasDPPBroadcasts()) {

    // GFX9 has DPP row broadcast operations.

    V = buildNonAtomicBinOp(

        B, Op, V,

        B.CreateCall(UpdateDPP,

                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),

                      B.getInt32(0xf), B.getFalse()}));

    V = buildNonAtomicBinOp(

        B, Op, V,

        B.CreateCall(UpdateDPP,

                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),

                      B.getInt32(0xf), B.getFalse()}));

  } else {

    // On GFX10 all DPP operations are confined to a single row. To get cross-

    // row operations we have to use permlane or readlane.


    // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes

    // 48..63).

    assert(ST->hasPermLaneX16());

    V = B.CreateBitCast(V, IntNTy);

    Value *PermX = B.CreateIntrinsic(

        Intrinsic::amdgcn_permlanex16, {},

        {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});


    Value *UpdateDPPCall =

        B.CreateCall(UpdateDPP, {Identity, B.CreateBitCast(PermX, AtomicTy),

                                 B.getInt32(DPP::QUAD_PERM_ID), B.getInt32(0xa),

                                 B.getInt32(0xf), B.getFalse()});

    V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), UpdateDPPCall);


    if (!ST->isWave32()) {

      // Combine lane 31 into lanes 32..63.

      V = B.CreateBitCast(V, IntNTy);

      Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},

                                              {V, B.getInt32(31)});


      Value *UpdateDPPCall = B.CreateCall(

          UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),

                      B.getInt32(0xc), B.getInt32(0xf), B.getFalse()});


      V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),

                              UpdateDPPCall);

    }

  }

  return V;

}


// Use the builder to create a shift right of V across the wavefront, with all

// lanes active, to turn an inclusive scan into an exclusive scan.

Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,

                                                  Value *Identity) const {

  Type *AtomicTy = V->getType();

  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());


  Module *M = B.GetInsertBlock()->getModule();

  Function *UpdateDPP =

      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);

  if (ST->hasDPPWavefrontShifts()) {

    // GFX9 has DPP wavefront shift operations.

    V = B.CreateCall(UpdateDPP,

                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),

                      B.getInt32(0xf), B.getFalse()});

  } else {

    Function *ReadLane =

        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});

    Function *WriteLane =

        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});


    // On GFX10 all DPP operations are confined to a single row. To get cross-

    // row operations we have to use permlane or readlane.

    Value *Old = V;

    V = B.CreateCall(UpdateDPP,

                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),

                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});


    // Copy the old lane 15 to the new lane 16.

    V = B.CreateCall(

        WriteLane,

        {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy), B.getInt32(15)}),

         B.getInt32(16), B.CreateBitCast(V, IntNTy)});

    V = B.CreateBitCast(V, AtomicTy);

    if (!ST->isWave32()) {

      // Copy the old lane 31 to the new lane 32.

      V = B.CreateBitCast(V, IntNTy);

      V = B.CreateCall(WriteLane,

                       {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy),

                                                B.getInt32(31)}),

                        B.getInt32(32), V});


      // Copy the old lane 47 to the new lane 48.

      V = B.CreateCall(

          WriteLane,

          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});

      V = B.CreateBitCast(V, AtomicTy);

    }

  }


  return V;

}


// Use the builder to create an exclusive scan and compute the final reduced

// value using an iterative approach. This provides an alternative

// implementation to DPP which uses WMM for scan computations. This API iterate

// over active lanes to read, compute and update the value using

// readlane and writelane intrinsics.

std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(

    IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V,

    Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const {

  auto *Ty = I.getType();

  auto *WaveTy = B.getIntNTy(ST->getWavefrontSize());

  auto *EntryBB = I.getParent();

  auto NeedResult = !I.use_empty();


  auto *Ballot =

      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());


  // Start inserting instructions for ComputeLoop block

  B.SetInsertPoint(ComputeLoop);

  // Phi nodes for Accumulator, Scan results destination, and Active Lanes

  auto *Accumulator = B.CreatePHI(Ty, 2, "Accumulator");

  Accumulator->addIncoming(Identity, EntryBB);

  PHINode *OldValuePhi = nullptr;

  if (NeedResult) {

    OldValuePhi = B.CreatePHI(Ty, 2, "OldValuePhi");

    OldValuePhi->addIncoming(PoisonValue::get(Ty), EntryBB);

  }

  auto *ActiveBits = B.CreatePHI(WaveTy, 2, "ActiveBits");

  ActiveBits->addIncoming(Ballot, EntryBB);


  // Use llvm.cttz instrinsic to find the lowest remaining active lane.

  auto *FF1 =

      B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});


  Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());

  auto *LaneIdxInt = B.CreateTrunc(FF1, IntNTy);


  // Get the value required for atomic operation

  V = B.CreateBitCast(V, IntNTy);

  Value *LaneValue =

      B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {V, LaneIdxInt});

  LaneValue = B.CreateBitCast(LaneValue, Ty);


  // Perform writelane if intermediate scan results are required later in the

  // kernel computations

  Value *OldValue = nullptr;

  if (NeedResult) {

    OldValue =

        B.CreateIntrinsic(Intrinsic::amdgcn_writelane, {},

                          {B.CreateBitCast(Accumulator, IntNTy), LaneIdxInt,

                           B.CreateBitCast(OldValuePhi, IntNTy)});

    OldValue = B.CreateBitCast(OldValue, Ty);

    OldValuePhi->addIncoming(OldValue, ComputeLoop);

  }


  // Accumulate the results

  auto *NewAccumulator = buildNonAtomicBinOp(B, Op, Accumulator, LaneValue);

  Accumulator->addIncoming(NewAccumulator, ComputeLoop);


  // Set bit to zero of current active lane so that for next iteration llvm.cttz

  // return the next active lane

  auto *Mask = B.CreateShl(ConstantInt::get(WaveTy, 1), FF1);


  auto *InverseMask = B.CreateXor(Mask, ConstantInt::get(WaveTy, -1));

  auto *NewActiveBits = B.CreateAnd(ActiveBits, InverseMask);

  ActiveBits->addIncoming(NewActiveBits, ComputeLoop);


  // Branch out of the loop when all lanes are processed.

  auto *IsEnd = B.CreateICmpEQ(NewActiveBits, ConstantInt::get(WaveTy, 0));

  B.CreateCondBr(IsEnd, ComputeEnd, ComputeLoop);


  B.SetInsertPoint(ComputeEnd);


  return {OldValue, NewAccumulator};

}


static Constant *getIdentityValueForAtomicOp(Type *const Ty,

                                             AtomicRMWInst::BinOp Op) {

  LLVMContext &C = Ty->getContext();

  const unsigned BitWidth = Ty->getPrimitiveSizeInBits();

  switch (Op) {

  default:

    llvm_unreachable("Unhandled atomic op");

  case AtomicRMWInst::Add:

  case AtomicRMWInst::Sub:

  case AtomicRMWInst::Or:

  case AtomicRMWInst::Xor:

  case AtomicRMWInst::UMax:

    return ConstantInt::get(C, APInt::getMinValue(BitWidth));

  case AtomicRMWInst::And:

  case AtomicRMWInst::UMin:

    return ConstantInt::get(C, APInt::getMaxValue(BitWidth));

  case AtomicRMWInst::Max:

    return ConstantInt::get(C, APInt::getSignedMinValue(BitWidth));

  case AtomicRMWInst::Min:

    return ConstantInt::get(C, APInt::getSignedMaxValue(BitWidth));

  case AtomicRMWInst::FAdd:

    return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), true));

  case AtomicRMWInst::FSub:

    return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), false));

  case AtomicRMWInst::FMin:

    return ConstantFP::get(C, APFloat::getInf(Ty->getFltSemantics(), false));

  case AtomicRMWInst::FMax:

    return ConstantFP::get(C, APFloat::getInf(Ty->getFltSemantics(), true));

  }

}


static Value *buildMul(IRBuilder<> &B, Value *LHS, Value *RHS) {

  const ConstantInt *CI = dyn_cast<ConstantInt>(LHS);

  return (CI && CI->isOne()) ? RHS : B.CreateMul(LHS, RHS);

}


void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,

                                               AtomicRMWInst::BinOp Op,

                                               unsigned ValIdx,

                                               bool ValDivergent) const {

  // Start building just before the instruction.

  IRBuilder<> B(&I);


  if (AtomicRMWInst::isFPOperation(Op)) {

    B.setIsFPConstrained(I.getFunction()->hasFnAttribute(Attribute::StrictFP));

  }


  // If we are in a pixel shader, because of how we have to mask out helper

  // lane invocations, we need to record the entry and exit BB's.

  BasicBlock *PixelEntryBB = nullptr;

  BasicBlock *PixelExitBB = nullptr;


  // If we're optimizing an atomic within a pixel shader, we need to wrap the

  // entire atomic operation in a helper-lane check. We do not want any helper

  // lanes that are around only for the purposes of derivatives to take part

  // in any cross-lane communication, and we use a branch on whether the lane is

  // live to do this.

  if (IsPixelShader) {

    // Record I's original position as the entry block.

    PixelEntryBB = I.getParent();


    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});

    Instruction *const NonHelperTerminator =

        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);


    // Record I's new position as the exit block.

    PixelExitBB = I.getParent();


    I.moveBefore(NonHelperTerminator);

    B.SetInsertPoint(&I);

  }


  Type *const Ty = I.getType();

  Type *Int32Ty = B.getInt32Ty();

  Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());

  bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();

  const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);

  auto *const VecTy = FixedVectorType::get(Int32Ty, 2);


  // This is the value in the atomic operation we need to combine in order to

  // reduce the number of atomic operations.

  Value *V = I.getOperand(ValIdx);


  // We need to know how many lanes are active within the wavefront, and we do

  // this by doing a ballot of active lanes.

  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());

  CallInst *const Ballot =

      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());


  // We need to know how many lanes are active within the wavefront that are

  // below us. If we counted each lane linearly starting from 0, a lane is

  // below us only if its associated index was less than ours. We do this by

  // using the mbcnt intrinsic.

  Value *Mbcnt;

  if (ST->isWave32()) {

    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},

                              {Ballot, B.getInt32(0)});

  } else {

    Value *const ExtractLo = B.CreateTrunc(Ballot, Int32Ty);

    Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(Ballot, 32), Int32Ty);

    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},

                              {ExtractLo, B.getInt32(0)});

    Mbcnt =

        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});

  }


  Function *F = I.getFunction();

  LLVMContext &C = F->getContext();


  // For atomic sub, perform scan with add operation and allow one lane to

  // subtract the reduced value later.

  AtomicRMWInst::BinOp ScanOp = Op;

  if (Op == AtomicRMWInst::Sub) {

    ScanOp = AtomicRMWInst::Add;

  } else if (Op == AtomicRMWInst::FSub) {

    ScanOp = AtomicRMWInst::FAdd;

  }

  Value *Identity = getIdentityValueForAtomicOp(Ty, ScanOp);


  Value *ExclScan = nullptr;

  Value *NewV = nullptr;


  const bool NeedResult = !I.use_empty();


  BasicBlock *ComputeLoop = nullptr;

  BasicBlock *ComputeEnd = nullptr;

  // If we have a divergent value in each lane, we need to combine the value

  // using DPP.

  if (ValDivergent) {

    if (ScanImpl == ScanOptions::DPP) {

      // First we need to set all inactive invocations to the identity value, so

      // that they can correctly contribute to the final result.

      V = B.CreateBitCast(V, IntNTy);

      Identity = B.CreateBitCast(Identity, IntNTy);

      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, IntNTy,

                               {V, Identity});

      NewV = B.CreateBitCast(NewV, Ty);

      V = B.CreateBitCast(V, Ty);

      Identity = B.CreateBitCast(Identity, Ty);

      if (!NeedResult && ST->hasPermLaneX16()) {

        // On GFX10 the permlanex16 instruction helps us build a reduction

        // without too many readlanes and writelanes, which are generally bad

        // for performance.

        NewV = buildReduction(B, ScanOp, NewV, Identity);

      } else {

        NewV = buildScan(B, ScanOp, NewV, Identity);

        if (NeedResult)

          ExclScan = buildShiftRight(B, NewV, Identity);

        // Read the value from the last lane, which has accumulated the values

        // of each active lane in the wavefront. This will be our new value

        // which we will provide to the atomic operation.

        Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);

        assert(TyBitWidth == 32);

        NewV = B.CreateBitCast(NewV, IntNTy);

        NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},

                                 {NewV, LastLaneIdx});

        NewV = B.CreateBitCast(NewV, Ty);

      }

      // Finally mark the readlanes in the WWM section.

      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);

    } else if (ScanImpl == ScanOptions::Iterative) {

      // Alternative implementation for scan

      ComputeLoop = BasicBlock::Create(C, "ComputeLoop", F);

      ComputeEnd = BasicBlock::Create(C, "ComputeEnd", F);

      std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, V, I,

                                                      ComputeLoop, ComputeEnd);

    } else {

      llvm_unreachable("Atomic Optimzer is disabled for None strategy");

    }

  } else {

    switch (Op) {

    default:

      llvm_unreachable("Unhandled atomic op");


    case AtomicRMWInst::Add:

    case AtomicRMWInst::Sub: {

      // The new value we will be contributing to the atomic operation is the

      // old value times the number of active lanes.

      Value *const Ctpop = B.CreateIntCast(

          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);

      NewV = buildMul(B, V, Ctpop);

      break;

    }

    case AtomicRMWInst::FAdd:

    case AtomicRMWInst::FSub: {

      Value *const Ctpop = B.CreateIntCast(

          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Int32Ty, false);

      Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty);

      NewV = B.CreateFMul(V, CtpopFP);

      break;

    }

    case AtomicRMWInst::And:

    case AtomicRMWInst::Or:

    case AtomicRMWInst::Max:

    case AtomicRMWInst::Min:

    case AtomicRMWInst::UMax:

    case AtomicRMWInst::UMin:

    case AtomicRMWInst::FMin:

    case AtomicRMWInst::FMax:

      // These operations with a uniform value are idempotent: doing the atomic

      // operation multiple times has the same effect as doing it once.

      NewV = V;

      break;


    case AtomicRMWInst::Xor:

      // The new value we will be contributing to the atomic operation is the

      // old value times the parity of the number of active lanes.

      Value *const Ctpop = B.CreateIntCast(

          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);

      NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));

      break;

    }

  }


  // We only want a single lane to enter our new control flow, and we do this

  // by checking if there are any active lanes below us. Only one lane will

  // have 0 active lanes below us, so that will be the only one to progress.

  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getInt32(0));


  // Store I's original basic block before we split the block.

  BasicBlock *const OriginalBB = I.getParent();


  // We need to introduce some new control flow to force a single lane to be

  // active. We do this by splitting I's basic block at I, and introducing the

  // new block such that:

  // entry --> single_lane -\

  //       \------------------> exit

  Instruction *const SingleLaneTerminator =

      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);


  // At this point, we have split the I's block to allow one lane in wavefront

  // to update the precomputed reduced value. Also, completed the codegen for

  // new control flow i.e. iterative loop which perform reduction and scan using

  // ComputeLoop and ComputeEnd.

  // For the new control flow, we need to move branch instruction i.e.

  // terminator created during SplitBlockAndInsertIfThen from I's block to

  // ComputeEnd block. We also need to set up predecessor to next block when

  // single lane done updating the final reduced value.

  BasicBlock *Predecessor = nullptr;

  if (ValDivergent && ScanImpl == ScanOptions::Iterative) {

    // Move terminator from I's block to ComputeEnd block.

    //

    // OriginalBB is known to have a branch as terminator because

    // SplitBlockAndInsertIfThen will have inserted one.

    BranchInst *Terminator = cast<BranchInst>(OriginalBB->getTerminator());

    B.SetInsertPoint(ComputeEnd);

    Terminator->removeFromParent();

    B.Insert(Terminator);


    // Branch to ComputeLoop Block unconditionally from the I's block for

    // iterative approach.

    B.SetInsertPoint(OriginalBB);

    B.CreateBr(ComputeLoop);


    // Update the dominator tree for new control flow.

    SmallVector<DominatorTree::UpdateType, 6> DomTreeUpdates(

        {{DominatorTree::Insert, OriginalBB, ComputeLoop},

         {DominatorTree::Insert, ComputeLoop, ComputeEnd}});


    // We're moving the terminator from EntryBB to ComputeEnd, make sure we move

    // the DT edges as well.

    for (auto *Succ : Terminator->successors()) {

      DomTreeUpdates.push_back({DominatorTree::Insert, ComputeEnd, Succ});

      DomTreeUpdates.push_back({DominatorTree::Delete, OriginalBB, Succ});

    }


    DTU.applyUpdates(DomTreeUpdates);


    Predecessor = ComputeEnd;

  } else {

    Predecessor = OriginalBB;

  }

  // Move the IR builder into single_lane next.

  B.SetInsertPoint(SingleLaneTerminator);


  // Clone the original atomic operation into single lane, replacing the

  // original value with our newly created one.

  Instruction *const NewI = I.clone();

  B.Insert(NewI);

  NewI->setOperand(ValIdx, NewV);


  // Move the IR builder into exit next, and start inserting just before the

  // original instruction.

  B.SetInsertPoint(&I);


  if (NeedResult) {

    // Create a PHI node to get our new atomic result into the exit block.

    PHINode *const PHI = B.CreatePHI(Ty, 2);

    PHI->addIncoming(PoisonValue::get(Ty), Predecessor);

    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());


    // We need to broadcast the value who was the lowest active lane (the first

    // lane) to all other lanes in the wavefront. We use an intrinsic for this,

    // but have to handle 64-bit broadcasts with two calls to this intrinsic.

    Value *BroadcastI = nullptr;


    if (TyBitWidth == 64) {

      Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);

      Value *const ExtractLo = B.CreateTrunc(CastedPhi, Int32Ty);

      Value *const ExtractHi =

          B.CreateTrunc(B.CreateLShr(CastedPhi, 32), Int32Ty);

      CallInst *const ReadFirstLaneLo =

          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);

      CallInst *const ReadFirstLaneHi =

          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);

      Value *const PartialInsert = B.CreateInsertElement(

          PoisonValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));

      Value *const Insert =

          B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));

      BroadcastI = B.CreateBitCast(Insert, Ty);

    } else if (TyBitWidth == 32) {

      Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);

      BroadcastI =

          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, CastedPhi);

      BroadcastI = B.CreateBitCast(BroadcastI, Ty);


    } else {

      llvm_unreachable("Unhandled atomic bit width");

    }


    // Now that we have the result of our single atomic operation, we need to

    // get our individual lane's slice into the result. We use the lane offset

    // we previously calculated combined with the atomic result value we got

    // from the first lane, to get our lane's index into the atomic result.

    Value *LaneOffset = nullptr;

    if (ValDivergent) {

      if (ScanImpl == ScanOptions::DPP) {

        LaneOffset =

            B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);

      } else if (ScanImpl == ScanOptions::Iterative) {

        LaneOffset = ExclScan;

      } else {

        llvm_unreachable("Atomic Optimzer is disabled for None strategy");

      }

    } else {

      Mbcnt = isAtomicFloatingPointTy ? B.CreateUIToFP(Mbcnt, Ty)

                                      : B.CreateIntCast(Mbcnt, Ty, false);

      switch (Op) {

      default:

        llvm_unreachable("Unhandled atomic op");

      case AtomicRMWInst::Add:

      case AtomicRMWInst::Sub:

        LaneOffset = buildMul(B, V, Mbcnt);

        break;

      case AtomicRMWInst::And:

      case AtomicRMWInst::Or:

      case AtomicRMWInst::Max:

      case AtomicRMWInst::Min:

      case AtomicRMWInst::UMax:

      case AtomicRMWInst::UMin:

      case AtomicRMWInst::FMin:

      case AtomicRMWInst::FMax:

        LaneOffset = B.CreateSelect(Cond, Identity, V);

        break;

      case AtomicRMWInst::Xor:

        LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));

        break;

      case AtomicRMWInst::FAdd:

      case AtomicRMWInst::FSub: {

        LaneOffset = B.CreateFMul(V, Mbcnt);

        break;

      }

      }

    }

    Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);


    if (IsPixelShader) {

      // Need a final PHI to reconverge to above the helper lane branch mask.

      B.SetInsertPoint(PixelExitBB, PixelExitBB->getFirstNonPHIIt());


      PHINode *const PHI = B.CreatePHI(Ty, 2);

      PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB);

      PHI->addIncoming(Result, I.getParent());

      I.replaceAllUsesWith(PHI);

    } else {

      // Replace the original atomic instruction with the new one.

      I.replaceAllUsesWith(Result);

    }

  }


  // And delete the original.

  I.eraseFromParent();

}


INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,

                      "AMDGPU atomic optimizations", false, false)

INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)

INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)

INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,

                    "AMDGPU atomic optimizations", false, false)


FunctionPass *llvm::createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy) {

  return new AMDGPUAtomicOptimizer(ScanStrategy);

}

FMAInstKind::Accumulator
@ Accumulator

DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: AArch64SLSHardening.cpp:74

getIdentityValueForAtomicOp
static Constant * getIdentityValueForAtomicOp(Type *const Ty, AtomicRMWInst::BinOp Op)
Definition: AMDGPUAtomicOptimizer.cpp:638

buildMul
static Value * buildMul(IRBuilder<> &B, Value *LHS, Value *RHS)
Definition: AMDGPUAtomicOptimizer.cpp:669

buildNonAtomicBinOp
static Value * buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *LHS, Value *RHS)
Definition: AMDGPUAtomicOptimizer.cpp:348

PHI
Rewrite undef for PHI
Definition: AMDGPURewriteUndefForPHI.cpp:100

AMDGPU.h

BasicBlockUtils.h

B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")

Info
Analysis containing CSE Info
Definition: CSEInfo.cpp:27

Idx
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
Definition: DeadArgumentElimination.cpp:354

DomTreeUpdater.h

GCNSubtarget.h
AMD GCN specific subclass of TargetSubtarget.

DEBUG_TYPE
#define DEBUG_TYPE
Definition: GenericCycleImpl.h:30

IRBuilder.h

InitializePasses.h

InstVisitor.h

optimizations
Generic memory optimizations
Definition: LoadStoreOpt.cpp:55

F
#define F(x, y, z)
Definition: MD5.cpp:55

I
#define I(x, y, z)
Definition: MD5.cpp:58

Int32Ty
IntegerType * Int32Ty
Definition: NVVMIntrRange.cpp:67

TM
const char LLVMTargetMachineRef TM
Definition: PassBuilderBindings.cpp:47

INITIALIZE_PASS_DEPENDENCY
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55

INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59

INITIALIZE_PASS_BEGIN
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52

Cond
const SmallVectorImpl< MachineOperand > & Cond
Definition: RISCVRedundantCopyElimination.cpp:75

assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())

TargetPassConfig.h
Target-Independent Code Generator Pass Configuration Options pass.

UniformityAnalysis.h
LLVM IR instance of the generic uniformity analysis.

RHS
Value * RHS
Definition: X86PartialReduction.cpp:76

LHS
Value * LHS
Definition: X86PartialReduction.cpp:75

llvm::APFloat::getInf
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:966

llvm::APFloat::getZero
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition: APFloat.h:957

llvm::APInt::getMaxValue
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition: APInt.h:184

llvm::APInt::getSignedMaxValue
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:187

llvm::APInt::getMinValue
static APInt getMinValue(unsigned numBits)
Gets minimum unsigned value of APInt for a specific bit width.
Definition: APInt.h:194

llvm::APInt::getSignedMinValue
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:197

llvm::AnalysisManager
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:321

llvm::AnalysisManager::getResult
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:473

llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47

llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75

llvm::AnalysisUsage::addPreserved
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
Definition: PassAnalysisSupport.h:98

llvm::AtomicRMWInst
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:748

llvm::AtomicRMWInst::isFPOperation
static bool isFPOperation(BinOp Op)
Definition: Instructions.h:849

llvm::AtomicRMWInst::BinOp
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:760

llvm::AtomicRMWInst::Add
@ Add
*p = old + v
Definition: Instructions.h:764

llvm::AtomicRMWInst::FAdd
@ FAdd
*p = old + v
Definition: Instructions.h:785

llvm::AtomicRMWInst::Min
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:778

llvm::AtomicRMWInst::Or
@ Or
*p = old | v
Definition: Instructions.h:772

llvm::AtomicRMWInst::Sub
@ Sub
*p = old - v
Definition: Instructions.h:766

llvm::AtomicRMWInst::And
@ And
*p = old & v
Definition: Instructions.h:768

llvm::AtomicRMWInst::Xor
@ Xor
*p = old ^ v
Definition: Instructions.h:774

llvm::AtomicRMWInst::FSub
@ FSub
*p = old - v
Definition: Instructions.h:788

llvm::AtomicRMWInst::Max
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:776

llvm::AtomicRMWInst::UMin
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:782

llvm::AtomicRMWInst::FMin
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:796

llvm::AtomicRMWInst::UMax
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:780

llvm::AtomicRMWInst::FMax
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:792

llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:60

llvm::BasicBlock::getFirstNonPHIIt
InstListType::const_iterator getFirstNonPHIIt() const
Iterator returning form of getFirstNonPHI.
Definition: BasicBlock.cpp:367

llvm::BasicBlock::Create
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:199

llvm::BasicBlock::getTerminator
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:221

llvm::BranchInst
Conditional or Unconditional Branch instruction.
Definition: Instructions.h:3439

llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1565

llvm::CmpInst::Predicate
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:993

llvm::CmpInst::ICMP_SLT
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:1022

llvm::CmpInst::ICMP_UGT
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:1016

llvm::CmpInst::ICMP_SGT
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:1020

llvm::CmpInst::ICMP_ULT
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:1018

llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:80

llvm::ConstantInt::isOne
bool isOne() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:211

llvm::Constant
This is an important base class in LLVM.
Definition: Constant.h:41

llvm::DWARFExpression::Operation
This class represents an Operation in the Expression.
Definition: DWARFExpression.h:32

llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110

llvm::DomTreeUpdater
Definition: DomTreeUpdater.h:28

llvm::DomTreeUpdater::UpdateStrategy::Lazy
@ Lazy

llvm::DominatorTreeAnalysis
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279

llvm::DominatorTreeBase< BasicBlock, false >::Delete
static constexpr UpdateKind Delete
Definition: GenericDomTree.h:259

llvm::DominatorTreeBase< BasicBlock, false >::Insert
static constexpr UpdateKind Insert
Definition: GenericDomTree.h:258

llvm::DominatorTreeWrapperPass
Legacy analysis pass which computes a DominatorTree.
Definition: Dominators.h:317

llvm::DominatorTreeWrapperPass::getDomTree
DominatorTree & getDomTree()
Definition: Dominators.h:325

llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692

llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:311

llvm::FunctionPass::runOnFunction
virtual bool runOnFunction(Function &F)=0
runOnFunction - Virtual method overriden by subclasses to do the per-function processing of the pass.

llvm::Function
Definition: Function.h:62

llvm::GCNSubtarget
Definition: GCNSubtarget.h:35

llvm::GenericUniformityInfo< SSAContext >

llvm::IRBuilder
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2666

llvm::InstVisitor
Base class for instruction visitors.
Definition: InstVisitor.h:78

llvm::InstVisitor::visitIntrinsicInst
RetTy visitIntrinsicInst(IntrinsicInst &I)
Definition: InstVisitor.h:219

llvm::InstVisitor::visitAtomicRMWInst
RetTy visitAtomicRMWInst(AtomicRMWInst &I)
Definition: InstVisitor.h:172

llvm::Instruction
Definition: Instruction.h:49

llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:152

llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47

llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67

llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65

llvm::PHINode
Definition: Instructions.h:2973

llvm::PHINode::addIncoming
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Definition: Instructions.h:3134

llvm::Pass::getAnalysisUsage
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:98

llvm::PoisonValue::get
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1827

llvm::PreservedAnalyses
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:109

llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:115

llvm::PreservedAnalyses::preserve
void preserve()
Mark an analysis as preserved.
Definition: Analysis.h:129

llvm::SmallVectorTemplateBase::push_back
void push_back(const T &Elt)
Definition: SmallVector.h:426

llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209

llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76

llvm::TargetMachine::getSubtarget
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
Definition: TargetMachine.h:170

llvm::TargetPassConfig
Target-Independent Code Generator Pass Configuration Options.
Definition: TargetPassConfig.h:85

llvm::TargetPassConfig::getTM
TMC & getTM() const
Get the right type of TargetMachine for this target.
Definition: TargetPassConfig.h:157

llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45

llvm::Type::getFltSemantics
const fltSemantics & getFltSemantics() const

llvm::Type::getContext
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129

llvm::Type::isFloatingPointTy
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185

llvm::Type::getPrimitiveSizeInBits
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.

llvm::UniformityInfoAnalysis
Analysis pass which computes UniformityInfo.
Definition: UniformityAnalysis.h:27

llvm::UniformityInfoWrapperPass
Legacy analysis pass which computes a CycleInfo.
Definition: UniformityAnalysis.h:55

llvm::User::setOperand
void setOperand(unsigned i, Value *Val)
Definition: User.h:174

llvm::Value
LLVM Value Representation.
Definition: Value.h:74

unsigned

llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:143

TargetMachine.h

false
Definition: StackSlotColoring.cpp:184

llvm::AMDGPUAS::LOCAL_ADDRESS
@ LOCAL_ADDRESS
Address space for local memory.
Definition: AMDGPUAddrSpace.h:35

llvm::AMDGPUAS::GLOBAL_ADDRESS
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
Definition: AMDGPUAddrSpace.h:31

llvm::AMDGPU::DPP::QUAD_PERM_ID
@ QUAD_PERM_ID
Definition: SIDefines.h:919

llvm::AMDGPU::DPP::WAVE_SHR1
@ WAVE_SHR1
Definition: SIDefines.h:939

llvm::AMDGPU::DPP::ROW_SHR0
@ ROW_SHR0
Definition: SIDefines.h:926

llvm::AMDGPU
Definition: AMDGPUMetadataVerifier.h:33

llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73

llvm::ARM::ProfileKind::M
@ M

llvm::BitmaskEnumDetail::Mask
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121

llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24

llvm::CallingConv::AMDGPU_PS
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194

llvm::CallingConv::C
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34

llvm::Intrinsic::getDeclaration
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1465

llvm::M68k::MemAddrModeKind::V
@ V

llvm::MCID::Terminator
@ Terminator
Definition: MCInstrDesc.h:158

llvm::cfg::UpdateKind::Insert
@ Insert

llvm::ms_demangle::QualifierMangleMode::Result
@ Result

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18

llvm::ScanOptions
ScanOptions
Definition: AMDGPU.h:99

llvm::ScanOptions::DPP
@ DPP

llvm::ScanOptions::None
@ None

llvm::ScanOptions::Iterative
@ Iterative

llvm::createAMDGPUAtomicOptimizerPass
FunctionPass * createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy)
Definition: AMDGPUAtomicOptimizer.cpp:1029

llvm::Op
DWARFExpression::Operation Op
Definition: DWARFExpression.cpp:22

llvm::BitWidth
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191

llvm::AMDGPUAtomicOptimizerID
char & AMDGPUAtomicOptimizerID
Definition: AMDGPUAtomicOptimizer.cpp:110

llvm::SplitBlockAndInsertIfThen
Instruction * SplitBlockAndInsertIfThen(Value *Cond, BasicBlock::iterator SplitBefore, bool Unreachable, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, BasicBlock *ThenBlock=nullptr)
Split the containing block at the specified instruction - everything before SplitBefore stays in the ...
Definition: BasicBlockUtils.cpp:1607

llvm::AMDGPUAtomicOptimizerPass::run
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
Definition: AMDGPUAtomicOptimizer.cpp:136