doxygen/AMDGPUImageIntrinsicOptimizer_8cpp_source.html

//===- AMDGPUImageIntrinsicOptimizer.cpp ----------------------------------===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

//

// This pass tries to combine multiple image_load intrinsics with dim=2dmsaa

// or dim=2darraymsaa into a single image_msaa_load intrinsic if:

//

// - they refer to the same vaddr except for sample_id,

// - they use a constant sample_id and they fall into the same group,

// - they have the same dmask and the number of intrinsics and the number of

//   vaddr/vdata dword transfers is reduced by the combine.

//

// Examples for the tradeoff (all are assuming 2DMsaa for vaddr):

//

// +----------+-----+-----+-------+---------+------------+---------+----------+

// | popcount | a16 | d16 | #load | vaddr / | #msaa_load | vaddr / | combine? |

// |  (dmask) |     |     |       | vdata   |            | vdata   |          |

// +----------+-----+-----+-------+---------+------------+---------+----------+

// |        1 |   0 |   0 |     4 |  12 / 4 |          1 |   3 / 4 | yes      |

// +----------+-----+-----+-------+---------+------------+---------+----------+

// |        1 |   0 |   0 |     2 |   6 / 2 |          1 |   3 / 4 | yes?     |

// +----------+-----+-----+-------+---------+------------+---------+----------+

// |        2 |   0 |   0 |     4 |  12 / 8 |          2 |   6 / 8 | yes      |

// +----------+-----+-----+-------+---------+------------+---------+----------+

// |        2 |   0 |   0 |     2 |   6 / 4 |          2 |   6 / 8 | no       |

// +----------+-----+-----+-------+---------+------------+---------+----------+

// |        1 |   0 |   1 |     2 |   6 / 2 |          1 |   3 / 2 | yes      |

// +----------+-----+-----+-------+---------+------------+---------+----------+

//

// Some cases are of questionable benefit, like the one marked with "yes?"

// above: fewer intrinsics and fewer vaddr and fewer total transfers between SP

// and TX, but higher vdata. We start by erring on the side of converting these

// to MSAA_LOAD.

//

// clang-format off

//

// This pass will combine intrinsics such as (not necessarily consecutive):

//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)

//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)

//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)

//  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)

// ==>

//  call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)

//

// clang-format on

//

// Future improvements:

//

// - We may occasionally not want to do the combine if it increases the maximum

//   register pressure.

//

// - Ensure clausing when multiple MSAA_LOAD are generated.

//

// Note: Even though the image_msaa_load intrinsic already exists on gfx10, this

// combine only applies to gfx11, due to a limitation in gfx10: the gfx10

// IMAGE_MSAA_LOAD only works correctly with single-channel texture formats, and

// we don't know the format at compile time.

//===----------------------------------------------------------------------===//


#include "AMDGPU.h"

#include "AMDGPUInstrInfo.h"

#include "AMDGPUTargetMachine.h"

#include "llvm/IR/Function.h"

#include "llvm/IR/IRBuilder.h"

#include "llvm/IR/IntrinsicInst.h"

#include "llvm/IR/IntrinsicsAMDGPU.h"

#include "llvm/Pass.h"

#include "llvm/Support/raw_ostream.h"


using namespace llvm;


#define DEBUG_TYPE "amdgpu-image-intrinsic-opt"


namespace {

/// Legacy pass-manager wrapper for the image intrinsic optimizer.
///
/// The class only remembers the TargetMachine it was constructed with and
/// hands it to imageIntrinsicOptimizerImpl() from runOnFunction().
class AMDGPUImageIntrinsicOptimizer : public FunctionPass {
public:
  /// Pass identification, handed to the FunctionPass base constructor.
  static char ID;

  /// \param TM Target machine forwarded to the implementation; defaults to
  ///           nullptr, in which case the implementation does nothing.
  AMDGPUImageIntrinsicOptimizer(const TargetMachine *TM = nullptr)
      : FunctionPass(ID), TM(TM) {}

  /// Runs the optimization on \p F unless the function is to be skipped.
  bool runOnFunction(Function &F) override;

private:
  /// Target machine supplied at construction; may be null.
  const TargetMachine *TM;
}; // End of class AMDGPUImageIntrinsicOptimizer

} // End anonymous namespace


// Legacy pass-manager initialization boilerplate: the pass is identified by
// the DEBUG_TYPE argument string and the human-readable name below; the
// trailing "cfg" and "analysis" flags are both false.
INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE,

                "AMDGPU Image Intrinsic Optimizer", false, false)


// Out-of-class definition of the static pass identification member.
char AMDGPUImageIntrinsicOptimizer::ID = 0;


/// Add \p II to the group in \p MergeableInsts that it can be combined with,
/// or start a new single-element group when no existing group matches.
///
/// \p II joins a group when, compared with that group's first instruction, it
/// has the same intrinsic ID, the same return type and identical call
/// arguments. The one exception is the FragId operand (index VAddrEnd - 1),
/// which only has to fall into the same group of four (FragId / 4).
///
/// \param II             Image load to classify. Its FragId operand must be a
///                       ConstantInt (enforced by the cast<> below).
/// \param MergeableInsts Groups collected so far; updated in place.
/// \param ImageDimIntr   Dimension info for \p II's intrinsic; only VAddrEnd
///                       is read here.
void addInstToMergeableList(
    IntrinsicInst *II,
    SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts,
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) {
  const unsigned FragIdIndex = ImageDimIntr->VAddrEnd - 1;

  for (SmallVector<IntrinsicInst *, 4> &IIList : MergeableInsts) {
    IntrinsicInst *Leader = IIList.front();

    // Check Dim.
    if (Leader->getIntrinsicID() != II->getIntrinsicID())
      continue;

    // Check D16.
    if (Leader->getType() != II->getType())
      continue;

    // Check all arguments (DMask, VAddr, RSrc etc).
    //
    // The scan starts at operand 0 so that DMask is really compared. The
    // return-type check above only distinguishes dmasks with a different
    // number of enabled channels; two loads with e.g. dmask 1 and dmask 2
    // share a return type, and merging them would make every load in the
    // group read the channel selected by the first load's dmask.
    bool AllEqual = true;
    assert(Leader->arg_size() == II->arg_size());
    for (unsigned I = 0, E = II->arg_size(); AllEqual && I != E; ++I) {
      if (I == FragIdIndex) {
        // Check FragId group.
        auto *FragIdList = cast<ConstantInt>(Leader->getArgOperand(I));
        auto *FragId = cast<ConstantInt>(II->getArgOperand(I));
        AllEqual = FragIdList->getValue().udiv(4) == FragId->getValue().udiv(4);
      } else {
        // Check all arguments except FragId.
        AllEqual = Leader->getArgOperand(I) == II->getArgOperand(I);
      }
    }
    if (!AllEqual)
      continue;

    // Add to the list.
    IIList.emplace_back(II);
    return;
  }

  // Similar instruction not found, so add a new list.
  MergeableInsts.emplace_back(1, II);
  LLVM_DEBUG(dbgs() << "New: " << *II << "\n");
}


// Walk the instructions in [I, E) and gather, into MergeableInsts, the
// 2dmsaa / 2darraymsaa image loads whose FragId operand is a constant.
// The walk ends just past the first instruction that may have side effects
// (e.g. a store or a memory barrier), so nothing is combined across it.
// Returns an iterator to the instruction after the last one examined.
BasicBlock::iterator collectMergeableInsts(
    BasicBlock::iterator I, BasicBlock::iterator E,
    SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts) {
  while (I != E) {
    Instruction *Cur = &*I;
    // Step past the current instruction up front; every path below either
    // keeps scanning or returns the position following it.
    ++I;

    // A possible side effect terminates the current section.
    if (Cur->mayHaveSideEffects())
      break;

    // Only intrinsic calls are of interest.
    auto *II = dyn_cast<IntrinsicInst>(Cur);
    if (!II)
      continue;

    // Only the two MSAA image load intrinsics can be merged.
    const Intrinsic::ID IntrinID = II->getIntrinsicID();
    const bool IsMsaaLoad =
        IntrinID == Intrinsic::amdgcn_image_load_2dmsaa ||
        IntrinID == Intrinsic::amdgcn_image_load_2darraymsaa;
    if (!IsMsaaLoad)
      continue;

    // The FragId (last vaddr operand) has to be a compile-time constant.
    const auto *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinID);
    const unsigned FragIdIndex = ImageDimIntr->VAddrEnd - 1;
    if (!isa<ConstantInt>(II->getArgOperand(FragIdIndex)))
      continue;

    LLVM_DEBUG(dbgs() << "Merge: " << *II << "\n");
    addInstToMergeableList(II, MergeableInsts, ImageDimIntr);
  }

  return I;
}


/// Rewrite every profitable group in \p MergeableInsts.
///
/// For each group with more than one load, one image_msaa_load call is
/// emitted per enabled dmask bit (inserted before the group's first load),
/// and every original load is replaced by extractelement/insertelement
/// instructions that pick its sample out of the new calls. Groups for which
/// the combine would not reduce the instruction count or the vaddr/vdata
/// dword transfers are left alone.
///
/// \returns true if at least one group was rewritten.
bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *, 4>> MergeableInsts) {
  bool Modified = false;
  SmallVector<Instruction *, 4> InstrsToErase;

  for (const auto &IIList : MergeableInsts) {
    // A group of one has nothing to be merged with.
    if (IIList.size() <= 1)
      continue;

    IntrinsicInst *First = IIList.front();

    // Start from the first load's operands; DMask and FragId are overridden
    // further down.
    SmallVector<Value *, 16> Args(First->args());

    // Validate function argument and return types, extracting overloaded
    // types along the way.
    SmallVector<Type *, 6> OverloadTys;
    Function *F = First->getCalledFunction();
    if (!Intrinsic::getIntrinsicSignature(F, OverloadTys))
      continue;

    const Intrinsic::ID IntrinID = First->getIntrinsicID();
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
        AMDGPU::getImageDimIntrinsicInfo(IntrinID);

    // The merged load always returns a 4-element vector of the scalar type.
    Type *EltTy = First->getType()->getScalarType();
    OverloadTys[0] = FixedVectorType::get(EltTy, 4);
    // Half elements are counted two per dword in the cost estimate below.
    const int EltsPerDword = EltTy->isHalfTy() ? 2 : 1;

    ConstantInt *DMask =
        cast<ConstantInt>(First->getArgOperand(ImageDimIntr->DMaskIndex));
    const unsigned DMaskVal = DMask->getZExtValue() & 0xf;
    const unsigned NumElts = popcount(DMaskVal);

    // Number of instructions and the number of vaddr/vdata dword transfers
    // should be reduced.
    const unsigned NumLoads = IIList.size();
    const unsigned NumMsaas = NumElts;
    const unsigned LoadTransfers =
        3 * NumLoads + divideCeil(NumElts, EltsPerDword) * NumLoads;
    const unsigned MsaaTransfers =
        3 * NumMsaas + divideCeil(4, EltsPerDword) * NumMsaas;
    const bool Profitable =
        NumLoads >= NumMsaas && LoadTransfers >= MsaaTransfers;
    if (!Profitable)
      continue;

    // The merged load addresses the first sample of the group of four.
    const unsigned FragIdIndex = ImageDimIntr->VAddrEnd - 1;
    auto *FragId = cast<ConstantInt>(First->getArgOperand(FragIdIndex));
    const APInt NewFragIdVal = FragId->getValue().udiv(4) * 4;
    Args[FragIdIndex] = ConstantInt::get(FragId->getType(), NewFragIdVal);

    // Pick the msaa_load flavour matching the original dimension.
    const Intrinsic::ID NewIntrinID =
        IntrinID == Intrinsic::amdgcn_image_load_2dmsaa
            ? Intrinsic::amdgcn_image_msaa_load_2dmsaa
            : Intrinsic::amdgcn_image_msaa_load_2darraymsaa;

    // Create the new instructions in front of the first load of the group.
    IRBuilder<> B(First);

    // Create one image_msaa_load per enabled dmask bit, lowest bit first.
    SmallVector<Instruction *, 4> NewCalls;
    for (unsigned Remaining = DMaskVal; Remaining != 0;
         Remaining &= Remaining - 1) {
      const unsigned NewMaskVal = 1 << countr_zero(Remaining);
      Args[ImageDimIntr->DMaskIndex] =
          ConstantInt::get(DMask->getType(), NewMaskVal);

      CallInst *NewCall = B.CreateIntrinsic(NewIntrinID, OverloadTys, Args);
      LLVM_DEBUG(dbgs() << "Optimize: " << *NewCall << "\n");
      NewCalls.push_back(NewCall);
    }

    // Create the new extractelement instructions.
    for (IntrinsicInst *II : IIList) {
      auto *Idx = cast<ConstantInt>(II->getArgOperand(FragIdIndex));
      // Position of this load's sample inside the 4-wide result.
      const uint64_t Lane = Idx->getValue().urem(4);
      B.SetCurrentDebugLocation(II->getDebugLoc());

      Value *VecOp = nullptr;
      if (NumElts == 1) {
        VecOp = B.CreateExtractElement(NewCalls[0], Lane);
        LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
      } else {
        // Rebuild the original multi-channel result one channel at a time.
        VecOp = PoisonValue::get(II->getType());
        for (unsigned I = 0; I < NumElts; ++I) {
          Value *Channel = B.CreateExtractElement(NewCalls[I], Lane);
          VecOp = B.CreateInsertElement(VecOp, Channel, I);
          LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
        }
      }

      // Replace the old instruction; it is erased once all groups are done.
      II->replaceAllUsesWith(VecOp);
      VecOp->takeName(II);
      InstrsToErase.push_back(II);
    }

    Modified = true;
  }

  for (Instruction *Dead : InstrsToErase)
    Dead->eraseFromParent();

  return Modified;
}


/// Shared implementation used by both pass-manager entry points.
///
/// Splits every basic block of \p F into sections with
/// collectMergeableInsts() and rewrites each section with optimizeSection().
///
/// \param F  Function to optimize.
/// \param TM Target machine used to query the subtarget; when null nothing
///           is done.
/// \returns true if \p F was modified.
static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM) {
  if (!TM)
    return false;

  // This optimization only applies to GFX11 and beyond.
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  const bool Supported = AMDGPU::isGFX11Plus(ST) && !ST.hasMSAALoadDstSelBug();
  if (!Supported)
    return false;

  // Early test to determine if the intrinsics are used: look for a used
  // declaration of either MSAA image load anywhere in the module.
  auto IsUsedMsaaLoad = [](Function &Fn) {
    if (Fn.users().empty())
      return false;
    const Intrinsic::ID IID = Fn.getIntrinsicID();
    return IID == Intrinsic::amdgcn_image_load_2dmsaa ||
           IID == Intrinsic::amdgcn_image_load_2darraymsaa;
  };
  if (llvm::none_of(*F.getParent(), IsUsedMsaaLoad))
    return false;

  bool Modified = false;
  for (BasicBlock &BB : F) {
    BasicBlock::iterator Cursor = BB.begin();
    const BasicBlock::iterator End = BB.end();
    while (Cursor != End) {
      SmallVector<SmallVector<IntrinsicInst *, 4>> MergeableInsts;

      // Collect one section, then continue right after it.
      Cursor = collectMergeableInsts(Cursor, End, MergeableInsts);
      Modified |= optimizeSection(MergeableInsts);
    }
  }

  return Modified;
}


/// Legacy pass-manager entry point: does nothing for functions that
/// skipFunction() rejects, otherwise forwards to the shared implementation.
bool AMDGPUImageIntrinsicOptimizer::runOnFunction(Function &F) {
  return !skipFunction(F) && imageIntrinsicOptimizerImpl(F, TM);
}


/// Factory for the legacy pass: returns a newly heap-allocated
/// AMDGPUImageIntrinsicOptimizer constructed with \p TM. The object is
/// created with a plain `new`; releasing it is left to the caller
/// (presumably the pass manager it is added to -- confirm at the call site).
FunctionPass *


llvm::createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *TM) {

  return new AMDGPUImageIntrinsicOptimizer(TM);

}


/// New pass-manager entry point: runs the shared implementation on \p F and
/// reports no analyses as preserved when the function changed, all of them
/// otherwise. \p AM is not used.
PreservedAnalyses
AMDGPUImageIntrinsicOptimizerPass::run(Function &F,
                                       FunctionAnalysisManager &AM) {
  if (imageIntrinsicOptimizerImpl(F, &TM))
    return PreservedAnalyses::none();
  return PreservedAnalyses::all();
}


assert
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")

const
aarch64 promote const
Definition AArch64PromoteConstant.cpp:228

optimizeSection
bool optimizeSection(ArrayRef< SmallVector< IntrinsicInst *, 4 > > MergeableInsts)
Definition AMDGPUImageIntrinsicOptimizer.cpp:176

addInstToMergeableList
INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE, "AMDGPU Image Intrinsic Optimizer", false, false) char AMDGPUImageIntrinsicOptimizer void addInstToMergeableList(IntrinsicInst *II, SmallVector< SmallVector< IntrinsicInst *, 4 > > &MergeableInsts, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr)
Definition AMDGPUImageIntrinsicOptimizer.cpp:98

collectMergeableInsts
BasicBlock::iterator collectMergeableInsts(BasicBlock::iterator I, BasicBlock::iterator E, SmallVector< SmallVector< IntrinsicInst *, 4 > > &MergeableInsts)
Definition AMDGPUImageIntrinsicOptimizer.cpp:142

imageIntrinsicOptimizerImpl
static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM)
Definition AMDGPUImageIntrinsicOptimizer.cpp:283

AMDGPUInstrInfo.h
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.

AMDGPUTargetMachine.h
The AMDGPU TargetMachine interface definition for hw codegen targets.

AMDGPU.h

E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")

B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")

runOnFunction
static bool runOnFunction(Function &F, bool PostInlining)
Definition EntryExitInstrumenter.cpp:103

DEBUG_TYPE
#define DEBUG_TYPE
Definition GenericCycleImpl.h:31

IRBuilder.h

Function.h

IntrinsicInst.h

LoopDeletionResult::Modified
@ Modified
Definition LoopDeletion.cpp:47

F
#define F(x, y, z)
Definition MD5.cpp:55

I
#define I(x, y, z)
Definition MD5.cpp:58

II
uint64_t IntrinsicInst * II
Definition NVVMIntrRange.cpp:46

INITIALIZE_PASS
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition PassSupport.h:56

Pass.h

LLVM_DEBUG
#define LLVM_DEBUG(...)
Definition Debug.h:114

char

llvm::APInt
Class for arbitrary precision integers.
Definition APInt.h:78

llvm::APInt::udiv
LLVM_ABI APInt udiv(const APInt &RHS) const
Unsigned division operation.
Definition APInt.cpp:1573

llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41

llvm::BasicBlock::iterator
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170

llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition Instructions.h:1511

llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition Constants.h:87

llvm::ConstantInt::getZExtValue
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:163

llvm::FixedVectorType::get
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803

llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314

llvm::Function
Definition Function.h:64

llvm::GCNSubtarget
Definition GCNSubtarget.h:34

llvm::IRBuilder
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788

llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition IntrinsicInst.h:49

llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67

llvm::PoisonValue::get
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition Constants.cpp:1888

llvm::PreservedAnalyses
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112

llvm::PreservedAnalyses::none
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition Analysis.h:115

llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118

llvm::SmallVectorTemplateBase::push_back
void push_back(const T &Elt)
Definition SmallVector.h:416

llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition SmallVector.h:1196

llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition TargetMachine.h:83

llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45

llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352

llvm::Type::isHalfTy
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142

llvm::Value
LLVM Value Representation.
Definition Value.h:75

llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256

llvm::Value::takeName
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:396

uint8_t

Changed
Changed
Definition ObjCARCOpts.cpp:2370

llvm::AMDGPU
Definition AMDGPUMetadataVerifier.h:34

llvm::AMDGPU::isGFX11Plus
bool isGFX11Plus(const MCSubtargetInfo &STI)
Definition AMDGPUBaseInfo.cpp:2512

llvm::AMDGPU::getImageDimIntrinsicInfo
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)

llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24

llvm::Intrinsic::ID
unsigned ID
Definition GenericSSAContext.h:28

llvm::Intrinsic::getIntrinsicSignature
LLVM_ABI bool getIntrinsicSignature(Intrinsic::ID, FunctionType *FT, SmallVectorImpl< Type * > &ArgTys)
Gets the type arguments of an intrinsic call by matching type contraints specified by the ....
Definition Intrinsics.cpp:1052

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition AddressRanges.h:18

llvm::dyn_cast
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:644

llvm::countr_zero
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:186

llvm::dbgs
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207

llvm::none_of
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739

llvm::isa
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548

llvm::divideCeil
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:405

llvm::cast
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:560

llvm::FunctionAnalysisManager
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
Definition PassManager.h:564

llvm::createAMDGPUImageIntrinsicOptimizerPass
FunctionPass * createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *)
Definition AMDGPUImageIntrinsicOptimizer.cpp:325

llvm::popcount
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154

raw_ostream.h

llvm::AMDGPUImageIntrinsicOptimizerPass::run
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
Definition AMDGPUImageIntrinsicOptimizer.cpp:330

llvm::AMDGPU::ImageDimIntrinsicInfo
Definition AMDGPUInstrInfo.h:50

llvm::AMDGPU::ImageDimIntrinsicInfo::VAddrEnd
uint8_t VAddrEnd
Definition AMDGPUInstrInfo.h:73

llvm::AMDGPU::ImageDimIntrinsicInfo::DMaskIndex
uint8_t DMaskIndex
Definition AMDGPUInstrInfo.h:64