LLVM 22.0.0git
AMDGPUPreloadKernelArguments.cpp
Go to the documentation of this file.
1//===- AMDGPUPreloadKernelArguments.cpp - Preload Kernel Arguments --------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file This pass preloads kernel arguments into user_data SGPRs before kernel
10/// execution begins. The number of registers available for preloading depends
11/// on the number of free user SGPRs, up to the hardware's maximum limit.
12/// Implicit arguments enabled in the kernel descriptor are allocated first,
13/// followed by SGPRs used for preloaded kernel arguments. (Reference:
14/// https://llvm.org/docs/AMDGPUUsage.html#initial-kernel-execution-state)
15/// Additionally, hidden kernel arguments may be preloaded, in which case they
16/// are appended to the kernel signature after explicit arguments. Preloaded
17/// arguments will be marked with `inreg`.
18//
19//===----------------------------------------------------------------------===//
20
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
31
// Name used for -debug-only= filtering and for pass registration below.
#define DEBUG_TYPE "amdgpu-preload-kernel-arguments"

using namespace llvm;
35
37 "amdgpu-kernarg-preload-count",
38 cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0));
39
// Command-line toggle (default on) for preloading kernel arguments into SGPRs.
static cl::opt<bool>
    EnableKernargPreload("amdgpu-kernarg-preload",
                         cl::desc("Enable preload kernel arguments to SGPRs"),
                         cl::init(true));
44
45namespace {
46
// Legacy pass manager wrapper. The actual work happens in the file-local
// markKernelArgsAsInreg(); this class only carries the target machine into
// runOnModule.
class AMDGPUPreloadKernelArgumentsLegacy : public ModulePass {
  // May be null when the pass is default-constructed (e.g. via opt); in that
  // case runOnModule is a no-op.
  const GCNTargetMachine *TM;

public:
  static char ID;
  explicit AMDGPUPreloadKernelArgumentsLegacy(
      const GCNTargetMachine *TM = nullptr);

  StringRef getPassName() const override {
    return "AMDGPU Preload Kernel Arguments";
  }

  bool runOnModule(Module &M) override;
};
61
62class PreloadKernelArgInfo {
63private:
64 Function &F;
65 const GCNSubtarget &ST;
66 unsigned NumFreeUserSGPRs;
67
68 enum HiddenArg : unsigned {
69 HIDDEN_BLOCK_COUNT_X,
70 HIDDEN_BLOCK_COUNT_Y,
71 HIDDEN_BLOCK_COUNT_Z,
72 HIDDEN_GROUP_SIZE_X,
73 HIDDEN_GROUP_SIZE_Y,
74 HIDDEN_GROUP_SIZE_Z,
75 HIDDEN_REMAINDER_X,
76 HIDDEN_REMAINDER_Y,
77 HIDDEN_REMAINDER_Z,
78 END_HIDDEN_ARGS
79 };
80
81 // Stores information about a specific hidden argument.
82 struct HiddenArgInfo {
83 // Offset in bytes from the location in the kernearg segment pointed to by
84 // the implicitarg pointer.
85 uint8_t Offset;
86 // The size of the hidden argument in bytes.
87 uint8_t Size;
88 // The name of the hidden argument in the kernel signature.
89 const char *Name;
90 };
91
92 static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
93 {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
94 {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
95 {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
96 {18, 2, "_hidden_remainder_x"}, {20, 2, "_hidden_remainder_y"},
97 {22, 2, "_hidden_remainder_z"}};
98
99 static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
100 for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
101 if (HiddenArgs[I].Offset == Offset)
102 return static_cast<HiddenArg>(I);
103
104 return END_HIDDEN_ARGS;
105 }
106
107 static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
108 if (HA < END_HIDDEN_ARGS)
109 return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);
110
111 llvm_unreachable("Unexpected hidden argument.");
112 }
113
114 static const char *getHiddenArgName(HiddenArg HA) {
115 if (HA < END_HIDDEN_ARGS)
116 return HiddenArgs[HA].Name;
117
118 llvm_unreachable("Unexpected hidden argument.");
119 }
120
121 // Clones the function after adding implicit arguments to the argument list
122 // and returns the new updated function. Preloaded implicit arguments are
123 // added up to and including the last one that will be preloaded, indicated by
124 // LastPreloadIndex. Currently preloading is only performed on the totality of
125 // sequential data from the kernarg segment including implicit (hidden)
126 // arguments. This means that all arguments up to the last preloaded argument
127 // will also be preloaded even if that data is unused.
128 Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
129 FunctionType *FT = F.getFunctionType();
130 LLVMContext &Ctx = F.getParent()->getContext();
131 SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
132 for (unsigned I = 0; I <= LastPreloadIndex; ++I)
133 FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));
134
135 FunctionType *NFT =
136 FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
137 Function *NF =
138 Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());
139
140 NF->copyAttributesFrom(&F);
141 NF->copyMetadata(&F, 0);
142
143 F.getParent()->getFunctionList().insert(F.getIterator(), NF);
144 NF->takeName(&F);
145 NF->splice(NF->begin(), &F);
146
147 Function::arg_iterator NFArg = NF->arg_begin();
148 for (Argument &Arg : F.args()) {
149 Arg.replaceAllUsesWith(&*NFArg);
150 NFArg->takeName(&Arg);
151 ++NFArg;
152 }
153
154 AttrBuilder AB(Ctx);
155 AB.addAttribute(Attribute::InReg);
156 AB.addAttribute("amdgpu-hidden-argument");
157 AttributeList AL = NF->getAttributes();
158 for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
159 AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
160 NFArg++->setName(getHiddenArgName(HiddenArg(I)));
161 }
162
163 NF->setAttributes(AL);
164 F.replaceAllUsesWith(NF);
165
166 return NF;
167 }
168
169public:
170 PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
171 setInitialFreeUserSGPRsCount();
172 }
173
174 // Returns the maximum number of user SGPRs that we have available to preload
175 // arguments.
176 void setInitialFreeUserSGPRsCount() {
177 GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
178 NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs();
179 }
180
181 bool canPreloadKernArgAtOffset(uint64_t ExplicitArgOffset) {
182 return ExplicitArgOffset <= NumFreeUserSGPRs * 4;
183 }
184
185 // Try to allocate SGPRs to preload hidden kernel arguments.
186 void
187 tryAllocHiddenArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
188 SmallVectorImpl<Function *> &FunctionsToErase) {
190 F.getParent(), Intrinsic::amdgcn_implicitarg_ptr);
191 if (!ImplicitArgPtr)
192 return;
193
194 const DataLayout &DL = F.getParent()->getDataLayout();
195 // Pair is the load and the load offset.
197 for (auto *U : ImplicitArgPtr->users()) {
199 if (!CI || CI->getParent()->getParent() != &F)
200 continue;
201
202 for (auto *U : CI->users()) {
203 int64_t Offset = 0;
204 auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
205 if (!Load) {
207 continue;
208
209 Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
210 }
211
212 if (!Load || !Load->isSimple())
213 continue;
214
215 // FIXME: Expand handle merged loads.
216 LLVMContext &Ctx = F.getParent()->getContext();
217 Type *LoadTy = Load->getType();
218 HiddenArg HA = getHiddenArgFromOffset(Offset);
219 if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
220 continue;
221
222 ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
223 }
224 }
225
226 if (ImplicitArgLoads.empty())
227 return;
228
229 // Allocate loads in order of offset. We need to be sure that the implicit
230 // argument can actually be preloaded.
231 std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second());
232
233 // If we fail to preload any implicit argument we know we don't have SGPRs
234 // to preload any subsequent ones with larger offsets. Find the first
235 // argument that we cannot preload.
236 auto *PreloadEnd = llvm::find_if(
237 ImplicitArgLoads, [&](const std::pair<LoadInst *, unsigned> &Load) {
238 unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType());
239 unsigned LoadOffset = Load.second;
240 if (!canPreloadKernArgAtOffset(LoadOffset + LoadSize +
241 ImplicitArgsBaseOffset))
242 return true;
243
244 return false;
245 });
246
247 if (PreloadEnd == ImplicitArgLoads.begin())
248 return;
249
250 unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
251 Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
252 assert(NF);
253 FunctionsToErase.push_back(&F);
254 for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
255 LoadInst *LoadInst = I->first;
256 unsigned LoadOffset = I->second;
257 unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
258 unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
259 Argument *Arg = NF->getArg(Index);
260 LoadInst->replaceAllUsesWith(Arg);
261 }
262 }
263};
264
265} // end anonymous namespace
266
// Unique identifier for the legacy pass; its address is what matters.
char AMDGPUPreloadKernelArgumentsLegacy::ID = 0;

// Register the legacy pass with the PassRegistry (no CFG-only or
// analysis-only properties).
INITIALIZE_PASS(AMDGPUPreloadKernelArgumentsLegacy, DEBUG_TYPE,
                "AMDGPU Preload Kernel Arguments", false, false)
271
274 return new AMDGPUPreloadKernelArgumentsLegacy(
275 static_cast<const GCNTargetMachine *>(TM));
276}
277
// Constructor simply stashes the target machine; a null TM (default) makes
// runOnModule a no-op.
AMDGPUPreloadKernelArgumentsLegacy::AMDGPUPreloadKernelArgumentsLegacy(
    const GCNTargetMachine *TM)
    : ModulePass(ID), TM(TM) {}
281
282static bool markKernelArgsAsInreg(Module &M, const TargetMachine &TM) {
284 return false;
285
286 SmallVector<Function *, 4> FunctionsToErase;
287 bool Changed = false;
288 for (auto &F : M) {
289 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
290 if (!ST.hasKernargPreload() ||
291 F.getCallingConv() != CallingConv::AMDGPU_KERNEL)
292 continue;
293
294 PreloadKernelArgInfo PreloadInfo(F, ST);
295 uint64_t ExplicitArgOffset = 0;
296 const DataLayout &DL = F.getDataLayout();
297 const uint64_t BaseOffset = ST.getExplicitKernelArgOffset();
298 unsigned NumPreloadsRequested = KernargPreloadCount;
299 unsigned NumPreloadedExplicitArgs = 0;
300 for (Argument &Arg : F.args()) {
301 // Avoid incompatible attributes and guard against running this pass
302 // twice.
303 //
304 // TODO: Preload byref kernel arguments
305 if (Arg.hasByRefAttr() || Arg.hasNestAttr() ||
306 Arg.hasAttribute("amdgpu-hidden-argument"))
307 break;
308
309 // Inreg may be pre-existing on some arguments, try to preload these.
310 if (NumPreloadsRequested == 0 && !Arg.hasInRegAttr())
311 break;
312
313 // FIXME: Preload aggregates.
314 if (Arg.getType()->isAggregateType())
315 break;
316
317 Type *ArgTy = Arg.getType();
318 Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
319 uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
320 ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;
321
322 if (!PreloadInfo.canPreloadKernArgAtOffset(ExplicitArgOffset))
323 break;
324
325 Arg.addAttr(Attribute::InReg);
326 NumPreloadedExplicitArgs++;
327 if (NumPreloadsRequested > 0)
328 NumPreloadsRequested--;
329 }
330
331 // Only try preloading hidden arguments if we can successfully preload the
332 // last explicit argument.
333 if (NumPreloadedExplicitArgs == F.arg_size()) {
334 uint64_t ImplicitArgsBaseOffset =
335 alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
336 BaseOffset;
337 PreloadInfo.tryAllocHiddenArgPreloadSGPRs(ImplicitArgsBaseOffset,
338 FunctionsToErase);
339 }
340
341 Changed |= NumPreloadedExplicitArgs > 0;
342 }
343
344 Changed |= !FunctionsToErase.empty();
345 // Erase cloned functions if we needed to update the kernel signature to
346 // support preloading hidden kernel arguments.
347 for (auto *F : FunctionsToErase)
348 F->eraseFromParent();
349
350 return Changed;
351}
352
353bool AMDGPUPreloadKernelArgumentsLegacy::runOnModule(Module &M) {
354 if (skipModule(M) || !TM)
355 return false;
356
357 return markKernelArgsAsInreg(M, *TM);
358}
359
360PreservedAnalyses
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
aarch64 promote const
static bool markKernelArgsAsInreg(Module &M, const TargetMachine &TM)
static cl::opt< unsigned > KernargPreloadCount("amdgpu-kernarg-preload-count", cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0))
static cl::opt< bool > EnableKernargPreload("amdgpu-kernarg-preload", cl::desc("Enable preload kernel arguments to SGPRs"), cl::init(true))
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
#define DEBUG_TYPE
Module.h This file contains the declarations for the Module class.
This header defines various interfaces for pass management in LLVM.
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
Machine Check Debug Module
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition PassSupport.h:56
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
unsigned getArgNo() const
Return the index of this formal argument in its containing function.
Definition Argument.h:50
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition Function.h:166
void splice(Function::iterator ToIt, Function *FromF)
Transfer all blocks from FromF to this function at ToIt.
Definition Function.h:759
Argument * arg_iterator
Definition Function.h:72
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
iterator begin()
Definition Function.h:851
arg_iterator arg_begin()
Definition Function.h:866
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition Function.h:355
size_t arg_size() const
Definition Function.h:899
Argument * getArg(unsigned i) const
Definition Function.h:884
void copyAttributesFrom(const Function *Src)
copyAttributesFrom - copy all additional attributes (those not needed to create a Function) from the ...
Definition Function.cpp:856
LLVM_ABI void copyMetadata(const GlobalObject *Src, unsigned Offset)
Copy metadata from Src, adjusting offsets by Offset.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...
Definition Pass.h:255
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition Analysis.h:115
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Primary interface to the complete machine description for the target machine.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:390
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:396
const ParentTy * getParent() const
Definition ilist_node.h:34
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
initializer< Ty > init(const Ty &Val)
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:477
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
ModulePass * createAMDGPUPreloadKernelArgumentsLegacyPass(const TargetMachine *)
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:155
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1760
AnalysisManager< Module > ModuleAnalysisManager
Convenience typedef for the Module analysis manager.
Definition MIRParser.h:39
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39