doxygen/AMDGPULowerKernelArguments_8cpp_source.html

//===-- AMDGPULowerKernelArguments.cpp ------------------------------------------===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

//

/// \file This pass replaces accesses to kernel arguments with loads from

/// offsets from the kernarg base pointer.

//

//===----------------------------------------------------------------------===//


#include "AMDGPU.h"

#include "GCNSubtarget.h"

#include "llvm/Analysis/ValueTracking.h"

#include "llvm/CodeGen/TargetPassConfig.h"

#include "llvm/IR/Attributes.h"

#include "llvm/IR/IRBuilder.h"

#include "llvm/IR/IntrinsicsAMDGPU.h"

#include "llvm/IR/MDBuilder.h"

#include "llvm/Target/TargetMachine.h"


#define DEBUG_TYPE "amdgpu-lower-kernel-arguments"


using namespace llvm;


namespace {


/// Legacy pass manager wrapper around lowerKernelArguments().
///
/// The pass obtains the TargetMachine through TargetPassConfig and hands each
/// function to the shared lowering routine.
class AMDGPULowerKernelArguments : public FunctionPass {
public:
  /// Pass identification; forwarded to the FunctionPass base constructor.
  static char ID;

  AMDGPULowerKernelArguments() : FunctionPass(ID) {}

  /// Lower the kernel arguments of \p F. Returns true if \p F was modified.
  bool runOnFunction(Function &F) override;

  /// Declares the TargetPassConfig dependency (the source of the
  /// TargetMachine) and marks every analysis as preserved.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetPassConfig>();
    AU.setPreservesAll();
  }
};


} // end anonymous namespace


/// Return the position in \p BB at which the kernarg loads should be inserted.
///
/// Starting at the block's first insertion point, every leading static alloca
/// is stepped over. The returned iterator refers to the first instruction that
/// is not a static alloca, or to BB.end() if there is none.
static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
  BasicBlock::iterator Pos = BB.getFirstInsertionPt();
  const BasicBlock::iterator End = BB.end();

  while (Pos != End) {
    const auto *Alloca = dyn_cast<AllocaInst>(&*Pos);

    // If this is a dynamic alloca, the value may depend on the loaded kernargs,
    // so loads will need to be inserted before it.
    const bool IsStaticAlloca = Alloca && Alloca->isStaticAlloca();
    if (!IsStaticAlloca)
      break;

    ++Pos;
  }

  return Pos;
}


/// Replace the uses of the arguments of kernel \p F with values read from the
/// kernarg segment.
///
/// For every argument that has uses and is not marked inreg, a byte offset
/// into the segment returned by llvm.amdgcn.kernarg.segment.ptr is computed
/// from the argument's ABI alignment and alloc size:
///  - byref arguments are replaced by an (address-space-cast) pointer to that
///    offset;
///  - non-aggregate arguments narrower than 32 bits are read by loading the
///    enclosing 4-byte-aligned i32 and shifting/truncating the relevant bits;
///  - 3-element vectors of at least 32 bits are loaded as 4-element vectors
///    and shuffled back down;
///  - everything else is loaded directly with its own type.
/// Argument attributes (noundef, range, nonnull, dereferenceable,
/// dereferenceable_or_null, align) are transferred to the load as metadata
/// where that is representable.
///
/// \param F  Function to process; anything that is not an AMDGPU_KERNEL with
///           at least one argument is left untouched.
/// \param TM Target machine used to look up the GCNSubtarget for \p F.
/// \returns true if \p F was modified, false if it was skipped (wrong calling
///          convention, no arguments, or an empty kernarg segment).
static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
  CallingConv::ID CC = F.getCallingConv();
  if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
    return false;

  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  LLVMContext &Ctx = F.getParent()->getContext();
  const DataLayout &DL = F.getDataLayout();
  BasicBlock &EntryBlock = *F.begin();
  IRBuilder<> Builder(&EntryBlock, getInsertPt(EntryBlock));

  const Align KernArgBaseAlign(16); // FIXME: Increase if necessary
  const uint64_t BaseOffset = ST.getExplicitKernelArgOffset();

  Align MaxAlign;
  // FIXME: Alignment is broken with explicit arg offset.
  const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
  if (TotalKernArgSize == 0)
    return false;

  CallInst *KernArgSegment =
      Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, {},
                              nullptr, F.getName() + ".kernarg.segment");
  KernArgSegment->addRetAttr(Attribute::NonNull);
  KernArgSegment->addRetAttr(
      Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));

  // The metadata builder only depends on the context, so one instance serves
  // every argument.
  MDBuilder MDB(Ctx);

  uint64_t ExplicitArgOffset = 0;
  for (Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
    Align ABITypeAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);

    uint64_t Size = DL.getTypeSizeInBits(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);

    // The running offset must advance for every argument, including the ones
    // skipped below, so that later arguments land at the right position.
    uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;

    // Skip inreg arguments which should be preloaded.
    if (Arg.use_empty() || Arg.hasInRegAttr())
      continue;

    // If this is byref, the loads are already explicit in the function. We just
    // need to rewrite the pointer values.
    if (IsByRef) {
      Value *ArgOffsetPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, EltOffset,
          Arg.getName() + ".byval.kernarg.offset");

      Value *CastOffsetPtr =
          Builder.CreateAddrSpaceCast(ArgOffsetPtr, Arg.getType());
      Arg.replaceAllUsesWith(CastOffsetPtr);
      continue;
    }

    if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
      // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
      // modes on SI to know the high bits are 0 so pointer adds don't wrap. We
      // can't represent this with range metadata because it's only allowed for
      // integer types.
      if ((PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
           PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) &&
          !ST.hasUsableDSOffset())
        continue;

      // FIXME: We can replace this with equivalent alias.scope/noalias
      // metadata, but this appears to be a lot of work.
      if (Arg.hasNoAliasAttr())
        continue;
    }

    auto *VT = dyn_cast<FixedVectorType>(ArgTy);
    bool IsV3 = VT && VT->getNumElements() == 3;
    bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType();

    VectorType *V4Ty = nullptr;

    int64_t AlignDownOffset = alignDown(EltOffset, 4);
    int64_t OffsetDiff = EltOffset - AlignDownOffset;
    Align AdjustedAlign = commonAlignment(
        KernArgBaseAlign, DoShiftOpt ? AlignDownOffset : EltOffset);

    Value *ArgPtr;
    Type *AdjustedArgTy;
    if (DoShiftOpt) { // FIXME: Handle aggregate types
      // Since we don't have sub-dword scalar loads, avoid doing an extload by
      // loading earlier than the argument address, and extracting the relevant
      // bits.
      // TODO: Update this for GFX12 which does have scalar sub-dword loads.
      //
      // Additionally widen any sub-dword load to i32 even if suitably aligned,
      // so that CSE between different argument loads works easily.
      ArgPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, AlignDownOffset,
          Arg.getName() + ".kernarg.offset.align.down");
      AdjustedArgTy = Builder.getInt32Ty();
    } else {
      ArgPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, EltOffset,
          Arg.getName() + ".kernarg.offset");
      AdjustedArgTy = ArgTy;
    }

    if (IsV3 && Size >= 32) {
      V4Ty = FixedVectorType::get(VT->getElementType(), 4);
      // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
      AdjustedArgTy = V4Ty;
    }

    LoadInst *Load =
        Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign);
    Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));

    // NOTE(review): on the widened (DoShiftOpt) path this also asserts noundef
    // for the neighbouring bytes covered by the i32 load - confirm that is
    // intended.
    if (Arg.hasAttribute(Attribute::NoUndef))
      Load->setMetadata(LLVMContext::MD_noundef, MDNode::get(Ctx, {}));

    // The operands of !range must have the same integer type as the loaded
    // value. On the sub-dword path the load is an i32 that also contains bytes
    // outside the argument, so the argument's (narrower) range neither
    // type-checks nor describes the loaded value. Only transfer the range when
    // the load is not widened.
    if (!DoShiftOpt && Arg.hasAttribute(Attribute::Range)) {
      const ConstantRange &Range =
          Arg.getAttribute(Attribute::Range).getValueAsConstantRange();
      Load->setMetadata(LLVMContext::MD_range,
                        MDB.createRange(Range.getLower(), Range.getUpper()));
    }

    if (isa<PointerType>(ArgTy)) {
      if (Arg.hasNonNullAttr())
        Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {}));

      uint64_t DerefBytes = Arg.getDereferenceableBytes();
      if (DerefBytes != 0) {
        Load->setMetadata(
          LLVMContext::MD_dereferenceable,
          MDNode::get(Ctx,
                      MDB.createConstant(
                        ConstantInt::get(Builder.getInt64Ty(), DerefBytes))));
      }

      uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
      if (DerefOrNullBytes != 0) {
        Load->setMetadata(
          LLVMContext::MD_dereferenceable_or_null,
          MDNode::get(Ctx,
                      MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
                                                          DerefOrNullBytes))));
      }

      if (MaybeAlign ParamAlign = Arg.getParamAlign()) {
        Load->setMetadata(
            LLVMContext::MD_align,
            MDNode::get(Ctx, MDB.createConstant(ConstantInt::get(
                                 Builder.getInt64Ty(), ParamAlign->value()))));
      }
    }

    // TODO: Convert noalias arg to !noalias

    if (DoShiftOpt) {
      // Move the argument's bytes down to bit 0 of the loaded dword, then
      // narrow to the argument's bit width and reinterpret as its real type.
      Value *ExtractBits = OffsetDiff == 0 ?
        Load : Builder.CreateLShr(Load, OffsetDiff * 8);

      IntegerType *ArgIntTy = Builder.getIntNTy(Size);
      Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy);
      Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy,
                                            Arg.getName() + ".load");
      Arg.replaceAllUsesWith(NewVal);
    } else if (IsV3) {
      // Drop the extra fourth lane that was loaded alongside the v3 value.
      Value *Shuf = Builder.CreateShuffleVector(Load, ArrayRef<int>{0, 1, 2},
                                                Arg.getName() + ".load");
      Arg.replaceAllUsesWith(Shuf);
    } else {
      Load->setName(Arg.getName() + ".load");
      Arg.replaceAllUsesWith(Load);
    }
  }

  KernArgSegment->addRetAttr(
      Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));

  return true;
}


/// Legacy pass manager entry point: fetch the TargetMachine from the required
/// TargetPassConfig analysis and run the shared lowering on \p F.
/// Returns whatever lowerKernelArguments() reports.
bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
  const TargetMachine &Machine =
      getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
  return lowerKernelArguments(F, Machine);
}


// Legacy pass registration. The pass is registered under the DEBUG_TYPE
// argument string with the display name "AMDGPU Lower Kernel Arguments"; the
// trailing two flags are the macro's `cfg` and `analysis` parameters, both
// false here. No dependencies are listed between BEGIN and END.
INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE,

                      "AMDGPU Lower Kernel Arguments", false, false)

INITIALIZE_PASS_END(AMDGPULowerKernelArguments, DEBUG_TYPE, "AMDGPU Lower Kernel Arguments",

                    false, false)


// Storage for the pass ID that the constructor hands to FunctionPass.
char AMDGPULowerKernelArguments::ID = 0;


/// Factory for the legacy-pass-manager version of this pass. The returned
/// object is heap-allocated with `new`; the caller receives the raw pointer.
FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() {
  FunctionPass *NewPass = new AMDGPULowerKernelArguments();
  return NewPass;
}


/// New pass manager entry point. Runs the shared lowering on \p F using the
/// pass's TM member; \p AM is not consulted.
///
/// \returns PreservedAnalyses::all() when nothing changed, otherwise a set that
///          preserves only the CFG analyses.
PreservedAnalyses
AMDGPULowerKernelArgumentsPass::run(Function &F, FunctionAnalysisManager &AM) {
  if (!lowerKernelArguments(F, TM))
    return PreservedAnalyses::all();

  // TODO: Preserves a lot more.
  PreservedAnalyses Preserved;
  Preserved.preserveSet<CFGAnalyses>();
  return Preserved;
}


getInsertPt
static BasicBlock::iterator getInsertPt(BasicBlock &BB)
Definition AMDGPULowerKernelArguments.cpp:47

lowerKernelArguments
static bool lowerKernelArguments(Function &F, const TargetMachine &TM)
Definition AMDGPULowerKernelArguments.cpp:61

AMDGPU.h

DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition ARMSLSHardening.cpp:73

Attributes.h
This file contains the simple types necessary to represent the attributes associated with functions a...

E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")

runOnFunction
static bool runOnFunction(Function &F, bool PostInlining)
Definition EntryExitInstrumenter.cpp:103

GCNSubtarget.h
AMD GCN specific subclass of TargetSubtarget.

DEBUG_TYPE
#define DEBUG_TYPE
Definition GenericCycleImpl.h:31

IRBuilder.h

InlinePriorityMode::Size
@ Size
Definition InlineOrder.cpp:25

F
#define F(x, y, z)
Definition MD5.cpp:55

MDBuilder.h

Range
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))

INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44

INITIALIZE_PASS_BEGIN
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39

TargetPassConfig.h
Target-Independent Code Generator Pass Configuration Options pass.

ValueTracking.h

PointerType
Definition ItaniumDemangle.h:639

VectorType
Definition ItaniumDemangle.h:1189

llvm::AMDGPULowerKernelArgumentsPass::run
PreservedAnalyses run(Function &, FunctionAnalysisManager &)
Definition AMDGPULowerKernelArguments.cpp:263

llvm::AllocaInst
an instruction to allocate memory on the stack
Definition Instructions.h:65

llvm::AllocaInst::isStaticAlloca
LLVM_ABI bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
Definition Instructions.cpp:1303

llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition PassAnalysisSupport.h:48

llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition PassAnalysisSupport.h:76

llvm::AnalysisUsage::setPreservesAll
void setPreservesAll()
Set by analyses that do not transform their input at all.
Definition PassAnalysisSupport.h:131

llvm::Argument
This class represents an incoming formal argument to a Function.
Definition Argument.h:32

llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41

llvm::Attribute::getWithDereferenceableBytes
static LLVM_ABI Attribute getWithDereferenceableBytes(LLVMContext &Context, uint64_t Bytes)
Definition Attributes.cpp:244

llvm::Attribute::getWithAlignment
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition Attributes.cpp:234

llvm::BasicBlock
LLVM Basic Block Representation.
Definition BasicBlock.h:62

llvm::BasicBlock::end
iterator end()
Definition BasicBlock.h:472

llvm::BasicBlock::getFirstInsertionPt
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
Definition BasicBlock.cpp:393

llvm::BasicBlock::iterator
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170

llvm::CFGAnalyses
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73

llvm::CallBase::addRetAttr
void addRetAttr(Attribute::AttrKind Kind)
Adds the attribute to the return value.
Definition InstrTypes.h:1489

llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition Instructions.h:1511

llvm::ConstantRange
This class represents a range of values.
Definition ConstantRange.h:47

llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63

llvm::FixedVectorType::get
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803

llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314

llvm::Function
Definition Function.h:64

llvm::GCNSubtarget
Definition GCNSubtarget.h:34

llvm::IRBuilder
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2783

llvm::IntegerType
Class to represent integer types.
Definition DerivedTypes.h:42

llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68

llvm::LoadInst
An instruction for reading from memory.
Definition Instructions.h:181

llvm::MDBuilder
Definition MDBuilder.h:37

llvm::MDBuilder::createConstant
LLVM_ABI ConstantAsMetadata * createConstant(Constant *C)
Return the given constant as metadata.
Definition MDBuilder.cpp:25

llvm::MDBuilder::createRange
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition MDBuilder.cpp:96

llvm::MDNode::get
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1569

llvm::PreservedAnalyses
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112

llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118

llvm::PreservedAnalyses::preserveSet
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151

llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition TargetMachine.h:83

llvm::TargetPassConfig
Target-Independent Code Generator Pass Configuration Options.
Definition TargetPassConfig.h:84

llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45

llvm::Type::getInt32Ty
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297

llvm::Type::isAggregateType
bool isAggregateType() const
Return true if the type is an aggregate type.
Definition Type.h:304

llvm::Type::getIntNTy
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301

llvm::Value
LLVM Value Representation.
Definition Value.h:75

llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256

llvm::Value::replaceAllUsesWith
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546

uint64_t

Changed
Changed
Definition ObjCARCOpts.cpp:2370

TargetMachine.h

false
Definition MachinePipeliner.cpp:247

llvm::AMDGPUAS::REGION_ADDRESS
@ REGION_ADDRESS
Address space for region memory. (GDS)
Definition AMDGPUAddrSpace.h:32

llvm::AMDGPUAS::LOCAL_ADDRESS
@ LOCAL_ADDRESS
Address space for local memory.
Definition AMDGPUAddrSpace.h:34

llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24

llvm::CallingConv::AMDGPU_KERNEL
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition CallingConv.h:200

llvm::SystemZISD::TM
@ TM
Definition SystemZISelLowering.h:66

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition AddressRanges.h:18

llvm::dyn_cast
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:644

llvm::alignDown
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:557

llvm::createAMDGPULowerKernelArgumentsPass
FunctionPass * createAMDGPULowerKernelArgumentsPass()
Definition AMDGPULowerKernelArguments.cpp:258

llvm::isa
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548

llvm::alignTo
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144

llvm::commonAlignment
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201

llvm::FunctionAnalysisManager
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
Definition PassManager.h:564

llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39

llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106