doxygen/NVPTXLowerArgs_8cpp_source.html

//===-- NVPTXLowerArgs.cpp - Lower arguments ------------------------------===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

//

//

// Arguments to kernel and device functions are passed via param space,

// which imposes certain restrictions:

// http://docs.nvidia.com/cuda/parallel-thread-execution/#state-spaces

//

// Kernel parameters are read-only and accessible only via ld.param

// instruction, directly or via a pointer.

//

// Device function parameters are directly accessible via

// ld.param/st.param, but taking the address of one returns a pointer

// to a copy created in local space which *can't* be used with

// ld.param/st.param.

//

// Copying a byval struct into local memory in IR allows us to enforce

// the param space restrictions, gives the rest of IR a pointer w/o

// param space restrictions, and gives us an opportunity to eliminate

// the copy.

//

// Pointer arguments to kernel functions need more work to be lowered:

//

// 1. Convert non-byval pointer arguments of CUDA kernels to pointers in the

//    global address space. This allows later optimizations to emit

//    ld.global.*/st.global.* for accessing these pointer arguments. For

//    example,

//

//    define void @foo(float* %input) {

//      %v = load float, float* %input, align 4

//      ...

//    }

//

//    becomes

//

//    define void @foo(float* %input) {

//      %input2 = addrspacecast float* %input to float addrspace(1)*

//      %input3 = addrspacecast float addrspace(1)* %input2 to float*

//      %v = load float, float* %input3, align 4

//      ...

//    }

//

//    Later, NVPTXInferAddressSpaces will optimize it to

//

//    define void @foo(float* %input) {

//      %input2 = addrspacecast float* %input to float addrspace(1)*

//      %v = load float, float addrspace(1)* %input2, align 4

//      ...

//    }

//

// 2. Convert byval kernel parameters to pointers in the param address space

//    (so that NVPTX emits ld/st.param).  Convert pointers *within* a byval

//    kernel parameter to pointers in the global address space. This allows

//    NVPTX to emit ld/st.global.

//

//    struct S {

//      int *x;

//      int *y;

//    };

//    __global__ void foo(S s) {

//      int *b = s.y;

//      // use b

//    }

//

//    "b" points to the global address space. In the IR level,

//

//    define void @foo(ptr byval %input) {

//      %b_ptr = getelementptr {ptr, ptr}, ptr %input, i64 0, i32 1

//      %b = load ptr, ptr %b_ptr

//      ; use %b

//    }

//

//    becomes

//

//    define void @foo({i32*, i32*}* byval %input) {

//      %b_param = addrspacecat ptr %input to ptr addrspace(101)

//      %b_ptr = getelementptr {ptr, ptr}, ptr addrspace(101) %b_param, i64 0, i32 1

//      %b = load ptr, ptr addrspace(101) %b_ptr

//      %b_global = addrspacecast ptr %b to ptr addrspace(1)

//      ; use %b_generic

//    }

//

//    Create a local copy of kernel byval parameters used in a way that *might* mutate

//    the parameter, by storing it in an alloca. Mutations to "grid_constant" parameters

//    are undefined behaviour, and don't require local copies.

//

//    define void @foo(ptr byval(%struct.s) align 4 %input) {

//       store i32 42, ptr %input

//       ret void

//    }

//

//    becomes

//

//    define void @foo(ptr byval(%struct.s) align 4 %input) #1 {

//      %input1 = alloca %struct.s, align 4

//      %input2 = addrspacecast ptr %input to ptr addrspace(101)

//      %input3 = load %struct.s, ptr addrspace(101) %input2, align 4

//      store %struct.s %input3, ptr %input1, align 4

//      store i32 42, ptr %input1, align 4

//      ret void

//    }

//

//    If %input were passed to a device function, or written to memory,

//    conservatively assume that %input gets mutated, and create a local copy.

//

//    Convert param pointers to grid_constant byval kernel parameters that are

//    passed into calls (device functions, intrinsics, inline asm), or otherwise

//    "escape" (into stores/ptrtoints) to the generic address space, using the

//    `nvvm.ptr.param.to.gen` intrinsic, so that NVPTX emits cvta.param

//    (available for sm70+)

//

//    define void @foo(ptr byval(%struct.s) %input) {

//      ; %input is a grid_constant

//      %call = call i32 @escape(ptr %input)

//      ret void

//    }

//

//    becomes

//

//    define void @foo(ptr byval(%struct.s) %input) {

//      %input1 = addrspacecast ptr %input to ptr addrspace(101)

//      ; the following intrinsic converts pointer to generic. We don't use an addrspacecast

//      ; to prevent generic -> param -> generic from getting cancelled out

//      %input1.gen = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) %input1)

//      %call = call i32 @escape(ptr %input1.gen)

//      ret void

//    }

//

// TODO: merge this pass with NVPTXInferAddressSpaces so that other passes don't

// cancel the addrspacecast pair this pass emits.

//===----------------------------------------------------------------------===//


#include "MCTargetDesc/NVPTXBaseInfo.h"

#include "NVPTX.h"

#include "NVPTXTargetMachine.h"

#include "NVPTXUtilities.h"

#include "llvm/ADT/STLExtras.h"

#include "llvm/Analysis/PtrUseVisitor.h"

#include "llvm/Analysis/ValueTracking.h"

#include "llvm/CodeGen/TargetPassConfig.h"

#include "llvm/IR/Function.h"

#include "llvm/IR/IRBuilder.h"

#include "llvm/IR/Instructions.h"

#include "llvm/IR/IntrinsicInst.h"

#include "llvm/IR/IntrinsicsNVPTX.h"

#include "llvm/IR/Type.h"

#include "llvm/InitializePasses.h"

#include "llvm/Pass.h"

#include "llvm/Support/Debug.h"

#include "llvm/Support/ErrorHandling.h"

#include "llvm/Support/NVPTXAddrSpace.h"

#include <numeric>

#include <queue>


#define DEBUG_TYPE "nvptx-lower-args"


using namespace llvm;


namespace {

class NVPTXLowerArgsLegacyPass : public FunctionPass {

  bool runOnFunction(Function &F) override;


public:

  static char ID; // Pass identification, replacement for typeid

  NVPTXLowerArgsLegacyPass() : FunctionPass(ID) {}

  StringRef getPassName() const override {

    return "Lower pointer arguments of CUDA kernels";

  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {

    AU.addRequired<TargetPassConfig>();

  }

};

} // namespace


char NVPTXLowerArgsLegacyPass::ID = 1;


INITIALIZE_PASS_BEGIN(NVPTXLowerArgsLegacyPass, "nvptx-lower-args",

                      "Lower arguments (NVPTX)", false, false)

INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)

INITIALIZE_PASS_END(NVPTXLowerArgsLegacyPass, "nvptx-lower-args",

                    "Lower arguments (NVPTX)", false, false)


// =============================================================================

// If the function had a byval struct ptr arg, say foo(%struct.x* byval %d),

// and we can't guarantee that the only accesses are loads,

// then add the following instructions to the first basic block:

//

// %temp = alloca %struct.x, align 8

// %tempd = addrspacecast %struct.x* %d to %struct.x addrspace(101)*

// %tv = load %struct.x addrspace(101)* %tempd

// store %struct.x %tv, %struct.x* %temp, align 8

//

// The above code allocates some space in the stack and copies the incoming

// struct from param space to local space.

// Then replace all occurrences of %d by %temp.

//

// In case we know that all users are GEPs or Loads, replace them with the same

// ones in parameter AS, so we can access them using ld.param.

// =============================================================================


// For Loads, replaces the \p OldUse of the pointer with a Use of the same

// pointer in parameter AS.

// For "escapes" (to memory, a function call, or a ptrtoint), cast the OldUse to

// generic using cvta.param.


static void convertToParamAS(Use *OldUse, Value *Param, bool HasCvtaParam,

                             bool IsGridConstant) {

  Instruction *I = dyn_cast<Instruction>(OldUse->getUser());

  assert(I && "OldUse must be in an instruction");

  struct IP {

    Use *OldUse;

    Instruction *OldInstruction;

    Value *NewParam;

  };

  SmallVector<IP> ItemsToConvert = {{OldUse, I, Param}};

  SmallVector<Instruction *> InstructionsToDelete;


  auto CloneInstInParamAS = [HasCvtaParam,

                             IsGridConstant](const IP &I) -> Value * {

    if (auto *LI = dyn_cast<LoadInst>(I.OldInstruction)) {

      LI->setOperand(0, I.NewParam);

      return LI;

    }

    if (auto *GEP = dyn_cast<GetElementPtrInst>(I.OldInstruction)) {

      SmallVector<Value *, 4> Indices(GEP->indices());

      auto *NewGEP = GetElementPtrInst::Create(

          GEP->getSourceElementType(), I.NewParam, Indices, GEP->getName(),

          GEP->getIterator());

      NewGEP->setIsInBounds(GEP->isInBounds());

      return NewGEP;

    }

    if (auto *BC = dyn_cast<BitCastInst>(I.OldInstruction)) {

      auto *NewBCType = PointerType::get(BC->getContext(), ADDRESS_SPACE_PARAM);

      return BitCastInst::Create(BC->getOpcode(), I.NewParam, NewBCType,

                                 BC->getName(), BC->getIterator());

    }

    if (auto *ASC = dyn_cast<AddrSpaceCastInst>(I.OldInstruction)) {

      assert(ASC->getDestAddressSpace() == ADDRESS_SPACE_PARAM);

      (void)ASC;

      // Just pass through the argument, the old ASC is no longer needed.

      return I.NewParam;

    }

    if (auto *MI = dyn_cast<MemTransferInst>(I.OldInstruction)) {

      if (MI->getRawSource() == I.OldUse->get()) {

        // convert to memcpy/memmove from param space.

        IRBuilder<> Builder(I.OldInstruction);

        Intrinsic::ID ID = MI->getIntrinsicID();


        CallInst *B = Builder.CreateMemTransferInst(

            ID, MI->getRawDest(), MI->getDestAlign(), I.NewParam,

            MI->getSourceAlign(), MI->getLength(), MI->isVolatile());

        for (unsigned I : {0, 1})

          if (uint64_t Bytes = MI->getParamDereferenceableBytes(I))

            B->addDereferenceableParamAttr(I, Bytes);

        return B;

      }

      // We may be able to handle other cases if the argument is

      // __grid_constant__

    }


    if (HasCvtaParam) {

      auto GetParamAddrCastToGeneric =

          [](Value *Addr, Instruction *OriginalUser) -> Value * {

        IRBuilder<> IRB(OriginalUser);

        Type *GenTy = IRB.getPtrTy(ADDRESS_SPACE_GENERIC);

        return IRB.CreateAddrSpaceCast(Addr, GenTy, Addr->getName() + ".gen");

      };

      auto *ParamInGenericAS =

          GetParamAddrCastToGeneric(I.NewParam, I.OldInstruction);


      // phi/select could use generic arg pointers w/o __grid_constant__

      if (auto *PHI = dyn_cast<PHINode>(I.OldInstruction)) {

        for (auto [Idx, V] : enumerate(PHI->incoming_values())) {

          if (V.get() == I.OldUse->get())

            PHI->setIncomingValue(Idx, ParamInGenericAS);

        }

      }

      if (auto *SI = dyn_cast<SelectInst>(I.OldInstruction)) {

        if (SI->getTrueValue() == I.OldUse->get())

          SI->setTrueValue(ParamInGenericAS);

        if (SI->getFalseValue() == I.OldUse->get())

          SI->setFalseValue(ParamInGenericAS);

      }


      // Escapes or writes can only use generic param pointers if

      // __grid_constant__ is in effect.

      if (IsGridConstant) {

        if (auto *CI = dyn_cast<CallInst>(I.OldInstruction)) {

          I.OldUse->set(ParamInGenericAS);

          return CI;

        }

        if (auto *SI = dyn_cast<StoreInst>(I.OldInstruction)) {

          // byval address is being stored, cast it to generic

          if (SI->getValueOperand() == I.OldUse->get())

            SI->setOperand(0, ParamInGenericAS);

          return SI;

        }

        if (auto *PI = dyn_cast<PtrToIntInst>(I.OldInstruction)) {

          if (PI->getPointerOperand() == I.OldUse->get())

            PI->setOperand(0, ParamInGenericAS);

          return PI;

        }

        // TODO: iIf we allow stores, we should allow memcpy/memset to

        // parameter, too.

      }

    }


    llvm_unreachable("Unsupported instruction");

  };


  while (!ItemsToConvert.empty()) {

    IP I = ItemsToConvert.pop_back_val();

    Value *NewInst = CloneInstInParamAS(I);


    if (NewInst && NewInst != I.OldInstruction) {

      // We've created a new instruction. Queue users of the old instruction to

      // be converted and the instruction itself to be deleted. We can't delete

      // the old instruction yet, because it's still in use by a load somewhere.

      for (Use &U : I.OldInstruction->uses())

        ItemsToConvert.push_back({&U, cast<Instruction>(U.getUser()), NewInst});


      InstructionsToDelete.push_back(I.OldInstruction);

    }

  }


  // Now we know that all argument loads are using addresses in parameter space

  // and we can finally remove the old instructions in generic AS.  Instructions

  // scheduled for removal should be processed in reverse order so the ones

  // closest to the load are deleted first. Otherwise they may still be in use.

  // E.g if we have Value = Load(BitCast(GEP(arg))), InstructionsToDelete will

  // have {GEP,BitCast}. GEP can't be deleted first, because it's still used by

  // the BitCast.

  for (Instruction *I : llvm::reverse(InstructionsToDelete))

    I->eraseFromParent();

}


// Adjust alignment of arguments passed byval in .param address space. We can

// increase alignment of such arguments in a way that ensures that we can

// effectively vectorize their loads. We should also traverse all loads from

// byval pointer and adjust their alignment, if those were using known offset.

// Such alignment changes must be conformed with parameter store and load in

// NVPTXTargetLowering::LowerCall.


static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS,

                                    const NVPTXTargetLowering *TLI) {

  Function *Func = Arg->getParent();

  Type *StructType = Arg->getParamByValType();

  const DataLayout &DL = Func->getDataLayout();


  const Align NewArgAlign =

      TLI->getFunctionParamOptimizedAlign(Func, StructType, DL);

  const Align CurArgAlign = Arg->getParamAlign().valueOrOne();


  if (CurArgAlign >= NewArgAlign)

    return;


  LLVM_DEBUG(dbgs() << "Try to use alignment " << NewArgAlign.value()

                    << " instead of " << CurArgAlign.value() << " for " << *Arg

                    << '\n');


  auto NewAlignAttr =

      Attribute::getWithAlignment(Func->getContext(), NewArgAlign);

  Arg->removeAttr(Attribute::Alignment);

  Arg->addAttr(NewAlignAttr);


  struct Load {

    LoadInst *Inst;

    uint64_t Offset;

  };


  struct LoadContext {

    Value *InitialVal;

    uint64_t Offset;

  };


  SmallVector<Load> Loads;

  std::queue<LoadContext> Worklist;

  Worklist.push({ArgInParamAS, 0});


  while (!Worklist.empty()) {

    LoadContext Ctx = Worklist.front();

    Worklist.pop();


    for (User *CurUser : Ctx.InitialVal->users()) {

      if (auto *I = dyn_cast<LoadInst>(CurUser))

        Loads.push_back({I, Ctx.Offset});

      else if (isa<BitCastInst>(CurUser) || isa<AddrSpaceCastInst>(CurUser))

        Worklist.push({cast<Instruction>(CurUser), Ctx.Offset});

      else if (auto *I = dyn_cast<GetElementPtrInst>(CurUser)) {

        APInt OffsetAccumulated =

            APInt::getZero(DL.getIndexSizeInBits(ADDRESS_SPACE_PARAM));


        if (!I->accumulateConstantOffset(DL, OffsetAccumulated))

          continue;


        uint64_t OffsetLimit = -1;

        uint64_t Offset = OffsetAccumulated.getLimitedValue(OffsetLimit);

        assert(Offset != OffsetLimit && "Expect Offset less than UINT64_MAX");


        Worklist.push({I, Ctx.Offset + Offset});

      }

    }

  }


  for (Load &CurLoad : Loads) {

    Align NewLoadAlign(std::gcd(NewArgAlign.value(), CurLoad.Offset));

    Align CurLoadAlign = CurLoad.Inst->getAlign();

    CurLoad.Inst->setAlignment(std::max(NewLoadAlign, CurLoadAlign));

  }

}


// Create a call to the nvvm_internal_addrspace_wrap intrinsic and set the

// alignment of the return value based on the alignment of the argument.


static CallInst *createNVVMInternalAddrspaceWrap(IRBuilder<> &IRB,

                                                 Argument &Arg) {

  CallInst *ArgInParam =

      IRB.CreateIntrinsic(Intrinsic::nvvm_internal_addrspace_wrap,

                          {IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg.getType()},

                          &Arg, {}, Arg.getName() + ".param");


  if (MaybeAlign ParamAlign = Arg.getParamAlign())

    ArgInParam->addRetAttr(

        Attribute::getWithAlignment(ArgInParam->getContext(), *ParamAlign));


  return ArgInParam;

}


namespace {

struct ArgUseChecker : PtrUseVisitor<ArgUseChecker> {

  using Base = PtrUseVisitor<ArgUseChecker>;


  bool IsGridConstant;

  // Set of phi/select instructions using the Arg

  SmallPtrSet<Instruction *, 4> Conditionals;


  ArgUseChecker(const DataLayout &DL, bool IsGridConstant)

      : PtrUseVisitor(DL), IsGridConstant(IsGridConstant) {}


  PtrInfo visitArgPtr(Argument &A) {

    assert(A.getType()->isPointerTy());

    IntegerType *IntIdxTy = cast<IntegerType>(DL.getIndexType(A.getType()));

    IsOffsetKnown = false;

    Offset = APInt(IntIdxTy->getBitWidth(), 0);

    PI.reset();

    Conditionals.clear();


    LLVM_DEBUG(dbgs() << "Checking Argument " << A << "\n");

    // Enqueue the uses of this pointer.

    enqueueUsers(A);


    // Visit all the uses off the worklist until it is empty.

    // Note that unlike PtrUseVisitor we intentionally do not track offsets.

    // We're only interested in how we use the pointer.

    while (!(Worklist.empty() || PI.isAborted())) {

      UseToVisit ToVisit = Worklist.pop_back_val();

      U = ToVisit.UseAndIsOffsetKnown.getPointer();

      Instruction *I = cast<Instruction>(U->getUser());

      if (isa<PHINode>(I) || isa<SelectInst>(I))

        Conditionals.insert(I);

      LLVM_DEBUG(dbgs() << "Processing " << *I << "\n");

      Base::visit(I);

    }

    if (PI.isEscaped())

      LLVM_DEBUG(dbgs() << "Argument pointer escaped: " << *PI.getEscapingInst()

                        << "\n");

    else if (PI.isAborted())

      LLVM_DEBUG(dbgs() << "Pointer use needs a copy: " << *PI.getAbortingInst()

                        << "\n");

    LLVM_DEBUG(dbgs() << "Traversed " << Conditionals.size()

                      << " conditionals\n");

    return PI;

  }


  void visitStoreInst(StoreInst &SI) {

    // Storing the pointer escapes it.

    if (U->get() == SI.getValueOperand())

      return PI.setEscapedAndAborted(&SI);

    // Writes to the pointer are UB w/ __grid_constant__, but do not force a

    // copy.

    if (!IsGridConstant)

      return PI.setAborted(&SI);

  }


  void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {

    // ASC to param space are no-ops and do not need a copy

    if (ASC.getDestAddressSpace() != ADDRESS_SPACE_PARAM)

      return PI.setEscapedAndAborted(&ASC);

    Base::visitAddrSpaceCastInst(ASC);

  }


  void visitPtrToIntInst(PtrToIntInst &I) {

    if (IsGridConstant)

      return;

    Base::visitPtrToIntInst(I);

  }

  void visitPHINodeOrSelectInst(Instruction &I) {

    assert(isa<PHINode>(I) || isa<SelectInst>(I));

  }

  // PHI and select just pass through the pointers.

  void visitPHINode(PHINode &PN) { enqueueUsers(PN); }

  void visitSelectInst(SelectInst &SI) { enqueueUsers(SI); }


  void visitMemTransferInst(MemTransferInst &II) {

    if (*U == II.getRawDest() && !IsGridConstant)

      PI.setAborted(&II);

    // memcpy/memmove are OK when the pointer is source. We can convert them to

    // AS-specific memcpy.

  }


  void visitMemSetInst(MemSetInst &II) {

    if (!IsGridConstant)

      PI.setAborted(&II);

  }

}; // struct ArgUseChecker


void copyByValParam(Function &F, Argument &Arg) {

  LLVM_DEBUG(dbgs() << "Creating a local copy of " << Arg << "\n");

  // Otherwise we have to create a temporary copy.

  BasicBlock::iterator FirstInst = F.getEntryBlock().begin();

  Type *StructType = Arg.getParamByValType();

  const DataLayout &DL = F.getDataLayout();

  IRBuilder<> IRB(&*FirstInst);

  AllocaInst *AllocA = IRB.CreateAlloca(StructType, nullptr, Arg.getName());

  // Set the alignment to alignment of the byval parameter. This is because,

  // later load/stores assume that alignment, and we are going to replace

  // the use of the byval parameter with this alloca instruction.

  AllocA->setAlignment(

      Arg.getParamAlign().value_or(DL.getPrefTypeAlign(StructType)));

  Arg.replaceAllUsesWith(AllocA);


  CallInst *ArgInParam = createNVVMInternalAddrspaceWrap(IRB, Arg);


  // Be sure to propagate alignment to this load; LLVM doesn't know that NVPTX

  // addrspacecast preserves alignment.  Since params are constant, this load

  // is definitely not volatile.

  const auto ArgSize = *AllocA->getAllocationSize(DL);

  IRB.CreateMemCpy(AllocA, AllocA->getAlign(), ArgInParam, AllocA->getAlign(),

                   ArgSize);

}

} // namespace


static void handleByValParam(const NVPTXTargetMachine &TM, Argument *Arg) {

  Function *Func = Arg->getParent();

  assert(isKernelFunction(*Func));

  const bool HasCvtaParam = TM.getSubtargetImpl(*Func)->hasCvtaParam();

  const bool IsGridConstant = HasCvtaParam && isParamGridConstant(*Arg);

  const DataLayout &DL = Func->getDataLayout();

  BasicBlock::iterator FirstInst = Func->getEntryBlock().begin();

  [[maybe_unused]] Type *StructType = Arg->getParamByValType();

  assert(StructType && "Missing byval type");


  ArgUseChecker AUC(DL, IsGridConstant);

  ArgUseChecker::PtrInfo PI = AUC.visitArgPtr(*Arg);

  bool ArgUseIsReadOnly = !(PI.isEscaped() || PI.isAborted());

  // Easy case, accessing parameter directly is fine.

  if (ArgUseIsReadOnly && AUC.Conditionals.empty()) {

    // Convert all loads and intermediate operations to use parameter AS and

    // skip creation of a local copy of the argument.

    SmallVector<Use *, 16> UsesToUpdate(llvm::make_pointer_range(Arg->uses()));


    IRBuilder<> IRB(&*FirstInst);

    CallInst *ArgInParamAS = createNVVMInternalAddrspaceWrap(IRB, *Arg);


    for (Use *U : UsesToUpdate)

      convertToParamAS(U, ArgInParamAS, HasCvtaParam, IsGridConstant);

    LLVM_DEBUG(dbgs() << "No need to copy or cast " << *Arg << "\n");


    const auto *TLI =

        cast<NVPTXTargetLowering>(TM.getSubtargetImpl()->getTargetLowering());


    adjustByValArgAlignment(Arg, ArgInParamAS, TLI);


    return;

  }


  // We can't access byval arg directly and need a pointer. on sm_70+ we have

  // ability to take a pointer to the argument without making a local copy.

  // However, we're still not allowed to write to it. If the user specified

  // `__grid_constant__` for the argument, we'll consider escaped pointer as

  // read-only.

  if (IsGridConstant || (HasCvtaParam && ArgUseIsReadOnly)) {

    LLVM_DEBUG(dbgs() << "Using non-copy pointer to " << *Arg << "\n");

    // Replace all argument pointer uses (which might include a device function

    // call) with a cast to the generic address space using cvta.param

    // instruction, which avoids a local copy.

    IRBuilder<> IRB(&Func->getEntryBlock().front());


    // Cast argument to param address space. Because the backend will emit the

    // argument already in the param address space, we need to use the noop

    // intrinsic, this had the added benefit of preventing other optimizations

    // from folding away this pair of addrspacecasts.

    auto *ParamSpaceArg = createNVVMInternalAddrspaceWrap(IRB, *Arg);


    // Cast param address to generic address space.

    Value *GenericArg = IRB.CreateAddrSpaceCast(

        ParamSpaceArg, IRB.getPtrTy(ADDRESS_SPACE_GENERIC),

        Arg->getName() + ".gen");


    Arg->replaceAllUsesWith(GenericArg);


    // Do not replace Arg in the cast to param space

    ParamSpaceArg->setOperand(0, Arg);

  } else

    copyByValParam(*Func, *Arg);

}


static void markPointerAsAS(Value *Ptr, const unsigned AS) {

  if (Ptr->getType()->getPointerAddressSpace() != ADDRESS_SPACE_GENERIC)

    return;


  // Deciding where to emit the addrspacecast pair.

  BasicBlock::iterator InsertPt;

  if (Argument *Arg = dyn_cast<Argument>(Ptr)) {

    // Insert at the functon entry if Ptr is an argument.

    InsertPt = Arg->getParent()->getEntryBlock().begin();

  } else {

    // Insert right after Ptr if Ptr is an instruction.

    InsertPt = ++cast<Instruction>(Ptr)->getIterator();

    assert(InsertPt != InsertPt->getParent()->end() &&

           "We don't call this function with Ptr being a terminator.");

  }


  Instruction *PtrInGlobal = new AddrSpaceCastInst(

      Ptr, PointerType::get(Ptr->getContext(), AS), Ptr->getName(), InsertPt);

  Value *PtrInGeneric = new AddrSpaceCastInst(PtrInGlobal, Ptr->getType(),

                                              Ptr->getName(), InsertPt);

  // Replace with PtrInGeneric all uses of Ptr except PtrInGlobal.

  Ptr->replaceAllUsesWith(PtrInGeneric);

  PtrInGlobal->setOperand(0, Ptr);

}


static void markPointerAsGlobal(Value *Ptr) {

  markPointerAsAS(Ptr, ADDRESS_SPACE_GLOBAL);

}


// =============================================================================

// Main function for this pass.

// =============================================================================


static bool runOnKernelFunction(const NVPTXTargetMachine &TM, Function &F) {

  // Copying of byval aggregates + SROA may result in pointers being loaded as

  // integers, followed by intotoptr. We may want to mark those as global, too,

  // but only if the loaded integer is used exclusively for conversion to a

  // pointer with inttoptr.

  auto HandleIntToPtr = [](Value &V) {

    if (llvm::all_of(V.users(), [](User *U) { return isa<IntToPtrInst>(U); })) {

      SmallVector<User *, 16> UsersToUpdate(V.users());

      for (User *U : UsersToUpdate)

        markPointerAsGlobal(U);

    }

  };

  if (TM.getDrvInterface() == NVPTX::CUDA) {

    // Mark pointers in byval structs as global.

    for (auto &B : F) {

      for (auto &I : B) {

        if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {

          if (LI->getType()->isPointerTy() || LI->getType()->isIntegerTy()) {

            Value *UO = getUnderlyingObject(LI->getPointerOperand());

            if (Argument *Arg = dyn_cast<Argument>(UO)) {

              if (Arg->hasByValAttr()) {

                // LI is a load from a pointer within a byval kernel parameter.

                if (LI->getType()->isPointerTy())

                  markPointerAsGlobal(LI);

                else

                  HandleIntToPtr(*LI);

              }

            }

          }

        }

      }

    }

  }


  LLVM_DEBUG(dbgs() << "Lowering kernel args of " << F.getName() << "\n");

  for (Argument &Arg : F.args()) {

    if (Arg.getType()->isPointerTy() && Arg.hasByValAttr()) {

      handleByValParam(TM, &Arg);

    } else if (Arg.getType()->isIntegerTy() &&

               TM.getDrvInterface() == NVPTX::CUDA) {

      HandleIntToPtr(Arg);

    }

  }

  return true;

}


// Device functions only need to copy byval args into local memory.


static bool runOnDeviceFunction(const NVPTXTargetMachine &TM, Function &F) {

  LLVM_DEBUG(dbgs() << "Lowering function args of " << F.getName() << "\n");


  const auto *TLI =

      cast<NVPTXTargetLowering>(TM.getSubtargetImpl()->getTargetLowering());


  for (Argument &Arg : F.args())

    if (Arg.getType()->isPointerTy() && Arg.hasByValAttr())

      adjustByValArgAlignment(&Arg, &Arg, TLI);


  return true;

}


static bool processFunction(Function &F, NVPTXTargetMachine &TM) {

  return isKernelFunction(F) ? runOnKernelFunction(TM, F)

                             : runOnDeviceFunction(TM, F);

}


bool NVPTXLowerArgsLegacyPass::runOnFunction(Function &F) {

  auto &TM = getAnalysis<TargetPassConfig>().getTM<NVPTXTargetMachine>();

  return processFunction(F, TM);

}


FunctionPass *llvm::createNVPTXLowerArgsPass() {

  return new NVPTXLowerArgsLegacyPass();

}


static bool copyFunctionByValArgs(Function &F) {

  LLVM_DEBUG(dbgs() << "Creating a copy of byval args of " << F.getName()

                    << "\n");

  bool Changed = false;

  if (isKernelFunction(F)) {

    for (Argument &Arg : F.args())

      if (Arg.getType()->isPointerTy() && Arg.hasByValAttr() &&

          !isParamGridConstant(Arg)) {

        copyByValParam(F, Arg);

        Changed = true;

      }

  }

  return Changed;

}


PreservedAnalyses NVPTXCopyByValArgsPass::run(Function &F,

                                              FunctionAnalysisManager &AM) {

  return copyFunctionByValArgs(F) ? PreservedAnalyses::none()

                                  : PreservedAnalyses::all();

}


PreservedAnalyses NVPTXLowerArgsPass::run(Function &F,

                                          FunctionAnalysisManager &AM) {

  auto &NTM = static_cast<NVPTXTargetMachine &>(TM);

  bool Changed = processFunction(F, NTM);

  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();

}


assert
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")

PHI
Rewrite undef for PHI
Definition AMDGPURewriteUndefForPHI.cpp:98

DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition ARMSLSHardening.cpp:73

A
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")

B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")

runOnFunction
static bool runOnFunction(Function &F, bool PostInlining)
Definition EntryExitInstrumenter.cpp:103

GEP
Hexagon Common GEP
Definition HexagonCommonGEP.cpp:164

IRBuilder.h

MI
IRTranslator LLVM IR MI
Definition IRTranslator.cpp:110

Function.h

IntrinsicInst.h

Type.h

InitializePasses.h

Instructions.h

F
#define F(x, y, z)
Definition MD5.cpp:55

I
#define I(x, y, z)
Definition MD5.cpp:58

NVPTXAddrSpace.h
NVPTX address space definition.

NVPTXBaseInfo.h

runOnDeviceFunction
static bool runOnDeviceFunction(const NVPTXTargetMachine &TM, Function &F)
Definition NVPTXLowerArgs.cpp:689

createNVVMInternalAddrspaceWrap
static CallInst * createNVVMInternalAddrspaceWrap(IRBuilder<> &IRB, Argument &Arg)
Definition NVPTXLowerArgs.cpp:417

adjustByValArgAlignment
static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS, const NVPTXTargetLowering *TLI)
Definition NVPTXLowerArgs.cpp:347

copyFunctionByValArgs
static bool copyFunctionByValArgs(Function &F)
Definition NVPTXLowerArgs.cpp:715

markPointerAsAS
static void markPointerAsAS(Value *Ptr, const unsigned AS)
Definition NVPTXLowerArgs.cpp:610

convertToParamAS
nvptx lower Lower static false void convertToParamAS(Use *OldUse, Value *Param, bool HasCvtaParam, bool IsGridConstant)
Definition NVPTXLowerArgs.cpp:210

processFunction
static bool processFunction(Function &F, NVPTXTargetMachine &TM)
Definition NVPTXLowerArgs.cpp:702

runOnKernelFunction
static bool runOnKernelFunction(const NVPTXTargetMachine &TM, Function &F)
Definition NVPTXLowerArgs.cpp:642

markPointerAsGlobal
static void markPointerAsGlobal(Value *Ptr)
Definition NVPTXLowerArgs.cpp:635

handleByValParam
static void handleByValParam(const NVPTXTargetMachine &TM, Argument *Arg)
Definition NVPTXLowerArgs.cpp:545

NVPTXTargetMachine.h

NVPTXUtilities.h

NVPTX.h

II
uint64_t IntrinsicInst * II
Definition NVVMIntrRange.cpp:46

INITIALIZE_PASS_DEPENDENCY
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42

INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44

INITIALIZE_PASS_BEGIN
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39

Pass.h

PtrUseVisitor.h
This file provides a collection of visitors which walk the (instruction) uses of a pointer.

STLExtras.h
This file contains some templates that are useful if you are working with the STL at all.

Debug.h

LLVM_DEBUG
#define LLVM_DEBUG(...)
Definition Debug.h:114

Ptr
@ Ptr
Definition TargetLibraryInfo.cpp:77

TargetPassConfig.h
Target-Independent Code Generator Pass Configuration Options pass.

ValueTracking.h

llvm::APInt
Class for arbitrary precision integers.
Definition APInt.h:78

llvm::APInt::getLimitedValue
uint64_t getLimitedValue(uint64_t Limit=UINT64_MAX) const
If this value is smaller than the specified limit, return it, otherwise return the limit value.
Definition APInt.h:475

llvm::APInt::getZero
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:200

llvm::AddrSpaceCastInst
This class represents a conversion between pointers from one address space to another.
Definition Instructions.h:5029

llvm::AddrSpaceCastInst::getDestAddressSpace
unsigned getDestAddressSpace() const
Returns the address space of the result.
Definition Instructions.h:5076

llvm::AllocaInst
an instruction to allocate memory on the stack
Definition Instructions.h:65

llvm::AllocaInst::getAlign
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition Instructions.h:129

llvm::AllocaInst::getAllocationSize
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
Definition Instructions.cpp:65

llvm::AllocaInst::setAlignment
void setAlignment(Align Align)
Definition Instructions.h:133

llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition PassAnalysisSupport.h:48

llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition PassAnalysisSupport.h:76

llvm::Argument
This class represents an incoming formal argument to a Function.
Definition Argument.h:32

llvm::Argument::addAttr
LLVM_ABI void addAttr(Attribute::AttrKind Kind)
Definition Function.cpp:321

llvm::Argument::hasByValAttr
LLVM_ABI bool hasByValAttr() const
Return true if this argument has the byval attribute.
Definition Function.cpp:128

llvm::Argument::removeAttr
LLVM_ABI void removeAttr(Attribute::AttrKind Kind)
Remove attributes from an argument.
Definition Function.cpp:329

llvm::Argument::getParent
const Function * getParent() const
Definition Argument.h:44

llvm::Argument::getParamByValType
LLVM_ABI Type * getParamByValType() const
If this is a byval argument, return its type.
Definition Function.cpp:225

llvm::Argument::getParamAlign
LLVM_ABI MaybeAlign getParamAlign() const
If this is a byval or inalloca argument, return its alignment.
Definition Function.cpp:216

llvm::Attribute::getWithAlignment
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition Attributes.cpp:234

llvm::BasicBlock::begin
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:459

llvm::BasicBlock::iterator
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170

llvm::CallBase::addRetAttr
void addRetAttr(Attribute::AttrKind Kind)
Adds the attribute to the return value.
Definition InstrTypes.h:1489

llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition Instructions.h:1511

llvm::CastInst::Create
static LLVM_ABI CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
Definition Instructions.cpp:3044

llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63

llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314

llvm::Function
Definition Function.h:64

llvm::Function::getEntryBlock
const BasicBlock & getEntryBlock() const
Definition Function.h:807

llvm::GetElementPtrInst::Create
static GetElementPtrInst * Create(Type *PointeeType, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Definition Instructions.h:974

llvm::IRBuilderBase::CreateIntrinsic
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition IRBuilder.cpp:845

llvm::IRBuilderBase::getPtrTy
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition IRBuilder.h:605

llvm::IRBuilderBase::CreateAddrSpaceCast
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2212

llvm::IRBuilder
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788

llvm::InstVisitor::visit
void visit(Iterator Start, Iterator End)
Definition InstVisitor.h:87

llvm::Instruction
Definition Instruction.h:69

llvm::IntegerType::getBitWidth
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
Definition DerivedTypes.h:74

llvm::LoadInst
An instruction for reading from memory.
Definition Instructions.h:181

llvm::NVPTXTargetLowering
Definition NVPTXISelLowering.h:141

llvm::NVPTXTargetLowering::getFunctionParamOptimizedAlign
Align getFunctionParamOptimizedAlign(const Function *F, Type *ArgTy, const DataLayout &DL) const
getFunctionParamOptimizedAlign - since function arguments are passed via .param space,...
Definition NVPTXISelLowering.cpp:5094

llvm::NVPTXTargetMachine
NVPTXTargetMachine.
Definition NVPTXTargetMachine.h:25

llvm::PointerType::get
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.

llvm::PreservedAnalyses
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112

llvm::PreservedAnalyses::none
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition Analysis.h:115

llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118

llvm::PtrUseVisitor
A base class for visitors over the uses of a pointer value.
Definition PtrUseVisitor.h:209

llvm::PtrUseVisitor< ArgUseChecker >::visitAddrSpaceCastInst
void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC)
Definition PtrUseVisitor.h:264

llvm::PtrUseVisitor< ArgUseChecker >::visitPtrToIntInst
void visitPtrToIntInst(PtrToIntInst &I)
Definition PtrUseVisitor.h:268

llvm::SmallPtrSetImplBase::empty
bool empty() const
Definition SmallPtrSet.h:98

llvm::SmallVectorImpl::pop_back_val
T pop_back_val()
Definition SmallVector.h:673

llvm::SmallVectorTemplateBase::push_back
void push_back(const T &Elt)
Definition SmallVector.h:416

llvm::SmallVectorTemplateCommon::empty
bool empty() const
Definition SmallVector.h:82

llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition SmallVector.h:1196

llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55

llvm::StructType
Class to represent struct types.
Definition DerivedTypes.h:218

llvm::TargetPassConfig
Target-Independent Code Generator Pass Configuration Options.
Definition TargetPassConfig.h:84

llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45

llvm::Type::isPointerTy
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267

llvm::Type::isIntegerTy
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240

llvm::Use
A Use represents the edge between a Value definition and its users.
Definition Use.h:35

llvm::User
Definition User.h:44

llvm::User::setOperand
void setOperand(unsigned i, Value *Val)
Definition User.h:237

llvm::Value
LLVM Value Representation.
Definition Value.h:75

llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256

llvm::Value::replaceAllUsesWith
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546

llvm::Value::users
iterator_range< user_iterator > users()
Definition Value.h:426

llvm::Value::getContext
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1099

llvm::Value::uses
iterator_range< use_iterator > uses()
Definition Value.h:380

llvm::Value::getName
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322

uint64_t

Changed
Changed
Definition ObjCARCOpts.cpp:2369

ErrorHandling.h

llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition ErrorHandling.h:164

false
Definition MachinePipeliner.cpp:244

llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24

llvm::Intrinsic::ID
unsigned ID
Definition GenericSSAContext.h:28

llvm::M68k::MemAddrModeKind::U
@ U
Definition M68kBaseInfo.h:61

llvm::NVPTXAS::ADDRESS_SPACE_PARAM
@ ADDRESS_SPACE_PARAM
Definition NVPTXAddrSpace.h:30

llvm::NVPTXAS::ADDRESS_SPACE_GENERIC
@ ADDRESS_SPACE_GENERIC
Definition NVPTXAddrSpace.h:22

llvm::NVPTXAS::ADDRESS_SPACE_GLOBAL
@ ADDRESS_SPACE_GLOBAL
Definition NVPTXAddrSpace.h:23

llvm::NVPTX::CUDA
@ CUDA
Definition NVPTX.h:118

llvm::SIEncodingFamily::SI
@ SI
Definition SIDefines.h:36

llvm::SI
Definition SIInstrInfo.h:1754

llvm::SystemZISD::TM
@ TM
Definition SystemZISelLowering.h:66

llvm::sandboxir::Instruction
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition AddressRanges.h:18

llvm::Offset
@ Offset
Definition DWP.cpp:477

llvm::all_of
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725

llvm::enumerate
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472

llvm::dyn_cast
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643

llvm::createNVPTXLowerArgsPass
FunctionPass * createNVPTXLowerArgsPass()
Definition NVPTXLowerArgs.cpp:711

llvm::reverse
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406

llvm::dbgs
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207

llvm::isa
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547

llvm::isParamGridConstant
bool isParamGridConstant(const Argument &Arg)
Definition NVPTXUtilities.cpp:210

llvm::isKernelFunction
bool isKernelFunction(const Function &F)
Definition NVPTXUtilities.h:65

llvm::cast
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559

llvm::make_pointer_range
iterator_range< pointer_iterator< WrappedIteratorT > > make_pointer_range(RangeT &&Range)
Definition iterator.h:363

llvm::FunctionAnalysisManager
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
Definition PassManager.h:564

llvm::getUnderlyingObject
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
Definition ValueTracking.cpp:6683

llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39

llvm::Align::value
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77

llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106

llvm::MaybeAlign::valueOrOne
Align valueOrOne() const
For convenience, returns a valid alignment or 1 if undefined.
Definition Alignment.h:130

llvm::NVPTXCopyByValArgsPass::run
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
Definition NVPTXLowerArgs.cpp:730

llvm::NVPTXLowerArgsPass::run
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
Definition NVPTXLowerArgs.cpp:736