doxygen/AMDGPUTargetTransformInfo_8cpp_source.html

//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

//

// \file

// This file implements a TargetTransformInfo analysis pass specific to the

// AMDGPU target machine. It uses the target's detailed information to provide

// more precise answers to certain TTI queries, while letting the target

// independent and default TTI implementations handle the rest.

//

//===----------------------------------------------------------------------===//


#include "AMDGPUTargetTransformInfo.h"

#include "AMDGPUTargetMachine.h"

#include "MCTargetDesc/AMDGPUMCTargetDesc.h"

#include "SIModeRegisterDefaults.h"

#include "llvm/Analysis/InlineCost.h"

#include "llvm/Analysis/LoopInfo.h"

#include "llvm/Analysis/ValueTracking.h"

#include "llvm/CodeGen/Analysis.h"

#include "llvm/IR/IRBuilder.h"

#include "llvm/IR/IntrinsicsAMDGPU.h"

#include "llvm/IR/PatternMatch.h"

#include "llvm/Support/KnownBits.h"

#include <optional>


using namespace llvm;


#define DEBUG_TYPE "AMDGPUtti"


static cl::opt<unsigned> UnrollThresholdPrivate(

  "amdgpu-unroll-threshold-private",

  cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),

  cl::init(2700), cl::Hidden);


static cl::opt<unsigned> UnrollThresholdLocal(

  "amdgpu-unroll-threshold-local",

  cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),

  cl::init(1000), cl::Hidden);


static cl::opt<unsigned> UnrollThresholdIf(

  "amdgpu-unroll-threshold-if",

  cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),

  cl::init(200), cl::Hidden);


static cl::opt<bool> UnrollRuntimeLocal(

  "amdgpu-unroll-runtime-local",

  cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),

  cl::init(true), cl::Hidden);


static cl::opt<unsigned> UnrollMaxBlockToAnalyze(

    "amdgpu-unroll-max-block-to-analyze",

    cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),

    cl::init(32), cl::Hidden);


static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",

                                       cl::Hidden, cl::init(4000),

                                       cl::desc("Cost of alloca argument"));


// If the amount of scratch memory to eliminate exceeds our ability to allocate

// it into registers we gain nothing by aggressively inlining functions for that

// heuristic.

static cl::opt<unsigned>

    ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,

                    cl::init(256),

                    cl::desc("Maximum alloca size to use for inline cost"));


// Inliner constraint to achieve reasonable compilation time.

static cl::opt<size_t> InlineMaxBB(

    "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),

    cl::desc("Maximum number of BBs allowed in a function after inlining"

             " (compile time constraint)"));


static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,

                              unsigned Depth = 0) {

  const Instruction *I = dyn_cast<Instruction>(Cond);

  if (!I)

    return false;


  for (const Value *V : I->operand_values()) {

    if (!L->contains(I))

      continue;

    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {

      if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {

                  return SubLoop->contains(PHI); }))

        return true;

    } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))

      return true;

  }

  return false;

}


AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)

    : BaseT(TM, F.getDataLayout()),

      TargetTriple(TM->getTargetTriple()),

      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),

      TLI(ST->getTargetLowering()) {}


void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,

                                            TTI::UnrollingPreferences &UP,

                                            OptimizationRemarkEmitter *ORE) {

  const Function &F = *L->getHeader()->getParent();

  UP.Threshold =

      F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);

  UP.MaxCount = std::numeric_limits<unsigned>::max();

  UP.Partial = true;


  // Conditional branch in a loop back edge needs 3 additional exec

  // manipulations in average.

  UP.BEInsns += 3;


  // We want to run unroll even for the loops which have been vectorized.

  UP.UnrollVectorizedLoop = true;


  // TODO: Do we want runtime unrolling?


  // Maximum alloca size than can fit registers. Reserve 16 registers.

  const unsigned MaxAlloca = (256 - 16) * 4;

  unsigned ThresholdPrivate = UnrollThresholdPrivate;

  unsigned ThresholdLocal = UnrollThresholdLocal;


  // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the

  // provided threshold value as the default for Threshold

  if (MDNode *LoopUnrollThreshold =

          findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {

    if (LoopUnrollThreshold->getNumOperands() == 2) {

      ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(

          LoopUnrollThreshold->getOperand(1));

      if (MetaThresholdValue) {

        // We will also use the supplied value for PartialThreshold for now.

        // We may introduce additional metadata if it becomes necessary in the

        // future.

        UP.Threshold = MetaThresholdValue->getSExtValue();

        UP.PartialThreshold = UP.Threshold;

        ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);

        ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);

      }

    }

  }


  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);

  for (const BasicBlock *BB : L->getBlocks()) {

    const DataLayout &DL = BB->getDataLayout();

    unsigned LocalGEPsSeen = 0;


    if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {

               return SubLoop->contains(BB); }))

        continue; // Block belongs to an inner loop.


    for (const Instruction &I : *BB) {

      // Unroll a loop which contains an "if" statement whose condition

      // defined by a PHI belonging to the loop. This may help to eliminate

      // if region and potentially even PHI itself, saving on both divergence

      // and registers used for the PHI.

      // Add a small bonus for each of such "if" statements.

      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {

        if (UP.Threshold < MaxBoost && Br->isConditional()) {

          BasicBlock *Succ0 = Br->getSuccessor(0);

          BasicBlock *Succ1 = Br->getSuccessor(1);

          if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||

              (L->contains(Succ1) && L->isLoopExiting(Succ1)))

            continue;

          if (dependsOnLocalPhi(L, Br->getCondition())) {

            UP.Threshold += UnrollThresholdIf;

            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold

                              << " for loop:\n"

                              << *L << " due to " << *Br << '\n');

            if (UP.Threshold >= MaxBoost)

              return;

          }

        }

        continue;

      }


      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);

      if (!GEP)

        continue;


      unsigned AS = GEP->getAddressSpace();

      unsigned Threshold = 0;

      if (AS == AMDGPUAS::PRIVATE_ADDRESS)

        Threshold = ThresholdPrivate;

      else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)

        Threshold = ThresholdLocal;

      else

        continue;


      if (UP.Threshold >= Threshold)

        continue;


      if (AS == AMDGPUAS::PRIVATE_ADDRESS) {

        const Value *Ptr = GEP->getPointerOperand();

        const AllocaInst *Alloca =

            dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));

        if (!Alloca || !Alloca->isStaticAlloca())

          continue;

        Type *Ty = Alloca->getAllocatedType();

        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;

        if (AllocaSize > MaxAlloca)

          continue;

      } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||

                 AS == AMDGPUAS::REGION_ADDRESS) {

        LocalGEPsSeen++;

        // Inhibit unroll for local memory if we have seen addressing not to

        // a variable, most likely we will be unable to combine it.

        // Do not unroll too deep inner loops for local memory to give a chance

        // to unroll an outer loop for a more important reason.

        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||

            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&

             !isa<Argument>(GEP->getPointerOperand())))

          continue;

        LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"

                          << *L << " due to LDS use.\n");

        UP.Runtime = UnrollRuntimeLocal;

      }


      // Check if GEP depends on a value defined by this loop itself.

      bool HasLoopDef = false;

      for (const Value *Op : GEP->operands()) {

        const Instruction *Inst = dyn_cast<Instruction>(Op);

        if (!Inst || L->isLoopInvariant(Op))

          continue;


        if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {

             return SubLoop->contains(Inst); }))

          continue;

        HasLoopDef = true;

        break;

      }

      if (!HasLoopDef)

        continue;


      // We want to do whatever we can to limit the number of alloca

      // instructions that make it through to the code generator.  allocas

      // require us to use indirect addressing, which is slow and prone to

      // compiler bugs.  If this loop does an address calculation on an

      // alloca ptr, then we want to use a higher than normal loop unroll

      // threshold. This will give SROA a better chance to eliminate these

      // allocas.

      //

      // We also want to have more unrolling for local memory to let ds

      // instructions with different offsets combine.

      //

      // Don't use the maximum allowed value here as it will make some

      // programs way too big.

      UP.Threshold = Threshold;

      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold

                        << " for loop:\n"

                        << *L << " due to " << *GEP << '\n');

      if (UP.Threshold >= MaxBoost)

        return;

    }


    // If we got a GEP in a small BB from inner loop then increase max trip

    // count to analyze for better estimation cost in unroll

    if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)

      UP.MaxIterationsCountToAnalyze = 32;

  }

}


void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,

                                          TTI::PeelingPreferences &PP) {

  BaseT::getPeelingPreferences(L, SE, PP);

}


int64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {

  return 1024;

}


const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {

    // Codegen control options which don't matter.

    AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,

    AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,

    AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,

    AMDGPU::FeatureUnalignedAccessMode,


    AMDGPU::FeatureAutoWaitcntBeforeBarrier,


    // Property of the kernel/environment which can't actually differ.

    AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,

    AMDGPU::FeatureTrapHandler,


    // The default assumption needs to be ecc is enabled, but no directly

    // exposed operations depend on it, so it can be safely inlined.

    AMDGPU::FeatureSRAMECC,


    // Perf-tuning features

    AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};


GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)

    : BaseT(TM, F.getDataLayout()),

      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),

      TLI(ST->getTargetLowering()), CommonTTI(TM, F),

      IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {

  SIModeRegisterDefaults Mode(F, *ST);

  HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();

  HasFP64FP16Denormals =

      Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();

}


bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {

  return !F || !ST->isSingleLaneExecution(*F);

}


unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {

  // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector

  // registers. See getRegisterClassForType for the implementation.

  // In this case vector registers are not vector in terms of

  // VGPRs, but those which can hold multiple values.


  // This is really the number of registers to fill when vectorizing /

  // interleaving loops, so we lie to avoid trying to use all registers.

  return 4;

}


TypeSize

GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {

  switch (K) {

  case TargetTransformInfo::RGK_Scalar:

    return TypeSize::getFixed(32);

  case TargetTransformInfo::RGK_FixedWidthVector:

    return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);

  case TargetTransformInfo::RGK_ScalableVector:

    return TypeSize::getScalable(0);

  }

  llvm_unreachable("Unsupported register kind");

}


unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {

  return 32;

}


unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {

  if (Opcode == Instruction::Load || Opcode == Instruction::Store)

    return 32 * 4 / ElemWidth;

  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2

       : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2

       : 1;

}


unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,

                                         unsigned ChainSizeInBytes,

                                         VectorType *VecTy) const {

  unsigned VecRegBitWidth = VF * LoadSize;

  if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)

    // TODO: Support element-size less than 32bit?

    return 128 / LoadSize;


  return VF;

}


unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,

                                             unsigned ChainSizeInBytes,

                                             VectorType *VecTy) const {

  unsigned VecRegBitWidth = VF * StoreSize;

  if (VecRegBitWidth > 128)

    return 128 / StoreSize;


  return VF;

}


unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {

  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||

      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||

      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||

      AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||

      AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||

      AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {

    return 512;

  }


  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)

    return 8 * ST->getMaxPrivateElementSize();


  // Common to flat, global, local and region. Assume for unknown addrspace.

  return 128;

}


bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,

                                            Align Alignment,

                                            unsigned AddrSpace) const {

  // We allow vectorization of flat stores, even though we may need to decompose

  // them later if they may access private memory. We don't have enough context

  // here, and legalization can handle it.

  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {

    return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&

      ChainSizeInBytes <= ST->getMaxPrivateElementSize();

  }

  return true;

}


bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,

                                             Align Alignment,

                                             unsigned AddrSpace) const {

  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);

}


bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,

                                              Align Alignment,

                                              unsigned AddrSpace) const {

  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);

}


int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {

  return 1024;

}


// FIXME: Really we would like to issue multiple 128-bit loads and stores per

// iteration. Should we report a larger size and let it legalize?

//

// FIXME: Should we use narrower types for local/region, or account for when

// unaligned access is legal?

//

// FIXME: This could use fine tuning and microbenchmarks.

Type *GCNTTIImpl::getMemcpyLoopLoweringType(

    LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,

    unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,

    std::optional<uint32_t> AtomicElementSize) const {


  if (AtomicElementSize)

    return Type::getIntNTy(Context, *AtomicElementSize * 8);


  unsigned MinAlign = std::min(SrcAlign, DestAlign);


  // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the

  // hardware into byte accesses. If you assume all alignments are equally

  // probable, it's more efficient on average to use short accesses for this

  // case.

  if (MinAlign == 2)

    return Type::getInt16Ty(Context);


  // Not all subtargets have 128-bit DS instructions, and we currently don't

  // form them by default.

  if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||

      SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||

      DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||

      DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {

    return FixedVectorType::get(Type::getInt32Ty(Context), 2);

  }


  // Global memory works best with 16-byte accesses. Private memory will also

  // hit this, although they'll be decomposed.

  return FixedVectorType::get(Type::getInt32Ty(Context), 4);

}


void GCNTTIImpl::getMemcpyLoopResidualLoweringType(

    SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,

    unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,

    unsigned SrcAlign, unsigned DestAlign,

    std::optional<uint32_t> AtomicCpySize) const {

  assert(RemainingBytes < 16);


  if (AtomicCpySize)

    BaseT::getMemcpyLoopResidualLoweringType(

        OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,

        DestAlign, AtomicCpySize);


  unsigned MinAlign = std::min(SrcAlign, DestAlign);


  if (MinAlign != 2) {

    Type *I64Ty = Type::getInt64Ty(Context);

    while (RemainingBytes >= 8) {

      OpsOut.push_back(I64Ty);

      RemainingBytes -= 8;

    }


    Type *I32Ty = Type::getInt32Ty(Context);

    while (RemainingBytes >= 4) {

      OpsOut.push_back(I32Ty);

      RemainingBytes -= 4;

    }

  }


  Type *I16Ty = Type::getInt16Ty(Context);

  while (RemainingBytes >= 2) {

    OpsOut.push_back(I16Ty);

    RemainingBytes -= 2;

  }


  Type *I8Ty = Type::getInt8Ty(Context);

  while (RemainingBytes) {

    OpsOut.push_back(I8Ty);

    --RemainingBytes;

  }

}


unsigned GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF) {

  // Disable unrolling if the loop is not vectorized.

  // TODO: Enable this again.

  if (VF.isScalar())

    return 1;


  return 8;

}


bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,

                                       MemIntrinsicInfo &Info) const {

  switch (Inst->getIntrinsicID()) {

  case Intrinsic::amdgcn_ds_ordered_add:

  case Intrinsic::amdgcn_ds_ordered_swap: {

    auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));

    auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));

    if (!Ordering || !Volatile)

      return false; // Invalid.


    unsigned OrderingVal = Ordering->getZExtValue();

    if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))

      return false;


    Info.PtrVal = Inst->getArgOperand(0);

    Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);

    Info.ReadMem = true;

    Info.WriteMem = true;

    Info.IsVolatile = !Volatile->isZero();

    return true;

  }

  default:

    return false;

  }

}


InstructionCost GCNTTIImpl::getArithmeticInstrCost(

    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,

    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,

    ArrayRef<const Value *> Args,

    const Instruction *CxtI) {


  // Legalize the type.

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);


  // Because we don't have any legal vector operations, but the legal types, we

  // need to account for split vectors.

  unsigned NElts = LT.second.isVector() ?

    LT.second.getVectorNumElements() : 1;


  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;


  switch (ISD) {

  case ISD::SHL:

  case ISD::SRL:

  case ISD::SRA:

    if (SLT == MVT::i64)

      return get64BitInstrCost(CostKind) * LT.first * NElts;


    if (ST->has16BitInsts() && SLT == MVT::i16)

      NElts = (NElts + 1) / 2;


    // i32

    return getFullRateInstrCost() * LT.first * NElts;

  case ISD::ADD:

  case ISD::SUB:

  case ISD::AND:

  case ISD::OR:

  case ISD::XOR:

    if (SLT == MVT::i64) {

      // and, or and xor are typically split into 2 VALU instructions.

      return 2 * getFullRateInstrCost() * LT.first * NElts;

    }


    if (ST->has16BitInsts() && SLT == MVT::i16)

      NElts = (NElts + 1) / 2;


    return LT.first * NElts * getFullRateInstrCost();

  case ISD::MUL: {

    const int QuarterRateCost = getQuarterRateInstrCost(CostKind);

    if (SLT == MVT::i64) {

      const int FullRateCost = getFullRateInstrCost();

      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;

    }


    if (ST->has16BitInsts() && SLT == MVT::i16)

      NElts = (NElts + 1) / 2;


    // i32

    return QuarterRateCost * NElts * LT.first;

  }

  case ISD::FMUL:

    // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for

    // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole

    // fused operation.

    if (CxtI && CxtI->hasOneUse())

      if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {

        const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());

        if (OPC == ISD::FADD || OPC == ISD::FSUB) {

          if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)

            return TargetTransformInfo::TCC_Free;

          if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)

            return TargetTransformInfo::TCC_Free;


          // Estimate all types may be fused with contract/unsafe flags

          const TargetOptions &Options = TLI->getTargetMachine().Options;

          if (Options.AllowFPOpFusion == FPOpFusion::Fast ||

              Options.UnsafeFPMath ||

              (FAdd->hasAllowContract() && CxtI->hasAllowContract()))

            return TargetTransformInfo::TCC_Free;

        }

      }

    [[fallthrough]];

  case ISD::FADD:

  case ISD::FSUB:

    if (ST->hasPackedFP32Ops() && SLT == MVT::f32)

      NElts = (NElts + 1) / 2;

    if (SLT == MVT::f64)

      return LT.first * NElts * get64BitInstrCost(CostKind);


    if (ST->has16BitInsts() && SLT == MVT::f16)

      NElts = (NElts + 1) / 2;


    if (SLT == MVT::f32 || SLT == MVT::f16)

      return LT.first * NElts * getFullRateInstrCost();

    break;

  case ISD::FDIV:

  case ISD::FREM:

    // FIXME: frem should be handled separately. The fdiv in it is most of it,

    // but the current lowering is also not entirely correct.

    if (SLT == MVT::f64) {

      int Cost = 7 * get64BitInstrCost(CostKind) +

                 getQuarterRateInstrCost(CostKind) +

                 3 * getHalfRateInstrCost(CostKind);

      // Add cost of workaround.

      if (!ST->hasUsableDivScaleConditionOutput())

        Cost += 3 * getFullRateInstrCost();


      return LT.first * Cost * NElts;

    }


    if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {

      // TODO: This is more complicated, unsafe flags etc.

      if ((SLT == MVT::f32 && !HasFP32Denormals) ||

          (SLT == MVT::f16 && ST->has16BitInsts())) {

        return LT.first * getQuarterRateInstrCost(CostKind) * NElts;

      }

    }


    if (SLT == MVT::f16 && ST->has16BitInsts()) {

      // 2 x v_cvt_f32_f16

      // f32 rcp

      // f32 fmul

      // v_cvt_f16_f32

      // f16 div_fixup

      int Cost =

          4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);

      return LT.first * Cost * NElts;

    }


    if (SLT == MVT::f32 && ((CxtI && CxtI->hasApproxFunc()) ||

                            TLI->getTargetMachine().Options.UnsafeFPMath)) {

      // Fast unsafe fdiv lowering:

      // f32 rcp

      // f32 fmul

      int Cost = getQuarterRateInstrCost(CostKind) + getFullRateInstrCost();

      return LT.first * Cost * NElts;

    }


    if (SLT == MVT::f32 || SLT == MVT::f16) {

      // 4 more v_cvt_* insts without f16 insts support

      int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +

                 1 * getQuarterRateInstrCost(CostKind);


      if (!HasFP32Denormals) {

        // FP mode switches.

        Cost += 2 * getFullRateInstrCost();

      }


      return LT.first * NElts * Cost;

    }

    break;

  case ISD::FNEG:

    // Use the backend' estimation. If fneg is not free each element will cost

    // one additional instruction.

    return TLI->isFNegFree(SLT) ? 0 : NElts;

  default:

    break;

  }


  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,

                                       Args, CxtI);

}


// Return true if there's a potential benefit from using v2f16/v2i16

// instructions for an intrinsic, even if it requires nontrivial legalization.

static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {

  switch (ID) {

  case Intrinsic::fma: // TODO: fmuladd

  // There's a small benefit to using vector ops in the legalized code.

  case Intrinsic::round:

  case Intrinsic::uadd_sat:

  case Intrinsic::usub_sat:

  case Intrinsic::sadd_sat:

  case Intrinsic::ssub_sat:

    return true;

  default:

    return false;

  }

}


InstructionCost

GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,

                                  TTI::TargetCostKind CostKind) {

  if (ICA.getID() == Intrinsic::fabs)

    return 0;


  if (!intrinsicHasPackedVectorBenefit(ICA.getID()))

    return BaseT::getIntrinsicInstrCost(ICA, CostKind);


  Type *RetTy = ICA.getReturnType();


  // Legalize the type.

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);


  unsigned NElts = LT.second.isVector() ?

    LT.second.getVectorNumElements() : 1;


  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;


  if (SLT == MVT::f64)

    return LT.first * NElts * get64BitInstrCost(CostKind);


  if ((ST->has16BitInsts() && SLT == MVT::f16) ||

      (ST->hasPackedFP32Ops() && SLT == MVT::f32))

    NElts = (NElts + 1) / 2;


  // TODO: Get more refined intrinsic costs?

  unsigned InstRate = getQuarterRateInstrCost(CostKind);


  switch (ICA.getID()) {

  case Intrinsic::fma:

    InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)

                                   : getQuarterRateInstrCost(CostKind);

    break;

  case Intrinsic::uadd_sat:

  case Intrinsic::usub_sat:

  case Intrinsic::sadd_sat:

  case Intrinsic::ssub_sat:

    static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};

    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))

      NElts = 1;

    break;

  }


  return LT.first * NElts * InstRate;

}


InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,

                                           TTI::TargetCostKind CostKind,

                                           const Instruction *I) {

  assert((I == nullptr || I->getOpcode() == Opcode) &&

         "Opcode should reflect passed instruction.");

  const bool SCost =

      (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);

  const int CBrCost = SCost ? 5 : 7;

  switch (Opcode) {

  case Instruction::Br: {

    // Branch instruction takes about 4 slots on gfx900.

    auto BI = dyn_cast_or_null<BranchInst>(I);

    if (BI && BI->isUnconditional())

      return SCost ? 1 : 4;

    // Suppose conditional branch takes additional 3 exec manipulations

    // instructions in average.

    return CBrCost;

  }

  case Instruction::Switch: {

    auto SI = dyn_cast_or_null<SwitchInst>(I);

    // Each case (including default) takes 1 cmp + 1 cbr instructions in

    // average.

    return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);

  }

  case Instruction::Ret:

    return SCost ? 1 : 10;

  }

  return BaseT::getCFInstrCost(Opcode, CostKind, I);

}


InstructionCost

GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,

                                       std::optional<FastMathFlags> FMF,

                                       TTI::TargetCostKind CostKind) {

  if (TTI::requiresOrderedReduction(FMF))

    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);


  EVT OrigTy = TLI->getValueType(DL, Ty);


  // Computes cost on targets that have packed math instructions(which support

  // 16-bit types only).

  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)

    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);


  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  return LT.first * getFullRateInstrCost();

}


InstructionCost

GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,

                                   FastMathFlags FMF,

                                   TTI::TargetCostKind CostKind) {

  EVT OrigTy = TLI->getValueType(DL, Ty);


  // Computes cost on targets that have packed math instructions(which support

  // 16-bit types only).

  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)

    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);


  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);

  return LT.first * getHalfRateInstrCost(CostKind);

}


InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,

                                               TTI::TargetCostKind CostKind,

                                               unsigned Index, Value *Op0,

                                               Value *Op1) {

  switch (Opcode) {

  case Instruction::ExtractElement:

  case Instruction::InsertElement: {

    unsigned EltSize

      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());

    if (EltSize < 32) {

      if (EltSize == 16 && Index == 0 && ST->has16BitInsts())

        return 0;

      return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0,

                                       Op1);

    }


    // Extracts are just reads of a subregister, so are free. Inserts are

    // considered free because we don't want to have any cost for scalarizing

    // operations, and we don't have to copy into a different register class.


    // Dynamic indexing isn't free and is best avoided.

    return Index == ~0u ? 2 : 0;

  }

  default:

    return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);

  }

}


/// Analyze if the results of inline asm are divergent. If \p Indices is empty,

/// this is analyzing the collective result of all output registers. Otherwise,

/// this is only querying a specific result index if this returns multiple

/// registers in a struct.

bool GCNTTIImpl::isInlineAsmSourceOfDivergence(

  const CallInst *CI, ArrayRef<unsigned> Indices) const {

  // TODO: Handle complex extract indices

  if (Indices.size() > 1)

    return true;


  const DataLayout &DL = CI->getDataLayout();

  const SIRegisterInfo *TRI = ST->getRegisterInfo();

  TargetLowering::AsmOperandInfoVector TargetConstraints =

      TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);


  const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];


  int OutputIdx = 0;

  for (auto &TC : TargetConstraints) {

    if (TC.Type != InlineAsm::isOutput)

      continue;


    // Skip outputs we don't care about.

    if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)

      continue;


    TLI->ComputeConstraintToUse(TC, SDValue());


    const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(

        TRI, TC.ConstraintCode, TC.ConstraintVT).second;


    // For AGPR constraints null is returned on subtargets without AGPRs, so

    // assume divergent for null.

    if (!RC || !TRI->isSGPRClass(RC))

      return true;

  }


  return false;

}


bool GCNTTIImpl::isReadRegisterSourceOfDivergence(

    const IntrinsicInst *ReadReg) const {

  Metadata *MD =

      cast<MetadataAsValue>(ReadReg->getArgOperand(0))->getMetadata();

  StringRef RegName =

      cast<MDString>(cast<MDNode>(MD)->getOperand(0))->getString();


  // Special case registers that look like VCC.

  MVT VT = MVT::getVT(ReadReg->getType());

  if (VT == MVT::i1)

    return true;


  // Special case scalar registers that start with 'v'.

  if (RegName.starts_with("vcc") || RegName.empty())

    return false;


  // VGPR or AGPR is divergent. There aren't any specially named vector

  // registers.

  return RegName[0] == 'v' || RegName[0] == 'a';

}


/// \returns true if the result of the value could potentially be

/// different across workitems in a wavefront.

bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {

  if (const Argument *A = dyn_cast<Argument>(V))

    return !AMDGPU::isArgPassedInSGPR(A);


  // Loads from the private and flat address spaces are divergent, because

  // threads can execute the load instruction with the same inputs and get

  // different results.

  //

  // All other loads are not divergent, because if threads issue loads with the

  // same arguments, they will always get the same result.

  if (const LoadInst *Load = dyn_cast<LoadInst>(V))

    return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||

           Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;


  // Atomics are divergent because they are executed sequentially: when an

  // atomic operation refers to the same address in each thread, then each

  // thread after the first sees the value written by the previous thread as

  // original value.

  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))

    return true;


  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {

    if (Intrinsic->getIntrinsicID() == Intrinsic::read_register)

      return isReadRegisterSourceOfDivergence(Intrinsic);


    return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());

  }


  // Assume all function calls are a source of divergence.

  if (const CallInst *CI = dyn_cast<CallInst>(V)) {

    if (CI->isInlineAsm())

      return isInlineAsmSourceOfDivergence(CI);

    return true;

  }


  // Assume all function calls are a source of divergence.

  if (isa<InvokeInst>(V))

    return true;


  return false;

}


bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))

    return AMDGPU::isIntrinsicAlwaysUniform(Intrinsic->getIntrinsicID());


  if (const CallInst *CI = dyn_cast<CallInst>(V)) {

    if (CI->isInlineAsm())

      return !isInlineAsmSourceOfDivergence(CI);

    return false;

  }


  // In most cases TID / wavefrontsize is uniform.

  //

  // However, if a kernel has uneven dimesions we can have a value of

  // workitem-id-x divided by the wavefrontsize non-uniform. For example

  // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)

  // packed into a same wave which gives 1 and 0 after the division by 64

  // respectively.

  //

  // FIXME: limit it to 1D kernels only, although that shall be possible

  // to perform this optimization is the size of the X dimension is a power

  // of 2, we just do not currently have infrastructure to query it.

  using namespace llvm::PatternMatch;

  uint64_t C;

  if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),

                      m_ConstantInt(C))) ||

      match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),

                      m_ConstantInt(C)))) {

    const Function *F = cast<Instruction>(V)->getFunction();

    return C >= ST->getWavefrontSizeLog2() &&

           ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;

  }


  Value *Mask;

  if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),

                       m_Value(Mask)))) {

    const Function *F = cast<Instruction>(V)->getFunction();

    const DataLayout &DL = F->getDataLayout();

    return computeKnownBits(Mask, DL).countMinTrailingZeros() >=

               ST->getWavefrontSizeLog2() &&

           ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;

  }


  const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);

  if (!ExtValue)

    return false;


  const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));

  if (!CI)

    return false;


  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {

    switch (Intrinsic->getIntrinsicID()) {

    default:

      return false;

    case Intrinsic::amdgcn_if:

    case Intrinsic::amdgcn_else: {

      ArrayRef<unsigned> Indices = ExtValue->getIndices();

      return Indices.size() == 1 && Indices[0] == 1;

    }

    }

  }


  // If we have inline asm returning mixed SGPR and VGPR results, we inferred

  // divergent for the overall struct return. We need to override it in the

  // case we're extracting an SGPR component here.

  if (CI->isInlineAsm())

    return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());


  return false;

}


bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,

                                            Intrinsic::ID IID) const {

  switch (IID) {

  case Intrinsic::amdgcn_is_shared:

  case Intrinsic::amdgcn_is_private:

  case Intrinsic::amdgcn_flat_atomic_fadd:

  case Intrinsic::amdgcn_flat_atomic_fmax:

  case Intrinsic::amdgcn_flat_atomic_fmin:

  case Intrinsic::amdgcn_flat_atomic_fmax_num:

  case Intrinsic::amdgcn_flat_atomic_fmin_num:

    OpIndexes.push_back(0);

    return true;

  default:

    return false;

  }

}


Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,

                                                    Value *OldV,

                                                    Value *NewV) const {

  auto IntrID = II->getIntrinsicID();

  switch (IntrID) {

  case Intrinsic::amdgcn_is_shared:

  case Intrinsic::amdgcn_is_private: {

    unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?

      AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;

    unsigned NewAS = NewV->getType()->getPointerAddressSpace();

    LLVMContext &Ctx = NewV->getType()->getContext();

    ConstantInt *NewVal = (TrueAS == NewAS) ?

      ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);

    return NewVal;

  }

  case Intrinsic::ptrmask: {

    unsigned OldAS = OldV->getType()->getPointerAddressSpace();

    unsigned NewAS = NewV->getType()->getPointerAddressSpace();

    Value *MaskOp = II->getArgOperand(1);

    Type *MaskTy = MaskOp->getType();


    bool DoTruncate = false;


    const GCNTargetMachine &TM =

        static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());

    if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {

      // All valid 64-bit to 32-bit casts work by chopping off the high

      // bits. Any masking only clearing the low bits will also apply in the new

      // address space.

      if (DL.getPointerSizeInBits(OldAS) != 64 ||

          DL.getPointerSizeInBits(NewAS) != 32)

        return nullptr;


      // TODO: Do we need to thread more context in here?

      KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);

      if (Known.countMinLeadingOnes() < 32)

        return nullptr;


      DoTruncate = true;

    }


    IRBuilder<> B(II);

    if (DoTruncate) {

      MaskTy = B.getInt32Ty();

      MaskOp = B.CreateTrunc(MaskOp, MaskTy);

    }


    return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},

                             {NewV, MaskOp});

  }

  case Intrinsic::amdgcn_flat_atomic_fadd:

  case Intrinsic::amdgcn_flat_atomic_fmax:

  case Intrinsic::amdgcn_flat_atomic_fmin:

  case Intrinsic::amdgcn_flat_atomic_fmax_num:

  case Intrinsic::amdgcn_flat_atomic_fmin_num: {

    Type *DestTy = II->getType();

    Type *SrcTy = NewV->getType();

    unsigned NewAS = SrcTy->getPointerAddressSpace();

    if (!AMDGPU::isExtendedGlobalAddrSpace(NewAS))

      return nullptr;

    Module *M = II->getModule();

    Function *NewDecl = Intrinsic::getDeclaration(M, II->getIntrinsicID(),

                                                  {DestTy, SrcTy, DestTy});

    II->setArgOperand(0, NewV);

    II->setCalledFunction(NewDecl);

    return II;

  }

  default:

    return nullptr;

  }

}


InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,

                                           VectorType *VT, ArrayRef<int> Mask,

                                           TTI::TargetCostKind CostKind,

                                           int Index, VectorType *SubTp,

                                           ArrayRef<const Value *> Args,

                                           const Instruction *CxtI) {

  if (!isa<FixedVectorType>(VT))

    return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);


  Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp);


  // Larger vector widths may require additional instructions, but are

  // typically cheaper than scalarized versions.

  unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();

  if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&

      DL.getTypeSizeInBits(VT->getElementType()) == 16) {

    bool HasVOP3P = ST->hasVOP3PInsts();

    unsigned RequestedElts =

        count_if(Mask, [](int MaskElt) { return MaskElt != -1; });

    if (RequestedElts == 0)

      return 0;

    switch (Kind) {

    case TTI::SK_Broadcast:

    case TTI::SK_Reverse:

    case TTI::SK_PermuteSingleSrc: {

      // With op_sel VOP3P instructions freely can access the low half or high

      // half of a register, so any swizzle of two elements is free.

      if (HasVOP3P && NumVectorElts == 2)

        return 0;

      unsigned NumPerms = alignTo(RequestedElts, 2) / 2;

      // SK_Broadcast just reuses the same mask

      unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;

      return NumPerms + NumPermMasks;

    }

    case TTI::SK_ExtractSubvector:

    case TTI::SK_InsertSubvector: {

      // Even aligned accesses are free

      if (!(Index % 2))

        return 0;

      // Insert/extract subvectors only require shifts / extract code to get the

      // relevant bits

      return alignTo(RequestedElts, 2) / 2;

    }

    case TTI::SK_PermuteTwoSrc:

    case TTI::SK_Splice:

    case TTI::SK_Select: {

      unsigned NumPerms = alignTo(RequestedElts, 2) / 2;

      // SK_Select just reuses the same mask

      unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;

      return NumPerms + NumPermMasks;

    }


    default:

      break;

    }

  }


  return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);

}


bool GCNTTIImpl::areInlineCompatible(const Function *Caller,

                                     const Function *Callee) const {

  const TargetMachine &TM = getTLI()->getTargetMachine();

  const GCNSubtarget *CallerST

    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));

  const GCNSubtarget *CalleeST

    = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));


  const FeatureBitset &CallerBits = CallerST->getFeatureBits();

  const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();


  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;

  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;

  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)

    return false;


  // FIXME: dx10_clamp can just take the caller setting, but there seems to be

  // no way to support merge for backend defined attributes.

  SIModeRegisterDefaults CallerMode(*Caller, *CallerST);

  SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);

  if (!CallerMode.isInlineCompatible(CalleeMode))

    return false;


  if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||

      Callee->hasFnAttribute(Attribute::InlineHint))

    return true;


  // Hack to make compile times reasonable.

  if (InlineMaxBB) {

    // Single BB does not increase total BB amount.

    if (Callee->size() == 1)

      return true;

    size_t BBSize = Caller->size() + Callee->size() - 1;

    return BBSize <= InlineMaxBB;

  }


  return true;

}


static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB,

                                                   const SITargetLowering *TLI,

                                                   const GCNTTIImpl *TTIImpl) {

  const int NrOfSGPRUntilSpill = 26;

  const int NrOfVGPRUntilSpill = 32;


  const DataLayout &DL = TTIImpl->getDataLayout();


  unsigned adjustThreshold = 0;

  int SGPRsInUse = 0;

  int VGPRsInUse = 0;

  for (const Use &A : CB->args()) {

    SmallVector<EVT, 4> ValueVTs;

    ComputeValueVTs(*TLI, DL, A.get()->getType(), ValueVTs);

    for (auto ArgVT : ValueVTs) {

      unsigned CCRegNum = TLI->getNumRegistersForCallingConv(

          CB->getContext(), CB->getCallingConv(), ArgVT);

      if (AMDGPU::isArgPassedInSGPR(CB, CB->getArgOperandNo(&A)))

        SGPRsInUse += CCRegNum;

      else

        VGPRsInUse += CCRegNum;

    }

  }


  // The cost of passing function arguments through the stack:

  //  1 instruction to put a function argument on the stack in the caller.

  //  1 instruction to take a function argument from the stack in callee.

  //  1 instruction is explicitly take care of data dependencies in callee

  //  function.

  InstructionCost ArgStackCost(1);

  ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(

      Instruction::Store, Type::getInt32Ty(CB->getContext()), Align(4),

      AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);

  ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(

      Instruction::Load, Type::getInt32Ty(CB->getContext()), Align(4),

      AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);


  // The penalty cost is computed relative to the cost of instructions and does

  // not model any storage costs.

  adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *

                     *ArgStackCost.getValue() * InlineConstants::getInstrCost();

  adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *

                     *ArgStackCost.getValue() * InlineConstants::getInstrCost();

  return adjustThreshold;

}


static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,

                                           const DataLayout &DL) {

  // If we have a pointer to a private array passed into a function

  // it will not be optimized out, leaving scratch usage.

  // This function calculates the total size in bytes of the memory that would

  // end in scratch if the call was not inlined.

  unsigned AllocaSize = 0;

  SmallPtrSet<const AllocaInst *, 8> AIVisited;

  for (Value *PtrArg : CB->args()) {

    PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());

    if (!Ty)

      continue;


    unsigned AddrSpace = Ty->getAddressSpace();

    if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&

        AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)

      continue;


    const AllocaInst *AI = dyn_cast<AllocaInst>(getUnderlyingObject(PtrArg));

    if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(AI).second)

      continue;


    AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());

  }

  return AllocaSize;

}


unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {

  unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, this);


  // Private object passed as arguments may end up in scratch usage if the call

  // is not inlined. Increase the inline threshold to promote inlining.

  unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);

  if (AllocaSize > 0)

    Threshold += ArgAllocaCost;

  return Threshold;

}


unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,

                                         const AllocaInst *AI) const {


  // Below the cutoff, assume that the private memory objects would be

  // optimized

  auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);

  if (AllocaSize <= ArgAllocaCutoff)

    return 0;


  // Above the cutoff, we give a cost to each private memory object

  // depending its size. If the array can be optimized by SROA this cost is not

  // added to the total-cost in the inliner cost analysis.

  //

  // We choose the total cost of the alloca such that their sum cancels the

  // bonus given in the threshold (ArgAllocaCost).

  //

  //   Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost

  //

  // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,

  // the single-bb bonus and the vector-bonus.

  //

  // We compensate the first two multipliers, by repeating logic from the

  // inliner-cost in here. The vector-bonus is 0 on AMDGPU.

  static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");

  unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();


  bool SingleBB = none_of(*CB->getCalledFunction(), [](const BasicBlock &BB) {

    return BB.getTerminator()->getNumSuccessors() > 1;

  });

  if (SingleBB) {

    Threshold += Threshold / 2;

  }


  auto ArgAllocaSize = DL.getTypeAllocSize(AI->getAllocatedType());


  // Attribute the bonus proportionally to the alloca size

  unsigned AllocaThresholdBonus = (Threshold * ArgAllocaSize) / AllocaSize;


  return AllocaThresholdBonus;

}


void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,

                                         TTI::UnrollingPreferences &UP,

                                         OptimizationRemarkEmitter *ORE) {

  CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);

}


void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,

                                       TTI::PeelingPreferences &PP) {

  CommonTTI.getPeelingPreferences(L, SE, PP);

}


int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {

  return ST->hasFullRate64Ops()

             ? getFullRateInstrCost()

             : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)

                                      : getQuarterRateInstrCost(CostKind);

}


std::pair<InstructionCost, MVT>

GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {

  std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);

  auto Size = DL.getTypeSizeInBits(Ty);

  // Maximum load or store can handle 8 dwords for scalar and 4 for

  // vector ALU. Let's assume anything above 8 dwords is expensive

  // even if legal.

  if (Size <= 256)

    return Cost;


  Cost.first += (Size + 255) / 256;

  return Cost;

}


unsigned GCNTTIImpl::getPrefetchDistance() const {

  return ST->hasPrefetch() ? 128 : 0;

}


bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {

  return AMDGPU::isFlatGlobalAddrSpace(AS);

}

const
aarch64 promote const
Definition: AArch64PromoteConstant.cpp:232

AMDGPUMCTargetDesc.h
Provides AMDGPU specific target descriptions.

PHI
Rewrite undef for PHI
Definition: AMDGPURewriteUndefForPHI.cpp:100

AMDGPUTargetMachine.h
The AMDGPU TargetMachine interface definition for hw codegen targets.

UnrollThresholdIf
static cl::opt< unsigned > UnrollThresholdIf("amdgpu-unroll-threshold-if", cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), cl::init(200), cl::Hidden)

ArgAllocaCost
static cl::opt< unsigned > ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000), cl::desc("Cost of alloca argument"))

dependsOnLocalPhi
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, unsigned Depth=0)
Definition: AMDGPUTargetTransformInfo.cpp:78

UnrollRuntimeLocal
static cl::opt< bool > UnrollRuntimeLocal("amdgpu-unroll-runtime-local", cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"), cl::init(true), cl::Hidden)

adjustInliningThresholdUsingCallee
static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB, const SITargetLowering *TLI, const GCNTTIImpl *TTIImpl)
Definition: AMDGPUTargetTransformInfo.cpp:1204

ArgAllocaCutoff
static cl::opt< unsigned > ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), cl::desc("Maximum alloca size to use for inline cost"))

InlineMaxBB
static cl::opt< size_t > InlineMaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100), cl::desc("Maximum number of BBs allowed in a function after inlining" " (compile time constraint)"))

intrinsicHasPackedVectorBenefit
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID)
Definition: AMDGPUTargetTransformInfo.cpp:687

UnrollMaxBlockToAnalyze
static cl::opt< unsigned > UnrollMaxBlockToAnalyze("amdgpu-unroll-max-block-to-analyze", cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"), cl::init(32), cl::Hidden)

getCallArgsTotalAllocaSize
static unsigned getCallArgsTotalAllocaSize(const CallBase *CB, const DataLayout &DL)
Definition: AMDGPUTargetTransformInfo.cpp:1250

UnrollThresholdPrivate
static cl::opt< unsigned > UnrollThresholdPrivate("amdgpu-unroll-threshold-private", cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), cl::init(2700), cl::Hidden)

UnrollThresholdLocal
static cl::opt< unsigned > UnrollThresholdLocal("amdgpu-unroll-threshold-local", cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"), cl::init(1000), cl::Hidden)

AMDGPUTargetTransformInfo.h
This file a TargetTransformInfo::Concept conforming object specific to the AMDGPU target machine.

DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: ARMSLSHardening.cpp:73

B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")

A
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")

Info
Analysis containing CSE Info
Definition: CSEInfo.cpp:27

CostKind
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))

RetTy
return RetTy
Definition: DeadArgumentElimination.cpp:360

LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101

Size
uint64_t Size
Definition: ELFObjHandler.cpp:81

GEP
Hexagon Common GEP
Definition: HexagonCommonGEP.cpp:171

IRBuilder.h

InlineCost.h

KnownBits.h

RegName
#define RegName(no)

Options
static LVOptions Options
Definition: LVOptions.cpp:25

LoopInfo.h

F
#define F(x, y, z)
Definition: MD5.cpp:55

I
#define I(x, y, z)
Definition: MD5.cpp:58

TRI
unsigned const TargetRegisterInfo * TRI
Definition: MachineSink.cpp:1928

II
uint64_t IntrinsicInst * II
Definition: NVVMIntrRange.cpp:52

TM
const char LLVMTargetMachineRef TM
Definition: PassBuilderBindings.cpp:48

PatternMatch.h

Cond
const SmallVectorImpl< MachineOperand > & Cond
Definition: RISCVRedundantCopyElimination.cpp:75

assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())

SIModeRegisterDefaults.h

Ptr
@ Ptr
Definition: TargetLibraryInfo.cpp:77

ValueTracking.h

PointerType
Definition: ItaniumDemangle.h:612

llvm::AMDGPUSubtarget::hasMadMacF32Insts
bool hasMadMacF32Insts() const
Definition: AMDGPUSubtarget.h:173

llvm::AMDGPUSubtarget::VOLCANIC_ISLANDS
@ VOLCANIC_ISLANDS
Definition: AMDGPUSubtarget.h:39

llvm::AMDGPUSubtarget::getMaxWorkitemID
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
Definition: AMDGPUSubtarget.cpp:500

llvm::AMDGPUSubtarget::getWavefrontSizeLog2
unsigned getWavefrontSizeLog2() const
Definition: AMDGPUSubtarget.h:225

llvm::AMDGPUSubtarget::has16BitInsts
bool has16BitInsts() const
Definition: AMDGPUSubtarget.h:154

llvm::AMDGPUSubtarget::hasFastFMAF32
bool hasFastFMAF32() const
Definition: AMDGPUSubtarget.h:213

llvm::AMDGPUSubtarget::isSingleLaneExecution
bool isSingleLaneExecution(const Function &Kernel) const
Return true if only a single workitem can be active in a wave.
Definition: AMDGPUSubtarget.cpp:508

llvm::AMDGPUSubtarget::hasVOP3PInsts
bool hasVOP3PInsts() const
Definition: AMDGPUSubtarget.h:185

llvm::AMDGPUTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: AMDGPUTargetTransformInfo.cpp:265

llvm::AMDGPUTTIImpl::AMDGPUTTIImpl
AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
Definition: AMDGPUTargetTransformInfo.cpp:97

llvm::AMDGPUTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: AMDGPUTargetTransformInfo.cpp:103

llvm::AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold
int64_t getMaxMemIntrinsicInlineSizeThreshold() const
Definition: AMDGPUTargetTransformInfo.cpp:270

llvm::AMDGPUTargetLowering::isFNegFree
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
Definition: AMDGPUISelLowering.cpp:956

llvm::AMDGPUTargetMachine
Definition: AMDGPUTargetMachine.h:29

llvm::AllocaInst
an instruction to allocate memory on the stack
Definition: Instructions.h:61

llvm::AllocaInst::isStaticAlloca
bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
Definition: Instructions.cpp:1236

llvm::AllocaInst::getAllocatedType
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition: Instructions.h:115

llvm::Argument
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31

llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41

llvm::ArrayRef::size
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165

llvm::ArrayRef::empty
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160

llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:61

llvm::BasicTTIImplBase< AMDGPUTTIImpl >

llvm::BasicTTIImplBase< GCNTTIImpl >::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
Definition: BasicTTIImpl.h:1549

llvm::BasicTTIImplBase< GCNTTIImpl >::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
Definition: BasicTTIImpl.h:1279

llvm::BasicTTIImplBase< GCNTTIImpl >::improveShuffleKindFromMask
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
Definition: BasicTTIImpl.h:975

llvm::BasicTTIImplBase< GCNTTIImpl >::getMinMaxReductionCost
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
Definition: BasicTTIImpl.h:2565

llvm::BasicTTIImplBase< GCNTTIImpl >::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:1023

llvm::BasicTTIImplBase< AMDGPUTTIImpl >::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:660

llvm::BasicTTIImplBase< GCNTTIImpl >::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:1222

llvm::BasicTTIImplBase< GCNTTIImpl >::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:897

llvm::BasicTTIImplBase< GCNTTIImpl >::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:2554

llvm::BasicTTIImplBase< GCNTTIImpl >::getTypeLegalizationCost
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:861

llvm::BasicTTIImplBase< AMDGPUTTIImpl >::DL
const DataLayout & DL
Definition: TargetTransformInfoImpl.h:39

llvm::BranchInst
Conditional or Unconditional Branch instruction.
Definition: Instructions.h:2918

llvm::CallBase
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1236

llvm::CallBase::isInlineAsm
bool isInlineAsm() const
Check if this call is an inline asm statement.
Definition: InstrTypes.h:1532

llvm::CallBase::getCalledFunction
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1465

llvm::CallBase::getCallingConv
CallingConv::ID getCallingConv() const
Definition: InstrTypes.h:1523

llvm::CallBase::getArgOperand
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1410

llvm::CallBase::args
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1401

llvm::CallBase::getArgOperandNo
unsigned getArgOperandNo(const Use *U) const
Given a use for a arg operand, get the arg operand number that corresponds to it.
Definition: InstrTypes.h:1441

llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1398

llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:81

llvm::ConstantInt::getTrue
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:850

llvm::ConstantInt::getFalse
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:857

llvm::ConstantInt::getSExtValue
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:161

llvm::DWARFExpression::Operation
This class represents an Operation in the Expression.
Definition: DWARFExpression.h:32

llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110

llvm::DataLayout::getPointerSizeInBits
unsigned getPointerSizeInBits(unsigned AS=0) const
Layout pointer size, in bits FIXME: The defaults need to be removed once all of the backends/clients ...
Definition: DataLayout.h:410

llvm::DataLayout::getTypeAllocSize
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504

llvm::DataLayout::getTypeSizeInBits
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:672

llvm::ElementCount
Definition: TypeSize.h:300

llvm::ElementCount::isScalar
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:322

llvm::ExtractValueInst
This instruction extracts a struct member or array element value from an aggregate value.
Definition: Instructions.h:2302

llvm::ExtractValueInst::getIndices
ArrayRef< unsigned > getIndices() const
Definition: Instructions.h:2354

llvm::FastMathFlags
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20

llvm::FeatureBitset
Container class for subtarget features.
Definition: SubtargetFeature.h:41

llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692

llvm::Function
Definition: Function.h:64

llvm::GCNSubtarget
Definition: GCNSubtarget.h:35

llvm::GCNSubtarget::hasPrefetch
bool hasPrefetch() const
Definition: GCNSubtarget.h:940

llvm::GCNSubtarget::hasUsableDivScaleConditionOutput
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:477

llvm::GCNSubtarget::getRegisterInfo
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:278

llvm::GCNSubtarget::hasUnalignedScratchAccess
bool hasUnalignedScratchAccess() const
Definition: GCNSubtarget.h:589

llvm::GCNSubtarget::hasPackedFP32Ops
bool hasPackedFP32Ops() const
Definition: GCNSubtarget.h:1044

llvm::GCNSubtarget::hasFullRate64Ops
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:377

llvm::GCNSubtarget::getMaxPrivateElementSize
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:344

llvm::GCNSubtarget::getGeneration
Generation getGeneration() const
Definition: GCNSubtarget.h:317

llvm::GCNTTIImpl
Definition: AMDGPUTargetTransformInfo.h:62

llvm::GCNTTIImpl::getRegisterBitWidth
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const
Definition: AMDGPUTargetTransformInfo.cpp:321

llvm::GCNTTIImpl::getIntrinsicInstrCost
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Definition: AMDGPUTargetTransformInfo.cpp:703

llvm::GCNTTIImpl::getVectorInstrCost
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
Definition: AMDGPUTargetTransformInfo.cpp:812

llvm::GCNTTIImpl::isAlwaysUniform
bool isAlwaysUniform(const Value *V) const
Definition: AMDGPUTargetTransformInfo.cpp:945

llvm::GCNTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: AMDGPUTargetTransformInfo.cpp:1335

llvm::GCNTTIImpl::GCNTTIImpl
GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
Definition: AMDGPUTargetTransformInfo.cpp:294

llvm::GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold
int64_t getMaxMemIntrinsicInlineSizeThreshold() const
Definition: AMDGPUTargetTransformInfo.cpp:408

llvm::GCNTTIImpl::isLegalToVectorizeLoadChain
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:396

llvm::GCNTTIImpl::getMinMaxReductionCost
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Definition: AMDGPUTargetTransformInfo.cpp:798

llvm::GCNTTIImpl::isInlineAsmSourceOfDivergence
bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef< unsigned > Indices={}) const
Analyze if the results of inline asm are divergent.
Definition: AMDGPUTargetTransformInfo.cpp:844

llvm::GCNTTIImpl::isReadRegisterSourceOfDivergence
bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const
Definition: AMDGPUTargetTransformInfo.cpp:880

llvm::GCNTTIImpl::getShuffleCost
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
Definition: AMDGPUTargetTransformInfo.cpp:1105

llvm::GCNTTIImpl::getNumberOfRegisters
unsigned getNumberOfRegisters(unsigned RCID) const
Definition: AMDGPUTargetTransformInfo.cpp:309

llvm::GCNTTIImpl::isLegalToVectorizeMemChain
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:383

llvm::GCNTTIImpl::shouldPrefetchAddressSpace
bool shouldPrefetchAddressSpace(unsigned AS) const override
Definition: AMDGPUTargetTransformInfo.cpp:1365

llvm::GCNTTIImpl::getStoreVectorFactor
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
Definition: AMDGPUTargetTransformInfo.cpp:356

llvm::GCNTTIImpl::isLegalToVectorizeStoreChain
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:402

llvm::GCNTTIImpl::getMaxInterleaveFactor
unsigned getMaxInterleaveFactor(ElementCount VF)
Definition: AMDGPUTargetTransformInfo.cpp:491

llvm::GCNTTIImpl::getInliningThresholdMultiplier
unsigned getInliningThresholdMultiplier() const
Definition: AMDGPUTargetTransformInfo.h:243

llvm::GCNTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: AMDGPUTargetTransformInfo.cpp:1329

llvm::GCNTTIImpl::getMinVectorRegisterBitWidth
unsigned getMinVectorRegisterBitWidth() const
Definition: AMDGPUTargetTransformInfo.cpp:333

llvm::GCNTTIImpl::getTgtMemIntrinsic
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const
Definition: AMDGPUTargetTransformInfo.cpp:500

llvm::GCNTTIImpl::getLoadVectorFactor
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
Definition: AMDGPUTargetTransformInfo.cpp:345

llvm::GCNTTIImpl::getPrefetchDistance
unsigned getPrefetchDistance() const override
How much before a load we should place the prefetch instruction.
Definition: AMDGPUTargetTransformInfo.cpp:1361

llvm::GCNTTIImpl::rewriteIntrinsicWithAddressSpace
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const
Definition: AMDGPUTargetTransformInfo.cpp:1033

llvm::GCNTTIImpl::getMaximumVF
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
Definition: AMDGPUTargetTransformInfo.cpp:337

llvm::GCNTTIImpl::getMemcpyLoopResidualLoweringType
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign, std::optional< uint32_t > AtomicCpySize) const
Definition: AMDGPUTargetTransformInfo.cpp:450

llvm::GCNTTIImpl::collectFlatAddressOperands
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const
Definition: AMDGPUTargetTransformInfo.cpp:1016

llvm::GCNTTIImpl::adjustInliningThreshold
unsigned adjustInliningThreshold(const CallBase *CB) const
Definition: AMDGPUTargetTransformInfo.cpp:1277

llvm::GCNTTIImpl::areInlineCompatible
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
Definition: AMDGPUTargetTransformInfo.cpp:1165

llvm::GCNTTIImpl::getCallerAllocaCost
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const
Definition: AMDGPUTargetTransformInfo.cpp:1288

llvm::GCNTTIImpl::getCFInstrCost
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: AMDGPUTargetTransformInfo.cpp:749

llvm::GCNTTIImpl::getLoadStoreVecRegBitWidth
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:366

llvm::GCNTTIImpl::isSourceOfDivergence
bool isSourceOfDivergence(const Value *V) const
Definition: AMDGPUTargetTransformInfo.cpp:903

llvm::GCNTTIImpl::getArithmeticInstrCost
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
Definition: AMDGPUTargetTransformInfo.cpp:526

llvm::GCNTTIImpl::hasBranchDivergence
bool hasBranchDivergence(const Function *F=nullptr) const
Definition: AMDGPUTargetTransformInfo.cpp:305

llvm::GCNTTIImpl::getMemcpyLoopLoweringType
Type * getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign, std::optional< uint32_t > AtomicElementSize) const
Definition: AMDGPUTargetTransformInfo.cpp:419

llvm::GCNTTIImpl::getArithmeticReductionCost
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
Definition: AMDGPUTargetTransformInfo.cpp:780

llvm::GCNTargetMachine
Definition: AMDGPUTargetMachine.h:85

llvm::GetElementPtrInst
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:915

llvm::IRBuilder
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2671

llvm::InlineAsm::isOutput
@ isOutput
Definition: InlineAsm.h:97

llvm::InstructionCost
Definition: InstructionCost.h:29

llvm::InstructionCost::getValue
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
Definition: InstructionCost.h:87

llvm::Instruction
Definition: Instruction.h:68

llvm::Instruction::hasApproxFunc
bool hasApproxFunc() const LLVM_READONLY
Determine whether the approximate-math-functions flag is set.
Definition: Instruction.cpp:605

llvm::Instruction::hasAllowContract
bool hasAllowContract() const LLVM_READONLY
Determine whether the allow-contract flag is set.
Definition: Instruction.cpp:600

llvm::Instruction::getDataLayout
const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Definition: Instruction.cpp:74

llvm::IntrinsicCostAttributes
Definition: TargetTransformInfo.h:121

llvm::IntrinsicCostAttributes::getReturnType
Type * getReturnType() const
Definition: TargetTransformInfo.h:154

llvm::IntrinsicCostAttributes::getID
Intrinsic::ID getID() const
Definition: TargetTransformInfo.h:152

llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48

llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:55

llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67

llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:174

llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44

llvm::MDNode
Metadata node.
Definition: Metadata.h:1067

llvm::MVT
Machine Value Type.
Definition: MachineValueType.h:34

llvm::MVT::SimpleValueType
SimpleValueType
Definition: MachineValueType.h:36

llvm::MVT::getVT
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:230

llvm::Metadata
Root of the metadata hierarchy.
Definition: Metadata.h:62

llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65

llvm::OptimizationRemarkEmitter
The optimization diagnostic interface.
Definition: OptimizationRemarkEmitter.h:34

llvm::PHINode
Definition: Instructions.h:2505

llvm::SDValue
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
Definition: SelectionDAGNodes.h:145

llvm::SIRegisterInfo
Definition: SIRegisterInfo.h:32

llvm::SITargetLowering
Definition: SIISelLowering.h:31

llvm::SITargetLowering::getRegForInlineAsmConstraint
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
Definition: SIISelLowering.cpp:15361

llvm::SITargetLowering::getNumRegistersForCallingConv
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
Definition: SIISelLowering.cpp:1033

llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:452

llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:344

llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:479

llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586

llvm::SmallVectorTemplateBase::push_back
void push_back(const T &Elt)
Definition: SmallVector.h:426

llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209

llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50

llvm::TargetLoweringBase::InstructionOpcodeToISD
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
Definition: TargetLoweringBase.cpp:1756

llvm::TargetLoweringBase::getValueType
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
Definition: TargetLowering.h:1659

llvm::TargetLoweringBase::getTargetMachine
const TargetMachine & getTargetMachine() const
Definition: TargetLowering.h:362

llvm::TargetLowering::AsmOperandInfoVector
std::vector< AsmOperandInfo > AsmOperandInfoVector
Definition: TargetLowering.h:4986

llvm::TargetLowering::ParseConstraints
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
Definition: TargetLowering.cpp:5684

llvm::TargetLowering::ComputeConstraintToUse
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
Definition: TargetLowering.cpp:6037

llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77

llvm::TargetMachine::Options
TargetOptions Options
Definition: TargetMachine.h:118

llvm::TargetOptions
Definition: TargetOptions.h:135

llvm::TargetOptions::UnsafeFPMath
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
Definition: TargetOptions.h:178

llvm::TargetRegisterClass
Definition: TargetRegisterInfo.h:45

llvm::TargetTransformInfoImplBase::getDataLayout
const DataLayout & getDataLayout() const
Definition: TargetTransformInfoImpl.h:48

llvm::TargetTransformInfoImplBase::getMemcpyLoopResidualLoweringType
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign, std::optional< uint32_t > AtomicCpySize) const
Definition: TargetTransformInfoImpl.h:848

llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition: TargetTransformInfo.h:259

llvm::TargetTransformInfo::TCK_CodeSize
@ TCK_CodeSize
Instruction code size.
Definition: TargetTransformInfo.h:262

llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:263

llvm::TargetTransformInfo::requiresOrderedReduction
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
Definition: TargetTransformInfo.h:1483

llvm::TargetTransformInfo::RegisterKind
RegisterKind
Definition: TargetTransformInfo.h:1143

llvm::TargetTransformInfo::RGK_FixedWidthVector
@ RGK_FixedWidthVector
Definition: TargetTransformInfo.h:1143

llvm::TargetTransformInfo::RGK_ScalableVector
@ RGK_ScalableVector
Definition: TargetTransformInfo.h:1143

llvm::TargetTransformInfo::RGK_Scalar
@ RGK_Scalar
Definition: TargetTransformInfo.h:1143

llvm::TargetTransformInfo::TCC_Free
@ TCC_Free
Expected to fold away in lowering.
Definition: TargetTransformInfo.h:285

llvm::TargetTransformInfo::ShuffleKind
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Definition: TargetTransformInfo.h:1061

llvm::TargetTransformInfo::SK_InsertSubvector
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
Definition: TargetTransformInfo.h:1068

llvm::TargetTransformInfo::SK_Select
@ SK_Select
Selects elements from the corresponding lane of either source operand.
Definition: TargetTransformInfo.h:1064

llvm::TargetTransformInfo::SK_PermuteSingleSrc
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
Definition: TargetTransformInfo.h:1072

llvm::TargetTransformInfo::SK_Splice
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
Definition: TargetTransformInfo.h:1074

llvm::TargetTransformInfo::SK_Broadcast
@ SK_Broadcast
Broadcast element 0 to all other elements.
Definition: TargetTransformInfo.h:1062

llvm::TargetTransformInfo::SK_PermuteTwoSrc
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
Definition: TargetTransformInfo.h:1070

llvm::TargetTransformInfo::SK_Reverse
@ SK_Reverse
Reverse the order of the vector.
Definition: TargetTransformInfo.h:1063

llvm::TargetTransformInfo::SK_ExtractSubvector
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
Definition: TargetTransformInfo.h:1069

llvm::TypeSize
Definition: TypeSize.h:334

llvm::TypeSize::getFixed
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345

llvm::TypeSize::getScalable
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:348

llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45

llvm::Type::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.

llvm::Type::getIntNTy
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)

llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.

llvm::Type::isSized
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:302

llvm::Type::getInt16Ty
static IntegerType * getInt16Ty(LLVMContext &C)

llvm::Type::getContext
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129

llvm::Type::getInt8Ty
static IntegerType * getInt8Ty(LLVMContext &C)

llvm::Type::getInt32Ty
static IntegerType * getInt32Ty(LLVMContext &C)

llvm::Type::getInt64Ty
static IntegerType * getInt64Ty(LLVMContext &C)

llvm::Use
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43

llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169

llvm::Value
LLVM Value Representation.
Definition: Value.h:74

llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255

llvm::Value::user_begin
user_iterator user_begin()
Definition: Value.h:397

llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434

llvm::Value::getContext
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075

llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403

llvm::VectorType::getElementType
Type * getElementType() const
Definition: DerivedTypes.h:436

llvm::cl::opt
Definition: CommandLine.h:1423

uint64_t

unsigned

Analysis.h

llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:143

llvm::AMDGPUAS::CONSTANT_ADDRESS_32BIT
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
Definition: AMDGPUAddrSpace.h:38

llvm::AMDGPUAS::BUFFER_STRIDED_POINTER
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
Definition: AMDGPUAddrSpace.h:45

llvm::AMDGPUAS::REGION_ADDRESS
@ REGION_ADDRESS
Address space for region memory. (GDS)
Definition: AMDGPUAddrSpace.h:32

llvm::AMDGPUAS::LOCAL_ADDRESS
@ LOCAL_ADDRESS
Address space for local memory.
Definition: AMDGPUAddrSpace.h:35

llvm::AMDGPUAS::CONSTANT_ADDRESS
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
Definition: AMDGPUAddrSpace.h:34

llvm::AMDGPUAS::FLAT_ADDRESS
@ FLAT_ADDRESS
Address space for flat memory.
Definition: AMDGPUAddrSpace.h:30

llvm::AMDGPUAS::GLOBAL_ADDRESS
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
Definition: AMDGPUAddrSpace.h:31

llvm::AMDGPUAS::BUFFER_FAT_POINTER
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
Definition: AMDGPUAddrSpace.h:40

llvm::AMDGPUAS::PRIVATE_ADDRESS
@ PRIVATE_ADDRESS
Address space for private memory.
Definition: AMDGPUAddrSpace.h:36

llvm::AMDGPUAS::BUFFER_RESOURCE
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
Definition: AMDGPUAddrSpace.h:43

llvm::AMDGPU::isFlatGlobalAddrSpace
bool isFlatGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:415

llvm::AMDGPU::isArgPassedInSGPR
bool isArgPassedInSGPR(const Argument *A)
Definition: AMDGPUBaseInfo.cpp:2767

llvm::AMDGPU::isIntrinsicAlwaysUniform
bool isIntrinsicAlwaysUniform(unsigned IntrID)
Definition: AMDGPUBaseInfo.cpp:2932

llvm::AMDGPU::isIntrinsicSourceOfDivergence
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
Definition: AMDGPUBaseInfo.cpp:2928

llvm::AMDGPU::isExtendedGlobalAddrSpace
bool isExtendedGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:422

llvm::CallingConv::C
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34

llvm::FPOpFusion::Fast
@ Fast
Definition: TargetOptions.h:37

llvm::ISD::ADD
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246

llvm::ISD::FSUB
@ FSUB
Definition: ISDOpcodes.h:398

llvm::ISD::FADD
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397

llvm::ISD::SRL
@ SRL
Definition: ISDOpcodes.h:736

llvm::ISD::SRA
@ SRA
Definition: ISDOpcodes.h:735

llvm::ISD::FNEG
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:960

llvm::ISD::OR
@ OR
Definition: ISDOpcodes.h:709

llvm::ISD::SHL
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:734

llvm::ISD::XOR
@ XOR
Definition: ISDOpcodes.h:710

llvm::ISD::FMUL
@ FMUL
Definition: ISDOpcodes.h:399

llvm::ISD::SUB
@ SUB
Definition: ISDOpcodes.h:247

llvm::ISD::FDIV
@ FDIV
Definition: ISDOpcodes.h:400

llvm::ISD::FREM
@ FREM
Definition: ISDOpcodes.h:401

llvm::ISD::AND
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:708

llvm::ISD::MUL
@ MUL
Definition: ISDOpcodes.h:248

llvm::InlineConstants::getInstrCost
int getInstrCost()
Definition: InlineCost.cpp:195

llvm::Intrinsic::getDeclaration
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1513

llvm::PatternMatch
Definition: PatternMatch.h:47

llvm::PatternMatch::m_AShr
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1236

llvm::PatternMatch::m_c_And
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
Definition: PatternMatch.h:2711

llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49

llvm::PatternMatch::m_ConstantInt
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168

llvm::PatternMatch::m_FPOne
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
Definition: PatternMatch.h:921

llvm::PatternMatch::m_Value
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92

llvm::PatternMatch::m_LShr
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1230

llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:137

llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443

llvm::dwarf::Index
Index
Definition: Dwarf.h:875

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18

llvm::Length
@ Length
Definition: DWP.cpp:480

llvm::Depth
@ Depth
Definition: SIMachineScheduler.h:36

llvm::getUnderlyingObject
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
Definition: ValueTracking.cpp:6535

llvm::findOptionMDForLoop
MDNode * findOptionMDForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for a loop.
Definition: LoopInfo.cpp:1055

llvm::MinAlign
constexpr T MinAlign(U A, V B)
A and B are either alignments or offsets.
Definition: MathExtras.h:366

llvm::any_of
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729

llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163

llvm::none_of
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736

llvm::AtomicOrdering
AtomicOrdering
Atomic ordering for LLVM's memory model.
Definition: AtomicOrdering.h:56

llvm::AtomicOrdering::SequentiallyConsistent
@ SequentiallyConsistent

llvm::RecurKind::FAdd
@ FAdd
Sum of floats.

llvm::alignTo
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155

llvm::computeKnownBits
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
Definition: ValueTracking.cpp:165

llvm::ComputeValueVTs
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:79

llvm::count_if
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1921

llvm::Cost
InstructionCost Cost
Definition: FunctionSpecialization.h:95

llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39

llvm::DenormalMode::getPreserveSign
static constexpr DenormalMode getPreserveSign()
Definition: FloatingPointMode.h:118

llvm::EVT
Extended Value Type.
Definition: ValueTypes.h:34

llvm::EVT::getScalarSizeInBits
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370

llvm::KnownBits
Definition: KnownBits.h:23

llvm::KnownBits::countMinLeadingOnes
unsigned countMinLeadingOnes() const
Returns the minimum number of leading one bits.
Definition: KnownBits.h:240

llvm::MemIntrinsicInfo
Information about a load/store intrinsic defined by the target.
Definition: TargetTransformInfo.h:73

llvm::SIModeRegisterDefaults
Definition: SIModeRegisterDefaults.h:20

llvm::SIModeRegisterDefaults::isInlineCompatible
bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const
Definition: SIModeRegisterDefaults.h:85

llvm::TargetTransformInfo::OperandValueInfo
Definition: TargetTransformInfo.h:1098

llvm::TargetTransformInfo::PeelingPreferences
Definition: TargetTransformInfo.h:646

llvm::TargetTransformInfo::UnrollingPreferences
Parameters that control the generic loop unrolling transformation.
Definition: TargetTransformInfo.h:531

llvm::TargetTransformInfo::UnrollingPreferences::MaxCount
unsigned MaxCount
Definition: TargetTransformInfo.h:572

llvm::TargetTransformInfo::UnrollingPreferences::Threshold
unsigned Threshold
The cost threshold for the unrolled loop.
Definition: TargetTransformInfo.h:539

llvm::TargetTransformInfo::UnrollingPreferences::UnrollVectorizedLoop
bool UnrollVectorizedLoop
Don't disable runtime unroll for the loops which were vectorized.
Definition: TargetTransformInfo.h:616

llvm::TargetTransformInfo::UnrollingPreferences::MaxIterationsCountToAnalyze
unsigned MaxIterationsCountToAnalyze
Don't allow loop unrolling to simulate more than this number of iterations when checking full unroll ...
Definition: TargetTransformInfo.h:614

llvm::TargetTransformInfo::UnrollingPreferences::BEInsns
unsigned BEInsns
Definition: TargetTransformInfo.h:585

llvm::TargetTransformInfo::UnrollingPreferences::PartialThreshold
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
Definition: TargetTransformInfo.h:556

llvm::TargetTransformInfo::UnrollingPreferences::Runtime
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
Definition: TargetTransformInfo.h:592

llvm::TargetTransformInfo::UnrollingPreferences::Partial
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Definition: TargetTransformInfo.h:588

llvm::cl::desc
Definition: CommandLine.h:409