21 #include "llvm/IR/IntrinsicsAMDGPU.h"
25 #define DEBUG_TYPE "amdgpu-simplifylib"
30     cl::desc("Enable pre-link mode optimizations"),
35     cl::desc("Comma separated list of functions to replace with native, or all"),
39 #define MATH_PI numbers::pi
40 #define MATH_E numbers::e
41 #define MATH_SQRT2 numbers::sqrt2
42 #define MATH_SQRT1_2 numbers::inv_sqrt2
54   bool AllNative = false;
87   bool evaluateScalarMathFunc(const FuncInfo &FInfo, double& Res0,
176                 "Simplify well-known AMD library calls", false, false)
182                 "Replace builtin math calls with that native versions.",
185 template <typename IRB>
190   R->setCallingConv(F->getCallingConv());
194 template <typename IRB>
199   R->setCallingConv(F->getCallingConv());
449 bool AMDGPULibCalls::parseFunctionName(const StringRef &FMangledName,
455   if (auto Op = dyn_cast<FPMathOperator>(CI))
459   Attribute Attr = F->getFnAttribute("unsafe-fp-math");
463 bool AMDGPULibCalls::useNativeFunc(const StringRef F) const {
468   AllNative = useNativeFunc("all") ||
473 bool AMDGPULibCalls::sincosUseNative(CallInst *aCI, const FuncInfo &FInfo) {
474   bool native_sin = useNativeFunc("sin");
475   bool native_cos = useNativeFunc("cos");
477   if (native_sin && native_cos) {
492     if (sinExpr && cosExpr) {
498                       << " with native version of sin/cos");
512   if (!parseFunctionName(Callee->getName(), FInfo) || !FInfo.isMangled() ||
515       !(AllNative || useNativeFunc(FInfo.getName()))) {
520     return sincosUseNative(aCI, FInfo);
529                     << " with native version");
541                                            const FuncInfo &FInfo) {
543   if (!Callee->isDeclaration())
546   assert(Callee->hasName() && "Invalid read_pipe/write_pipe function");
548   auto &Ctx = M->getContext();
549   std::string Name = std::string(Callee->getName());
551   if (NumArg != 4 && NumArg != 6)
555   if (!isa<ConstantInt>(PacketSize) || !isa<ConstantInt>(PacketAlign))
557   unsigned Size = cast<ConstantInt>(PacketSize)->getZExtValue();
558   Align Alignment = cast<ConstantInt>(PacketAlign)->getAlignValue();
559   if (Alignment != Size)
573   for (unsigned I = 0; I != PtrArgLoc; ++I)
575     ArgTys.push_back(PtrTy);
585   auto *BCast = B.CreatePointerCast(PtrArg, PtrTy);
587   for (unsigned I = 0; I != PtrArgLoc; ++I)
589     Args.push_back(BCast);
591   auto *NCI = B.CreateCall(F, Args);
618 B.setFastMathFlags(FPOp->getFastMathFlags());
620   switch (Callee->getIntrinsicID()) {
623   case Intrinsic::amdgcn_wavefrontsize:
628   if (!parseFunctionName(Callee->getName(), FInfo))
635   if (TDOFold(CI, FInfo))
645   switch (FInfo.getId()) {
650            "recip must be an either native or half function");
657            "divide must be an either native or half function");
663     return fold_pow(CI, B, FInfo);
682     return fold_sincos(CI, B, AA);
689     return fold_read_write_pipe(CI, B, FInfo);
698 bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
705   const TableEntry * const ftbl = tr.table;
711     for (int eltNo = 0; eltNo < getVecSize(FInfo); ++eltNo) {
713           CV->getElementAsConstant((unsigned)eltNo));
714       assert(eltval && "Non-FP arguments in math function!");
716         for (int i=0; i < sz; ++i) {
718             DVal.push_back(ftbl[i].result);
732       for (unsigned i = 0; i < DVal.size(); ++i) {
733         FVal.push_back((float)DVal[i]);
747     if (ConstantFP *CF = dyn_cast<ConstantFP>(opr0)) {
748       for (int i = 0; i < sz; ++i) {
749         if (CF->isExactlyValue(ftbl[i].input)) {
764 const FuncInfo &FInfo) {
766   if (ConstantFP *CF = dyn_cast<ConstantFP>(opr0)) {
782                                    const FuncInfo &FInfo) {
793                                    opr1, "__div2recip");
794   Value *nval = B.CreateFMul(opr0, nval1, "__div2mul");
803 #if _XOPEN_SOURCE >= 600 || defined(_ISOC99_SOURCE) || _POSIX_C_SOURCE >= 200112L
812 const FuncInfo &FInfo) {
816 "fold_pow: encounter a wrong function call");
826 CZero = dyn_cast<ConstantAggregateZero>(opr1);
829 CF = dyn_cast<ConstantFP>(opr1);
830 CINT = dyn_cast<ConstantInt>(opr1);
833     assert(VTy && "Oprand of vector function should be of vectortype");
838     CF = CDV ? dyn_cast_or_null<ConstantFP>(CDV->getSplatValue()) : nullptr;
839     CINT = CDV ? dyn_cast_or_null<ConstantInt>(CDV->getSplatValue()) : nullptr;
849   if ((CF && CF->isZero()) || (CINT && ci_opr1 == 0) || CZero) {
867     LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " * " << *opr0
869     Value *nval = B.CreateFMul(opr0, opr0, "__pow2");
873   if ((CF && CF->isExactlyValue(-1.0)) || (CINT && ci_opr1 == -1)) {
880     Value *nval = B.CreateFDiv(cnval, opr0, "__powrecip");
894                       << FInfo.getName().c_str() << "(" << *opr0 << ")\n");
895       Value *nval = CreateCallEx(B,FPExpr, opr0, issqrt ? "__pow2sqrt"
912     int ival = (int)dval;
913     if ((double)ival == dval) {
916       ci_opr1 = 0x11111111;
921     unsigned abs_opr1 = (ci_opr1 < 0) ? -ci_opr1 : ci_opr1;
922     if (abs_opr1 <= 12) {
932       Value *valx2 = nullptr;
934       while (abs_opr1 > 0) {
935         valx2 = valx2 ? B.CreateFMul(valx2, valx2, "__powx2") : opr0;
937           nval = nval ? B.CreateFMul(nval, valx2, "__powprod") : valx2;
948         nval = B.CreateFDiv(cnval, nval, "__1powprod");
951                  << ((ci_opr1 < 0) ? "1/prod(" : "prod(") << *opr0
964   bool needlog = false;
965   bool needabs = false;
966   bool needcopysign = false;
969     CF = dyn_cast<ConstantFP>(opr0);
993            "Wrong vector size detected");
1000       if (V < 0.0) needcopysign = true;
1006     for (unsigned i=0; i < DVal.size(); ++i) {
1007       FVal.push_back((float)DVal[i]);
1022   if (const ConstantFP *CF = dyn_cast<ConstantFP>(opr1)) {
1026       if (y != (double)(int64_t)y)
1036       if (y != (double)(int64_t)y)
1050     nval = CreateCallEx(B, AbsExpr, opr0, "__fabs");
1052     nval = cnval ? cnval : opr0;
1059   nval = CreateCallEx(B,LogExpr, nval, "__log2");
1064     opr1 = B.CreateSIToFP(opr1, nval->getType(), "pownI2F");
1066   nval = B.CreateFMul(opr1, nval, "__ylogx");
1067   nval = CreateCallEx(B,ExpExpr, nval, "__exp2");
1074     if (const auto *vTy = dyn_cast<FixedVectorType>(rTy))
1079       opr_n = B.CreateZExtOrBitCast(opr_n, nTy, "__ytou");
1081       opr_n = B.CreateFPToSI(opr1, nTy, "__ytou");
1083     Value *sign = B.CreateShl(opr_n, size-1, "__yeven");
1084     sign = B.CreateAnd(B.CreateBitCast(opr0, nTy), sign, "__pow_sign");
1085     nval = B.CreateOr(B.CreateBitCast(nval, nTy), sign);
1086     nval = B.CreateBitCast(nval, opr0->getType());
1090                     << "exp2(" << *opr1 << " * log2(" << *opr0 << "))\n");
1097 const FuncInfo &FInfo) {
1116     Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2sqrt");
1120   } else if (ci_opr1 == 3) {
1125     Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2cbrt");
1129   } else if (ci_opr1 == -1) {
1136   } else if (ci_opr1 == -2) {
1142     Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2rsqrt");
1151 const FuncInfo &FInfo) {
1156 ConstantFP *CF0 = dyn_cast<ConstantFP>(opr0);
1157 ConstantFP *CF1 = dyn_cast<ConstantFP>(opr1);
1158   if ((CF0 && CF0->isZero()) || (CF1 && CF1->isZero())) {
1166     LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr1 << " + " << *opr2
1168     Value *nval = B.CreateFAdd(opr1, opr2, "fmaadd");
1174     LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " + " << *opr2
1176     Value *nval = B.CreateFAdd(opr0, opr2, "fmaadd");
1180   if (ConstantFP *CF = dyn_cast<ConstantFP>(opr2)) {
1185       Value *nval = B.CreateFMul(opr0, opr1, "fmamul");
1196 const FuncInfo &FInfo) {
1199 FuncInfo nf = FInfo;
1201 return getFunction(M, nf);
1206 const FuncInfo &FInfo) {
1213                     << "sqrt(" << *opr0 << ")\n");
1214     Value *nval = CreateCallEx(B,FPExpr, opr0, "__sqrt");
1236 int const MaxScan = 30;
1237   bool Changed = false;
1240 LoadInst *LI = dyn_cast<LoadInst>(CArgVal);
1256   std::string const PairName = fInfo.mangle();
1260 CallInst *XI = dyn_cast_or_null<CallInst>(U);
1272   for (int I = MaxScan; I > 0 && BBI != CBB->begin(); --BBI, --I) {
1273 if (cast<Instruction>(BBI) == XI) {
1295   AllocaInst *Alloc = insertAlloca(UI, B, "__sincos_");
1296 B.SetInsertPoint(UI);
1304     P = B.CreateAddrSpaceCast(Alloc, PTy);
1307   LLVM_DEBUG(errs() << "AMDIC: fold_sincos (" << *CI << ", " << *UI << ") with "
1311 B.SetInsertPoint(&*ItOld);
1313     Instruction *Reload = B.CreateLoad(Alloc->getAllocatedType(), Alloc);
1318     Instruction *Reload = B.CreateLoad(Alloc->getAllocatedType(), Alloc);
1339   unsigned N = ST.getWavefrontSize();
1353   assert(BB && "Entry block not found!");
1360 const char *prefix) {
1364 B.SetInsertPoint(&*ItNew);
1366       B.CreateAlloca(RetType, nullptr, std::string(prefix) + UI->getName());
1367   Alloc->setAlignment(
1372 bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo,
1373                                             double& Res0, double& Res1,
1379 double opr0=0.0, opr1=0.0, opr2=0.0;
1380 ConstantFP *fpopr0 = dyn_cast_or_null<ConstantFP>(copr0);
1381 ConstantFP *fpopr1 = dyn_cast_or_null<ConstantFP>(copr1);
1382 ConstantFP *fpopr2 = dyn_cast_or_null<ConstantFP>(copr2);
1401 switch (FInfo.getId()) {
1402   default : return false;
1410 Res0 = log(opr0 + sqrt(opr0*opr0 - 1.0));
1423 Res0 = log(opr0 + sqrt(opr0*opr0 + 1.0));
1436 Res0 = (log(opr0 + 1.0) - log(opr0 - 1.0))/2.0;
1444 Res0 = (opr0 < 0.0) ? -pow(-opr0, 1.0/3.0) : pow(opr0, 1.0/3.0);
1464 Res0 = pow(2.0, opr0);
1468 Res0 = pow(10.0, opr0);
1472 Res0 = exp(opr0) - 1.0;
1480 Res0 = log(opr0) / log(2.0);
1484 Res0 = log(opr0) / log(10.0);
1488 Res0 = 1.0 / sqrt(opr0);
1530 Res0 = pow(opr0, opr1);
1534     if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) {
1535       double val = (double)iopr1->getSExtValue();
1536       Res0 = pow(opr0, val);
1543     if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) {
1544       double val = (double)iopr1->getSExtValue();
1545       Res0 = pow(opr0, 1.0 / val);
1560 Res0 = opr0 * opr1 + opr2;
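The host-side folding formulas above are the standard identities for functions without a direct libm counterpart, e.g. log(x + sqrt(x*x - 1)) is acosh(x) and log(x + sqrt(x*x + 1)) is asinh(x), while cbrt handles a negative operand by negating explicitly. A small illustrative check (not part of the pass):

#include <cmath>

static double eval_acosh(double x) { return std::log(x + std::sqrt(x * x - 1.0)); }
static double eval_cbrt(double x) {
  return (x < 0.0) ? -std::pow(-x, 1.0 / 3.0) : std::pow(x, 1.0 / 3.0);
}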
1567 bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
1576   if ((copr0 = dyn_cast<Constant>(aCI->getArgOperand(0))) == nullptr)
1581     if ((copr1 = dyn_cast<Constant>(aCI->getArgOperand(1))) == nullptr) {
1588       if ((copr2 = dyn_cast<Constant>(aCI->getArgOperand(2))) == nullptr)
1595 double DVal0[16], DVal1[16];
1598 if (FuncVecSize == 1) {
1599 if (!evaluateScalarMathFunc(FInfo, DVal0[0],
1600 DVal1[0], copr0, copr1, copr2)) {
1607   for (int i = 0; i < FuncVecSize; ++i) {
1611     if (!evaluateScalarMathFunc(FInfo, DVal0[i],
1612                                 DVal1[i], celt0, celt1, celt2)) {
1620 if (FuncVecSize == 1) {
1627     for (int i = 0; i < FuncVecSize; ++i)
1628       FVal0.push_back((float)DVal0[i]);
1631     if (hasTwoResults) {
1632       for (int i = 0; i < FuncVecSize; ++i)
1633         FVal1.push_back((float)DVal1[i]);
1640 if (hasTwoResults) {
1647 if (hasTwoResults) {
1650 "math function with ptr arg not supported yet");
1660   return new AMDGPUSimplifyLibCalls(TM);
1664 return new AMDGPUUseNativeCalls();
1668   if (skipFunction(F))
1671   bool Changed = false;
1672 auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1675              F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << '\n';);
1677   for (auto &BB : F) {
1688       if (Callee == nullptr)
1693       if(Simplifier.fold(CI, AA))
1705   bool Changed = false;
1709              F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << '\n';);
1711   for (auto &BB : F) {
1727       if (Simplifier.fold(CI, AA))
1738   bool Changed = false;
1739   for (auto &BB : F) {
1748       if (Callee == nullptr)
1766   bool Changed = false;
1767   for (auto &BB : F) {
A set of analyses that are preserved following a run of a transformation pass.
A manager for alias analyses.
float convertToFloat() const
Converts this APFloat to host float value.
AMDGPULibCalls(const TargetMachine *TM_=nullptr)
static TableRef getOptTable(AMDGPULibFunc::EFuncId id)
static const TableEntry tbl_asin[]
static const TableEntry tbl_sinpi[]
This is an optimization pass for GlobalISel generic memory operations.
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
static const TableEntry tbl_tan[]
InstListType::iterator iterator
Instruction iterators...
const Function * getParent() const
Return the enclosing method, or null if none.
static const TableEntry tbl_acos[]
static const TableEntry tbl_log2[]
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
void setCalledFunction(Function *Fn)
Sets the function called, including updating the function type.
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
static const TableEntry tbl_atanh[]
void dropAllReferences()
Drop all references to operands.
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static const TableEntry tbl_log[]
static FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
static const TableEntry tbl_log10[]
static Constant * get(LLVMContext &Context, ArrayRef< uint8_t > Elts)
get() constructors - Return a constant with vector type with an element count and element type matchi...
float getElementAsFloat(unsigned i) const
If this is a sequential container of floats, return the specified element as a float.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
static bool parse(StringRef MangledName, AMDGPULibFunc &Ptr)
bool isZero() const
Return true if the value is positive or negative zero.
The instances of the Type class are immutable: once they are created, they are never changed.
AttributeList getAttributes() const
Return the parameter attributes for this call.
All zero aggregate value.
const APFloat & getValueAPF() const
static const TableEntry tbl_cbrt[]
Type * getElementType() const
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
bool getValueAsBool() const
Return the attribute's value as a boolean.
void initializeAMDGPUSimplifyLibCallsPass(PassRegistry &)
static const TableEntry tbl_erf[]
LLVM Basic Block Representation.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
void replaceCall(Value *With)
Param * getLeads()
Get leading parameters for mangled lib functions.
static unsigned getEPtrKindFromAddrSpace(unsigned AS)
Wrapper class for AMDGPULIbFuncImpl.
static const TableEntry tbl_acospi[]
This is the shared class of boolean and integer constants.
#define DEBUG_WITH_TYPE(TYPE, X)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
FunctionPass * createAMDGPUUseNativeCallsPass()
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static const TableEntry tbl_exp[]
bool useNative(CallInst *CI)
static FunctionCallee getOrInsertFunction(llvm::Module *M, const AMDGPULibFunc &fInfo)
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
iterator begin()
Instruction iterator methods.
void initializeAMDGPUUseNativeCallsPass(PassRegistry &)
static CallInst * Create(FunctionType *Ty, Value *F, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Represent the analysis usage information of a pass.
static const TableEntry tbl_asinpi[]
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
double convertToDouble() const
Converts this APFloat to host double value.
Constant * getSplatValue() const
If this is a splat constant, meaning that all of the elements have the same value,...
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
ConstantFP - Floating Point Values [float, double].
bool isExactlyValue(const APFloat &V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
FunctionType * getFunctionType()
static int getVecSize(const AMDGPULibFunc &FInfo)
static const TableEntry tbl_expm1[]
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
This struct is a compact representation of a valid (non-zero power of two) alignment.
static const TableEntry tbl_atan[]
bool isLifetimeStartOrEnd() const
Return true if the instruction is a llvm.lifetime.start or llvm.lifetime.end marker.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
static bool HasNative(AMDGPULibFunc::EFuncId id)
static const TableEntry tbl_tanh[]
LLVM_NODISCARD bool equals(StringRef RHS) const
equals - Check for string equality; this is more efficient than compare() when the relative ordering ...
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Base class of all SIMD vector types.
An instruction for storing to memory.
This is an important base class in LLVM.
bool isUnsafeMath(const CallInst *CI) const
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_NODISCARD bool equals_insensitive(StringRef RHS) const
Check for string equality, ignoring case.
Type * getReturnType() const
Returns the type of the ret val.
constexpr LLVM_NODISCARD bool empty() const
empty - Check if the string is empty.
static const TableEntry tbl_cospi[]
Module * getParent()
Get the module that this global value is contained inside of...
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
This is an important class for using LLVM in a threaded context.
std::string mangle() const
A vector constant whose element type is a simple 1/2/4/8-byte integer or float/double,...
Type * getParamType(unsigned i) const
Parameter type accessors.
static AMDGPULibFunc::EType getArgType(const AMDGPULibFunc &FInfo)
initializer< Ty > init(const Ty &Val)
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.
static const TableEntry tbl_asinh[]
unsigned getNumArgs() const
bool isNegative() const
Return true if the sign bit is set.
Utility class for floating point operations which can have information about relaxed accuracy require...
Primary interface to the complete machine description for the target machine.
Constant * getElementAsConstant(unsigned i) const
Return a Constant for a specified index's element.
A Module instance is used to store all the information related to an LLVM module.
static const TableEntry tbl_cos[]
@ PRIVATE_ADDRESS
Address space for private memory.
LLVM_NODISCARD bool contains_insensitive(StringRef Other) const
Return true if the given string is a substring of *this, and false otherwise.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
StringRef - Represent a constant reference to a string, i.e.
static const TableEntry tbl_erfc[]
unsigned getNumUses() const
This method computes the number of uses of this Value.
Type * getType() const
All values are typed, get the type of this value.
static const TableEntry tbl_atanpi[]
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
static const TableEntry tbl_tanpi[]
self_iterator getIterator()
StringRef getTargetFeatureString() const
void setPrefix(ENamePrefix PFX)
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
StringRef getName() const
Return a constant reference to the value's name.
An instruction for reading from memory.
static cl::list< std::string > UseNative("amdgpu-use-native", cl::desc("Comma separated list of functions to replace with native, or all"), cl::CommaSeparated, cl::ValueOptional, cl::Hidden)
LLVMContext & getContext() const
Get the context in which this basic block lives.
static bool runOnFunction(Function &F, bool PostInlining)
static cl::opt< bool > EnablePreLink("amdgpu-prelink", cl::desc("Enable pre-link mode optimizations"), cl::init(false), cl::Hidden)
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static IntegerType * getInt64Ty(LLVMContext &C)
INITIALIZE_PASS_BEGIN(AMDGPUSimplifyLibCalls, "amdgpu-simplifylib", "Simplify well-known AMD library calls", false, false) INITIALIZE_PASS_END(AMDGPUSimplifyLibCalls
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
double getElementAsDouble(unsigned i) const
If this is a sequential container of doubles, return the specified element as a double.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
unsigned arg_size() const
static Constant * get(Type *Ty, double V)
This returns a ConstantFP, or a vector containing a splat of a ConstantFP, for the specified value in...
TableRef(const TableEntry(&tbl)[N])
ENamePrefix getPrefix() const
Value * FindAvailableLoadedValue(LoadInst *Load, BasicBlock *ScanBB, BasicBlock::iterator &ScanFrom, unsigned MaxInstsToScan=DefMaxInstsToScan, AAResults *AA=nullptr, bool *IsLoadCSE=nullptr, unsigned *NumScanedInst=nullptr)
Scan backwards to see if we have the value of the given load available locally within a small number ...
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
static double log2(double V)
static const TableEntry tbl_rsqrt[]
StringRef getTargetCPU() const
static const TableEntry tbl_exp10[]
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
bool fold(CallInst *CI, AliasAnalysis *AA=nullptr)
static const TableEntry tbl_sin[]
Value * getArgOperand(unsigned i) const
static Constant * getSplat(unsigned NumElts, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static const TableEntry tbl_exp2[]
@ FLAT_ADDRESS
Address space for flat memory.
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
std::string getName() const
Get unmangled name for mangled library function and name for unmangled library function.
const BasicBlock * getParent() const
std::string to_string(const T &Value)
static const TableEntry tbl_sinh[]
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
static const TableEntry tbl_cosh[]
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
A container for analyses that lazily runs them and caches their results.
FunctionPass class - This class is used to implement most global optimizations.
This class represents a function call, abstracting a target machine's calling convention.
static const TableEntry tbl_sqrt[]
AnalysisUsage & addRequired()
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
an instruction to allocate memory on the stack
APFloat abs(APFloat X)
Returns the absolute value of the argument.
LLVM Value Representation.
iterator_range< user_iterator > users()
static const TableEntry tbl_acosh[]
static const TableEntry tbl_tgamma[]
FunctionPass * createAMDGPUSimplifyLibCallsPass(const TargetMachine *)
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
unsigned getNumElements() const
Return the number of elements in the array or vector.