30#include "llvm/IR/IntrinsicsAMDGPU.h"
41#define DEBUG_TYPE "amdgpu-codegenprepare"
49 "amdgpu-codegenprepare-widen-constant-loads",
50 cl::desc(
"Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
55 BreakLargePHIs(
"amdgpu-codegenprepare-break-large-phis",
56 cl::desc(
"Break large PHI nodes for DAGISel"),
60 ForceBreakLargePHIs(
"amdgpu-codegenprepare-force-break-large-phis",
61 cl::desc(
"For testing purposes, always break large "
62 "PHIs even if it isn't profitable."),
66 "amdgpu-codegenprepare-break-large-phis-threshold",
67 cl::desc(
"Minimum type size in bits for breaking large PHI nodes"),
71 "amdgpu-codegenprepare-mul24",
72 cl::desc(
"Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
78 "amdgpu-codegenprepare-expand-div64",
79 cl::desc(
"Expand 64-bit division in AMDGPUCodeGenPrepare"),
86 "amdgpu-codegenprepare-disable-idiv-expansion",
87 cl::desc(
"Prevent expanding integer division in AMDGPUCodeGenPrepare"),
93 "amdgpu-codegenprepare-disable-fdiv-expansion",
94 cl::desc(
"Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
98class AMDGPUCodeGenPrepareImpl
99 :
public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
108 const bool HasFP32DenormalFlush;
109 bool FlowChanged =
false;
110 mutable Function *SqrtF32 =
nullptr;
111 mutable Function *LdexpF32 =
nullptr;
120 DL(
F.getDataLayout()), SQ(
DL, TLI, DT, AC),
130 F.getParent(), Intrinsic::amdgcn_sqrt, {Type::getFloatTy(Ctx)});
140 F.getParent(), Intrinsic::ldexp,
141 {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
145 bool canBreakPHINode(
const PHINode &
I);
148 bool isLegalFloatingTy(
const Type *
T)
const;
157 bool canIgnoreDenormalInput(
const Value *V,
const Instruction *CtxI)
const {
158 return HasFP32DenormalFlush ||
183 unsigned MaxDivBits,
bool Signed)
const;
189 bool IsSigned)
const;
193 bool IsDiv,
bool IsSigned)
const;
211 bool canWidenScalarExtLoad(
LoadInst &
I)
const;
226 float ReqdAccuracy)
const;
231 float ReqdAccuracy)
const;
233 std::pair<Value *, Value *> getFrexpResults(
IRBuilder<> &Builder,
237 bool IsNegative)
const;
244 bool IsNegative)
const;
248 void replaceWithMaskedWorkitemIdX(
Instruction &
I,
unsigned WaveSize)
const;
249 bool tryReplaceWithWorkitemId(
Instruction &
I,
unsigned Wave)
const;
284 if (!ExpandDiv64InIR)
288 StringRef getPassName()
const override {
return "AMDGPU IR optimizations"; }
293bool AMDGPUCodeGenPrepareImpl::run() {
294 BreakPhiNodesCache.clear();
295 bool MadeChange =
false;
307 while (!DeadVals.empty()) {
315bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(
const Type *Ty)
const {
317 (Ty->
isHalfTy() && ST.has16BitInsts());
320bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &
I)
const {
321 Type *Ty =
I.getType();
322 int TySize =
DL.getTypeSizeInBits(Ty);
323 Align Alignment =
DL.getValueOrABITypeAlignment(
I.getAlign(), Ty);
325 return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.
isUniformAtDef(&
I);
329AMDGPUCodeGenPrepareImpl::numBitsUnsigned(
Value *
Op,
330 const Instruction *CtxI)
const {
335AMDGPUCodeGenPrepareImpl::numBitsSigned(
Value *
Op,
336 const Instruction *CtxI)
const {
348 for (
int I = 0,
E = VT->getNumElements();
I !=
E; ++
I)
349 Values.
push_back(Builder.CreateExtractElement(V,
I));
355 if (!Ty->isVectorTy()) {
361 for (
int I = 0,
E = Values.
size();
I !=
E; ++
I)
362 NewVal = Builder.CreateInsertElement(NewVal, Values[
I],
I);
367bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &
I)
const {
368 if (
I.getOpcode() != Instruction::Mul)
371 Type *Ty =
I.getType();
373 if (
Size <= 16 && ST.has16BitInsts())
383 Builder.SetCurrentDebugLocation(
I.getDebugLoc());
385 unsigned LHSBits = 0, RHSBits = 0;
386 bool IsSigned =
false;
388 if (ST.
hasMulU24() && (LHSBits = numBitsUnsigned(
LHS, &
I)) <= 24 &&
389 (RHSBits = numBitsUnsigned(
RHS, &
I)) <= 24) {
392 }
else if (ST.
hasMulI24() && (LHSBits = numBitsSigned(
LHS, &
I)) <= 24 &&
393 (RHSBits = numBitsSigned(
RHS, &
I)) <= 24) {
399 SmallVector<Value *, 4> LHSVals;
400 SmallVector<Value *, 4> RHSVals;
401 SmallVector<Value *, 4> ResultVals;
405 IntegerType *I32Ty = Builder.getInt32Ty();
406 IntegerType *IntrinTy =
Size > 32 ? Builder.getInt64Ty() : I32Ty;
407 Type *DstTy = LHSVals[0]->getType();
409 for (
int I = 0,
E = LHSVals.
size();
I !=
E; ++
I) {
410 Value *
LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[
I], I32Ty)
411 : Builder.CreateZExtOrTrunc(LHSVals[
I], I32Ty);
412 Value *
RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[
I], I32Ty)
413 : Builder.CreateZExtOrTrunc(RHSVals[
I], I32Ty);
415 IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
417 Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
418 : Builder.CreateZExtOrTrunc(Result, DstTy);
424 I.replaceAllUsesWith(NewVal);
425 DeadVals.push_back(&
I);
445bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO)
const {
466 if (!CBO || !CT || !CF)
493 Builder.setFastMathFlags(FPOp->getFastMathFlags());
499 DeadVals.push_back(&BO);
501 DeadVals.push_back(CastOp);
502 DeadVals.push_back(Sel);
506std::pair<Value *, Value *>
507AMDGPUCodeGenPrepareImpl::getFrexpResults(
IRBuilder<> &Builder,
509 Type *Ty = Src->getType();
522 : Builder.CreateExtractValue(Frexp, {1});
523 return {FrexpMant, FrexpExp};
529 bool IsNegative)
const {
544 auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
547 return Builder.
CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
553 FastMathFlags FMF)
const {
557 if (HasFP32DenormalFlush && ST.
hasFractBug() && !ST.hasFastFMAF32() &&
563 auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder,
RHS);
568 auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder,
LHS);
580 FastMathFlags FMF)
const {
581 Type *Ty = Src->getType();
585 Builder.
CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
588 Value *InputScaleFactor =
595 Value *OutputScaleFactor =
597 return Builder.
CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});
608 Type *Ty = Src->getType();
612 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
613 Constant *One = ConstantFP::get(Ty, 1.0);
614 Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
616 ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);
618 Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One);
620 Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor);
621 Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput);
622 Value *OutputScaleFactor = Builder.CreateSelect(
623 NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);
625 return Builder.CreateFMul(Rsq, OutputScaleFactor);
631 FastMathFlags SqrtFMF,
632 FastMathFlags DivFMF,
633 const Instruction *CtxI,
634 bool IsNegative)
const {
656 bool MaybePosInf = !SqrtFMF.
noInfs() && !DivFMF.
noInfs();
657 bool MaybeZero = !DivFMF.
noInfs();
659 DenormalMode DenormMode;
666 if (Interested !=
fcNone) {
671 DenormMode =
F.getDenormalMode(
X->getType()->getFltSemantics());
677 if (MaybeZero || MaybePosInf) {
679 if (MaybePosInf && MaybeZero) {
680 if (DenormMode.
Input != DenormalMode::DenormalModeKind::Dynamic) {
695 }
else if (MaybeZero) {
708 Value *
E = Builder.
CreateFMA(NegXY0, Y0, ConstantFP::get(
X->getType(), 1.0));
713 ConstantFP::get(
X->getType(), 0.5));
715 return Builder.
CreateFMA(Y0E, EFMA, IsNegative ? NegY0 : Y0);
718bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(FastMathFlags DivFMF,
719 FastMathFlags SqrtFMF)
const {
725Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
727 const FastMathFlags SqrtFMF,
const Instruction *CtxI)
const {
738 bool IsNegative =
false;
743 IRBuilder<>::FastMathFlagGuard Guard(Builder);
748 canIgnoreDenormalInput(Den, CtxI)) {
759 return emitRsqF64(Builder, Den, SqrtFMF, DivFMF, CtxI, IsNegative);
773 Value *Den, FastMathFlags FMF,
774 const Instruction *CtxI)
const {
781 bool IsNegative =
false;
786 if (HasFP32DenormalFlush || FMF.
approxFunc()) {
807 return emitRcpIEEE1ULP(Builder, Src, IsNegative);
816 if (HasFP32DenormalFlush || FMF.
approxFunc()) {
821 Value *Recip = emitRcpIEEE1ULP(Builder, Den,
false);
835Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
838 if (ReqdAccuracy < 2.5f)
844 bool NumIsOne =
false;
846 if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
854 if (!HasFP32DenormalFlush && !NumIsOne)
857 return Builder.
CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {Num, Den});
860Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
862 FastMathFlags SqrtFMF,
Value *RsqOp,
const Instruction *FDivInst,
863 float ReqdDivAccuracy)
const {
866 optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);
874 Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);
882 Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);
886 return emitFrexpDiv(Builder, Num, Den, DivFMF);
904bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
905 if (DisableFDivExpand)
920 FastMathFlags SqrtFMF;
925 Value *RsqOp =
nullptr;
927 if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
928 DenII->hasOneUse()) {
930 SqrtFMF = SqrtOp->getFastMathFlags();
931 if (canOptimizeWithRsq(DivFMF, SqrtFMF))
932 RsqOp = SqrtOp->getOperand(0);
936 if (!IsFloat && !RsqOp)
948 const bool AllowInaccurateRcp = DivFMF.
approxFunc();
949 if (!RsqOp && AllowInaccurateRcp)
953 if (IsFloat && ReqdAccuracy < 1.0f)
960 SmallVector<Value *, 4> NumVals;
961 SmallVector<Value *, 4> DenVals;
962 SmallVector<Value *, 4> RsqDenVals;
969 SmallVector<Value *, 4> ResultVals(NumVals.
size());
970 for (
int I = 0,
E = NumVals.
size();
I !=
E; ++
I) {
971 Value *NumElt = NumVals[
I];
972 Value *DenElt = DenVals[
I];
973 Value *RsqDenElt = RsqOp ? RsqDenVals[
I] :
nullptr;
976 visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
985 NewEltInst->copyMetadata(FDiv);
988 ResultVals[
I] = NewElt;
996 DeadVals.push_back(&FDiv);
1007 Value *LHS_EXT64 = Builder.CreateZExt(
LHS, I64Ty);
1008 Value *RHS_EXT64 = Builder.CreateZExt(
RHS, I64Ty);
1009 Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
1010 Value *
Lo = Builder.CreateTrunc(MUL64, I32Ty);
1011 Value *
Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
1012 Hi = Builder.CreateTrunc(
Hi, I32Ty);
1013 return std::pair(
Lo,
Hi);
1024unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &
I,
Value *Num,
1026 unsigned MaxDivBits,
1027 bool IsSigned)
const {
1034 unsigned DivBits = SSBits - RHSSignBits + 1;
1035 if (DivBits > MaxDivBits)
1040 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1041 DivBits = SSBits - SignBits + 1;
1049 if (RHSBits > MaxDivBits)
1055 unsigned DivBits = std::max(LHSBits, RHSBits);
1063 bool IsSigned)
const {
1064 unsigned DivBits = getDivNumBits(
I, Num, Den, 23, IsSigned);
1066 if (DivBits > (IsSigned ? 23 : 22))
1068 return expandDivRemToFloatImpl(Builder,
I, Num, Den, DivBits, IsDiv,
1072Value *AMDGPUCodeGenPrepareImpl::expandDivRemToFloatImpl(
1074 unsigned DivBits,
bool IsDiv,
bool IsSigned)
const {
1088 assert(0 < DivBits && DivBits <= (IsSigned ? 23 : 22) &&
1089 "abs(Num) must be <= than 0x40000 for expandDivRemToFloatImpl to work "
1097 ConstantInt *One = Builder.
getInt32(1);
1139 auto FMAD = !ST.hasMadMacF32Insts()
1178bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &
I,
1184 if (
C->getType()->getScalarSizeInBits() <= 32)
1200 if (BinOpDen->getOpcode() == Instruction::Shl &&
1218 return Builder.CreateAShr(V, Builder.getInt32(31));
1225 assert(
Opc == Instruction::URem ||
Opc == Instruction::UDiv ||
1226 Opc == Instruction::SRem ||
Opc == Instruction::SDiv);
1232 if (divHasSpecialOptimization(
I,
X,
Y))
1235 bool IsDiv =
Opc == Instruction::UDiv ||
Opc == Instruction::SDiv;
1236 bool IsSigned =
Opc == Instruction::SRem ||
Opc == Instruction::SDiv;
1238 Type *Ty =
X->getType();
1252 if (
Value *Res = expandDivRemToFloat(Builder,
I,
X,
Y, IsDiv, IsSigned)) {
1258 ConstantInt *One = Builder.
getInt32(1);
1260 Value *Sign =
nullptr;
1265 Sign = IsDiv ? Builder.
CreateXor(SignX, SignY) : SignX;
1346 BinaryOperator &
I,
Value *Num,
1348 if (!ExpandDiv64InIR && divHasSpecialOptimization(
I, Num, Den))
1353 bool IsDiv =
Opc == Instruction::SDiv ||
Opc == Instruction::UDiv;
1354 bool IsSigned =
Opc == Instruction::SDiv ||
Opc == Instruction::SRem;
1356 unsigned NumDivBits = getDivNumBits(
I, Num, Den, 32, IsSigned);
1357 if (NumDivBits > 32)
1360 Value *Narrowed =
nullptr;
1361 if (NumDivBits <= (IsSigned ? 23 : 22)) {
1362 Narrowed = expandDivRemToFloatImpl(Builder,
I, Num, Den, NumDivBits, IsDiv,
1364 }
else if (NumDivBits <= (IsSigned ? 31 : 32)) {
1369 Narrowed = expandDivRem32(Builder,
I, Num, Den);
1380void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &
I)
const {
1383 if (
Opc == Instruction::UDiv ||
Opc == Instruction::SDiv) {
1388 if (
Opc == Instruction::URem ||
Opc == Instruction::SRem) {
1408bool AMDGPUCodeGenPrepareImpl::tryNarrowMathIfNoOverflow(Instruction *
I) {
1409 unsigned Opc =
I->getOpcode();
1410 Type *OldType =
I->getType();
1412 if (
Opc != Instruction::Add &&
Opc != Instruction::Mul)
1417 if (
Opc != Instruction::Add &&
Opc != Instruction::Mul)
1419 "Instruction::Mul.");
1423 MaxBitsNeeded = std::max<unsigned>(
bit_ceil(MaxBitsNeeded), 8);
1424 Type *NewType =
DL.getSmallestLegalIntType(
I->getContext(), MaxBitsNeeded);
1428 if (NewBit >= OrigBit)
1440 int NumOfNonConstOps = 2;
1443 NumOfNonConstOps = 1;
1453 if (NewCost >= OldCost)
1464 DeadVals.push_back(
I);
1468bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &
I) {
1469 if (foldBinOpIntoSelect(
I))
1472 if (UseMul24Intrin && replaceMulWithMul24(
I))
1474 if (tryNarrowMathIfNoOverflow(&
I))
1479 Type *Ty =
I.getType();
1480 Value *NewDiv =
nullptr;
1485 if ((
Opc == Instruction::URem ||
Opc == Instruction::UDiv ||
1486 Opc == Instruction::SRem ||
Opc == Instruction::SDiv) &&
1488 !DisableIDivExpand) {
1489 Value *Num =
I.getOperand(0);
1490 Value *Den =
I.getOperand(1);
1497 for (
unsigned N = 0,
E = VT->getNumElements();
N !=
E; ++
N) {
1502 if (ScalarSize <= 32) {
1503 NewElt = expandDivRem32(Builder,
I, NumEltN, DenEltN);
1509 NewElt = shrinkDivRem64(Builder,
I, NumEltN, DenEltN);
1523 NewEltI->copyIRFlags(&
I);
1528 if (ScalarSize <= 32)
1529 NewDiv = expandDivRem32(Builder,
I, Num, Den);
1531 NewDiv = shrinkDivRem64(Builder,
I, Num, Den);
1538 I.replaceAllUsesWith(NewDiv);
1539 DeadVals.push_back(&
I);
1544 if (ExpandDiv64InIR) {
1546 for (BinaryOperator *Div : Div64ToExpand) {
1547 expandDivRem64(*Div);
1556bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &
I) {
1562 canWidenScalarExtLoad(
I)) {
1573 if (
auto *
Range =
I.getMetadata(LLVMContext::MD_range)) {
1576 if (!
Lower->isNullValue()) {
1583 WidenLoad->setMetadata(LLVMContext::MD_range,
1588 int TySize =
DL.getTypeSizeInBits(
I.getType());
1593 DeadVals.push_back(&
I);
1600bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &
I) {
1606 Value *Fract =
nullptr;
1615 Value *FractSrc = matchFractPatImpl(*
X, *
C);
1620 Fract = applyFractPat(Builder, FractSrc);
1630 CmpPredicate IsNanPred;
1639 if (IsNanPred == FCmpInst::FCMP_UNO && TrueVal == CmpVal &&
1640 CmpVal == matchFractPatNanAvoidant(*FalseVal)) {
1642 Fract = applyFractPat(Builder, CmpVal);
1643 }
else if (IsNanPred == FCmpInst::FCMP_ORD && FalseVal == CmpVal) {
1644 if (CmpVal == matchFractPatNanAvoidant(*TrueVal)) {
1646 Fract = applyFractPat(Builder, CmpVal);
1650 CmpPredicate PredInf;
1656 PredInf != FCmpInst::FCMP_UNE ||
1657 CmpVal != matchFractPatNanAvoidant(*IfNotInf))
1667 Value *NewFract = applyFractPat(Builder, CmpVal);
1671 DeadVals.push_back(ClampInfSelect->
getOperand(1));
1675 Fract = ClampInfSelect;
1682 I.replaceAllUsesWith(Fract);
1683 DeadVals.push_back(&
I);
1690 return IA && IB && IA->getParent() == IB->getParent();
1700 const Value *CurVal = V;
1703 BitVector EltsCovered(FVT->getNumElements());
1710 if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
1713 const auto *VecSrc = IE->getOperand(0);
1722 EltsCovered.
set(Idx->getZExtValue());
1725 if (EltsCovered.
all())
1752 const auto [It, Inserted] = SeenPHIs.
insert(&
I);
1756 for (
const Value *Inc :
I.incoming_values()) {
1761 for (
const User *U :
I.users()) {
1767bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(
const PHINode &
I) {
1769 if (
const auto It = BreakPhiNodesCache.find(&
I);
1770 It != BreakPhiNodesCache.end())
1779 SmallPtrSet<const PHINode *, 8> WorkList;
1785 for (
const PHINode *WLP : WorkList) {
1786 assert(BreakPhiNodesCache.count(WLP) == 0);
1801 const auto Threshold = (
alignTo(WorkList.size() * 2, 3) / 3);
1802 unsigned NumBreakablePHIs = 0;
1803 bool CanBreak =
false;
1804 for (
const PHINode *Cur : WorkList) {
1812 if (++NumBreakablePHIs >= Threshold) {
1819 for (
const PHINode *Cur : WorkList)
1820 BreakPhiNodesCache[Cur] = CanBreak;
1869 Value *&Res = SlicedVals[{BB, Inc}];
1875 B.SetCurrentDebugLocation(IncInst->getDebugLoc());
1881 Res =
B.CreateShuffleVector(Inc, Mask, NewValName);
1883 Res =
B.CreateExtractElement(Inc,
Idx, NewValName);
1892bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &
I) {
1904 cl::boolOrDefault::BOU_TRUE)
1909 DL.getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)
1912 if (!ForceBreakLargePHIs && !canBreakPHINode(
I))
1915 std::vector<VectorSlice> Slices;
1922 const unsigned EltSize =
DL.getTypeSizeInBits(EltTy);
1924 if (EltSize == 8 || EltSize == 16) {
1925 const unsigned SubVecSize = (32 / EltSize);
1927 for (
unsigned End =
alignDown(NumElts, SubVecSize); Idx < End;
1929 Slices.emplace_back(SubVecTy, Idx, SubVecSize);
1933 for (; Idx < NumElts; ++Idx)
1934 Slices.emplace_back(EltTy, Idx, 1);
1937 assert(Slices.size() > 1);
1943 B.SetCurrentDebugLocation(
I.getDebugLoc());
1945 unsigned IncNameSuffix = 0;
1946 for (VectorSlice &S : Slices) {
1949 B.SetInsertPoint(
I.getParent()->getFirstNonPHIIt());
1950 S.NewPHI =
B.CreatePHI(S.Ty,
I.getNumIncomingValues());
1952 for (
const auto &[Idx, BB] :
enumerate(
I.blocks())) {
1953 S.NewPHI->addIncoming(S.getSlicedVal(BB,
I.getIncomingValue(Idx),
1954 "largephi.extractslice" +
1955 std::to_string(IncNameSuffix++)),
1962 unsigned NameSuffix = 0;
1963 for (VectorSlice &S : Slices) {
1964 const auto ValName =
"largephi.insertslice" + std::to_string(NameSuffix++);
1966 Vec =
B.CreateInsertVector(FVT, Vec, S.NewPHI, S.Idx, ValName);
1968 Vec =
B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
1971 I.replaceAllUsesWith(Vec);
1972 DeadVals.push_back(&
I);
1995 Load && Load->hasMetadata(LLVMContext::MD_nonnull))
2014 assert(SrcPtrKB.getBitWidth() ==
DL.getPointerSizeInBits(AS));
2015 assert((NullVal == 0 || NullVal == -1) &&
2016 "don't know how to check for this null value!");
2017 return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
2020bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &
I) {
2024 if (
I.getType()->isVectorTy())
2029 const unsigned SrcAS =
I.getSrcAddressSpace();
2030 const unsigned DstAS =
I.getDestAddressSpace();
2032 bool CanLower =
false;
2050 auto *Intrin =
B.CreateIntrinsic(
2051 I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
2052 I.replaceAllUsesWith(Intrin);
2053 DeadVals.push_back(&
I);
2057bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &
I) {
2060 case Intrinsic::minnum:
2061 case Intrinsic::minimumnum:
2062 case Intrinsic::minimum:
2063 return visitFMinLike(
I);
2064 case Intrinsic::sqrt:
2065 return visitSqrt(
I);
2066 case Intrinsic::log:
2067 case Intrinsic::log10:
2069 case Intrinsic::log2:
2072 case Intrinsic::amdgcn_mbcnt_lo:
2073 return visitMbcntLo(
I);
2074 case Intrinsic::amdgcn_mbcnt_hi:
2075 return visitMbcntHi(
I);
2076 case Intrinsic::vector_reduce_add:
2077 return visitVectorReduceAdd(
I);
2078 case Intrinsic::uadd_sat:
2079 case Intrinsic::sadd_sat:
2080 return visitSaturatingAdd(
I);
2088Value *AMDGPUCodeGenPrepareImpl::matchFractPatImpl(
Value &FractSrc,
2089 const APFloat &
C)
const {
2098 OneNextDown.
next(
true);
2101 if (OneNextDown !=
C)
2121Value *AMDGPUCodeGenPrepareImpl::matchFractPatNanAvoidant(
Value &V) {
2133 return matchFractPatImpl(*Arg0, *
C);
2138 SmallVector<Value *, 4> FractVals;
2141 SmallVector<Value *, 4> ResultVals(FractVals.
size());
2144 for (
unsigned I = 0,
E = FractVals.
size();
I !=
E; ++
I) {
2152bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &
I) {
2160 FractArg = matchFractPatImpl(*
X, *
C);
2165 FractArg = matchFractPatNanAvoidant(
I);
2177 FastMathFlags FMF =
I.getFastMathFlags();
2181 Value *Fract = applyFractPat(Builder, FractArg);
2183 I.replaceAllUsesWith(Fract);
2184 DeadVals.push_back(&
I);
2189bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
2205 if (ReqdAccuracy < 1.0f)
2209 bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);
2213 if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
2217 SmallVector<Value *, 4> SrcVals;
2220 SmallVector<Value *, 4> ResultVals(SrcVals.
size());
2221 for (
int I = 0,
E = SrcVals.
size();
I !=
E; ++
I) {
2223 ResultVals[
I] = Builder.
CreateCall(getSqrtF32(), SrcVals[
I]);
2225 ResultVals[
I] = emitSqrtIEEE2ULP(Builder, SrcVals[
I], SqrtFMF);
2231 DeadVals.push_back(&Sqrt);
2236bool AMDGPUCodeGenPrepareImpl::visitLog(FPMathOperator &Log,
2242 FastMathFlags FMF =
Log.getFastMathFlags();
2249 if (
Log.getFPAccuracy() < 1.80f)
2260 double Log2BaseInverted =
2267 Log.replaceAllUsesWith(
Mul);
2268 DeadVals.push_back(&Log);
2272bool AMDGPUCodeGenPrepare::runOnFunction(Function &
F) {
2273 if (skipFunction(
F))
2276 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
2280 const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
2281 const TargetLibraryInfo *TLI =
2282 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
F);
2283 AssumptionCache *AC =
2284 &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
F);
2285 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
2286 const DominatorTree *DT = DTWP ? &DTWP->getDomTree() :
nullptr;
2288 getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
2289 return AMDGPUCodeGenPrepareImpl(
F, TM, TLI, AC, DT, UA).run();
2299 AMDGPUCodeGenPrepareImpl Impl(
F, ATM, TLI, AC, DT, UA);
2303 if (!Impl.FlowChanged)
2309 "AMDGPU IR optimizations",
false,
false)
2319 B.CreateIntrinsicWithoutFolding(Intrinsic::amdgcn_workitem_id_x, {});
2320 ST.makeLIDRangeMetadata(Tid);
2325void AMDGPUCodeGenPrepareImpl::replaceWithWorkitemIdX(Instruction &
I)
const {
2327 CallInst *Tid = createWorkitemIdX(
B);
2333void AMDGPUCodeGenPrepareImpl::replaceWithMaskedWorkitemIdX(
2334 Instruction &
I,
unsigned WaveSize)
const {
2336 CallInst *Tid = createWorkitemIdX(
B);
2338 Value *AndInst =
B.CreateAnd(Tid, Mask);
2346bool AMDGPUCodeGenPrepareImpl::tryReplaceWithWorkitemId(Instruction &
I,
2347 unsigned Wave)
const {
2354 if (*MaybeX == Wave) {
2355 replaceWithWorkitemIdX(
I);
2362 replaceWithMaskedWorkitemIdX(
I, Wave);
2370bool AMDGPUCodeGenPrepareImpl::visitMbcntLo(IntrinsicInst &
I)
const {
2386bool AMDGPUCodeGenPrepareImpl::visitMbcntHi(IntrinsicInst &
I)
const {
2399 if (*MaybeX == Wave) {
2410 using namespace PatternMatch;
2418 return tryReplaceWithWorkitemId(
I, Wave);
2444 Value *ExtSrc0, *ExtSrc1;
2464bool AMDGPUCodeGenPrepareImpl::visitVectorReduceAdd(IntrinsicInst &
I) {
2466 if (!ST.hasDot7Insts() || (!ST.hasDot1Insts() && !ST.hasDot8Insts()))
2469 Value *
A =
nullptr, *
B =
nullptr;
2472 bool IsSigned =
false;
2479 LLVMContext &Ctx =
I.getContext();
2480 Type *I32Ty = Type::getInt32Ty(Ctx);
2488 Value *Acc = ConstantInt::get(I32Ty, 0);
2492 IsSigned ? Intrinsic::amdgcn_sdot4 : Intrinsic::amdgcn_udot4;
2497 I.replaceAllUsesWith(Dot);
2498 DeadVals.push_back(&
I);
2506bool AMDGPUCodeGenPrepareImpl::visitSaturatingAdd(IntrinsicInst &
I) {
2508 if (!ST.hasDot7Insts() || (!ST.hasDot1Insts() && !ST.hasDot8Insts()))
2512 bool IsSigned = (IID == Intrinsic::sadd_sat);
2515 Value *Op0 =
I.getArgOperand(0);
2516 Value *Op1 =
I.getArgOperand(1);
2517 Value *MulOp =
nullptr;
2518 Value *Accum =
nullptr;
2519 IntrinsicInst *ReduceInst =
nullptr;
2524 }
else if (
match(Op1,
2532 Value *
A =
nullptr, *
B =
nullptr;
2537 LLVMContext &Ctx =
I.getContext();
2538 Type *I32Ty = Type::getInt32Ty(Ctx);
2549 IsSigned ? Intrinsic::amdgcn_sdot4 : Intrinsic::amdgcn_udot4;
2554 I.replaceAllUsesWith(Dot);
2555 DeadVals.push_back(&
I);
2558 DeadVals.push_back(ReduceInst);
2563char AMDGPUCodeGenPrepare::ID = 0;
2566 return new AMDGPUCodeGenPrepare();
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static Value * insertValues(IRBuilder<> &Builder, Type *Ty, SmallVectorImpl< Value * > &Values)
static void extractValues(IRBuilder<> &Builder, SmallVectorImpl< Value * > &Values, Value *V)
static Value * getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS)
static bool isInterestingPHIIncomingValue(const Value *V)
static SelectInst * findSelectThroughCast(Value *V, CastInst *&Cast)
static bool matchDot4Pattern(Value *MulOp, Value *&A, Value *&B, bool IsSigned)
Helper to match the dot4 pattern: mul(zext/sext <4 x i8>, zext/sext <4 x i8>) Returns true if pattern...
static bool isV4I8(Type *Ty)
Check if type is <4 x i8>.
static std::pair< Value *, Value * > getMul64(IRBuilder<> &Builder, Value *LHS, Value *RHS)
static Value * emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src, bool IsNegative)
Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
static Value * getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL)
static void collectPHINodes(const PHINode &I, SmallPtrSet< const PHINode *, 8 > &SeenPHIs)
static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL, const AMDGPUTargetMachine &TM, unsigned AS)
static bool areInSameBB(const Value *A, const Value *B)
static cl::opt< bool > WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads", cl::desc("Widen sub-dword constant address space loads in " "AMDGPULateCodeGenPrepare"), cl::ReallyHidden, cl::init(true))
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static bool runOnFunction(Function &F, bool PostInlining)
static Value * getOpcode(Value &V, Type &Ty, InstrumentationConfig &IConf, InstrumentorIRBuilderTy &IIRB)
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
const SmallVectorImpl< MachineOperand > & Cond
static void visit(BasicBlock &Start, std::function< bool(BasicBlock *)> op)
This file implements a set that has insertion order iteration characteristics.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static cl::opt< cl::boolOrDefault > EnableGlobalISelOption("global-isel", cl::Hidden, cl::desc("Enable the \"global\" instruction selector"))
Target-Independent Code Generator Pass Configuration Options pass.
VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
Value * getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName)
Slice Inc according to the information contained within this slice.
PreservedAnalyses run(Function &, FunctionAnalysisManager &)
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
unsigned getWavefrontSize() const
static APFloat getOne(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative One.
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
opStatus next(bool nextDown)
This class represents a conversion between pointers from one address space to another.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesAll()
Set by analyses that do not transform their input at all.
A function analysis which provides an AssumptionCache.
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
InstListType::iterator iterator
Instruction iterators...
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
BinaryOps getOpcode() const
BitVector & set()
Set all bits in the bitvector.
bool all() const
Returns true if all bits are set.
Represents analyses that only rely on functions' control flow.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Instruction::CastOps getOpcode() const
Return the opcode of this CastInst.
TargetTransformInfo getTargetTransformInfo(const Function &F) const override
Get a TargetTransformInfo implementation for the target.
static LLVM_ABI ConstantFP * getZero(Type *Ty, bool Negative=false)
LLVM_ABI bool isExactlyValue(const APFloat &V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
static LLVM_ABI ConstantFP * getInfinity(Type *Ty, bool Negative=false)
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
This is an important base class in LLVM.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Analysis pass which computes a DominatorTree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Utility class for floating point operations which can have information about relaxed accuracy require...
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
LLVM_ABI float getFPAccuracy() const
Get the maximum error permitted by this operation in ULPs.
Convenience struct for specifying and reasoning about fast-math flags.
void setFast(bool B=true)
bool allowReciprocal() const
void setNoNaNs(bool B=true)
bool allowContract() const
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
FunctionPass class - This class is used to implement most global optimizations.
bool isWaveSizeKnown() const
Returns if the wavesize of this subtarget is known reliable.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Value * CreateFDiv(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFPToUI(Value *V, Type *DestTy, const Twine &Name="")
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
void SetCurrentDebugLocation(const DebugLoc &L)
Set location information used by debugging information.
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Value * CreateUIToFP(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false, MDNode *FPMathTag=nullptr)
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Value * CreateFCmpOLT(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateFAbs(Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create call to the fabs intrinsic.
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
LLVM_ABI Value * createIsFPClass(Value *FPNum, unsigned Test)
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Value * CreateFMA(Value *Factor1, Value *Factor2, Value *Summand, FMFSource FMFSource={}, const Twine &Name="")
Create call to the fma intrinsic.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool' for the isVolatile parameter.
FastMathFlags getFastMathFlags() const
Get the flags to be applied to created floating point ops.
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Value * CreateFCmpOEQ(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
LLVM_ABI Value * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={}, function_ref< void(CallInst *)> SetFn=[](CallInst *) {})
Variant to create a possibly constant-folded intrinsic.
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateSIToFP(Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateFNeg(Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Value * CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a SExt or Trunc from the integer value V to DestTy.
Value * CreateFMulFMF(Value *L, Value *R, FMFSource FMFSource, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
LLVM_ABI Value * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *Op, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateFCmpOGE(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateFPToSI(Value *V, Type *DestTy, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
Base class for instruction visitors.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
An instruction for reading from memory.
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
This class represents the LLVM 'select' instruction.
const Value * getFalseValue() const
const Value * getCondition() const
const Value * getTrueValue() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
LLVM_ABI unsigned getIntegerBitWidth() const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old number of lanes.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
bool isIntegerTy() const
True if this is an instance of IntegerType.
LLVM_ABI const fltSemantics & getFltSemantics() const
void setOperand(unsigned i, Value *Val)
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
const ParentTy * getParent() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ PRIVATE_ADDRESS
Address space for private memory.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
void copyMetadataForWidenedLoad(LoadInst &Dest, const LoadInst &Source)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
MaxMin_match< FCmpInst, LHS, RHS, ufmin_pred_ty > m_UnordFMin(const LHS &L, const RHS &R)
Match an 'unordered' floating point minimum function.
CmpClass_match< LHS, RHS, FCmpInst > m_FCmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< typename m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty, typename m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty > m_FMinNum_or_FMinimumNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FSub > m_FSub(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
match_deferred< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match() expression.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
ap_match< APFloat > m_APFloatAllowPoison(const APFloat *&Res)
Match APFloat while allowing poison in splat vector constants.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
auto m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
cstfp_pred_ty< is_nonnan > m_NonNaN()
Match a non-NaN FP constant.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
cstfp_pred_ty< is_signed_inf< false > > m_PosInf()
Match a positive infinity FP constant.
cstfp_pred_ty< is_pos_zero_fp > m_PosZeroFP()
Match a floating-point positive zero.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract(Y &&MD)
Extract a Value from Metadata.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
FunctionAddr VTableAddr Value
LLVM_ABI KnownFPClass computeKnownFPClass(const Value *V, const APInt &DemandedElts, FPClassTest InterestedClasses, const SimplifyQuery &SQ, unsigned Depth=0)
Determine which floating-point classes are valid for V, and return them in KnownFPClass bit sets.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI bool expandRemainderUpTo64Bits(BinaryOperator *Rem)
Generate code to calculate the remainder of two integers, replacing Rem with the generated code.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is congruent to Skew modulo Align.
LLVM_ABI void ReplaceInstWithValue(BasicBlock::iterator &BI, Value *V)
Replace all uses of an instruction (specified by BI) with a value, then remove and delete the original instruction.
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
auto dyn_cast_or_null(const Y &Val)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
auto reverse(ContainerTy &&C)
LLVM_ABI bool expandDivisionUpTo64Bits(BinaryOperator *Div)
Generate code to divide two integers, replacing Div with the generated code.
FPClassTest
Floating-point class tests, supported by 'is_fpclass' intrinsic.
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne bit sets.
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
LLVM_ABI Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
LLVM_ABI Constant * ConstantFoldBinaryOpOperands(unsigned Opcode, Constant *LHS, Constant *RHS, const DataLayout &DL)
Attempt to constant fold a binary operation with the specified operands.
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
FunctionPass * createAMDGPUCodeGenPreparePass()
To bit_cast(const From &from) noexcept
DWARFExpression::Operation Op
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI bool isKnownNeverNaN(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if the floating-point scalar value is not a NaN or if the floating-point vector value has no NaN elements.
LLVM_ABI unsigned ComputeMaxSignificantBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Get the upper bound on bit size for this Value Op as a signed integer.
unsigned Log2(Align A)
Returns the log2 of the alignment.
LLVM_ABI bool isKnownToBeAPowerOfTwo(const Value *V, const DataLayout &DL, bool OrZero=false, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return true if the given value is known to have exactly one bit set when defined.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, const LoopInfo *LI=nullptr, unsigned MaxLookup=MaxLookupSearchDepth)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
LLVM_ABI CGPassBuilderOption getCGPassBuilderOption()
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environment.
constexpr bool inputsAreZero() const
Return true if input denormals must be implicitly treated as 0.
static constexpr DenormalMode getPreserveSign()
bool isNonNegative() const
Returns true if this value is known to be non-negative.
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known bits.
bool isNegative() const
Returns true if this value is known to be negative.
bool isKnownNeverSubnormal() const
Return true if it's known this can never be a subnormal.
LLVM_ABI bool isKnownNeverLogicalZero(DenormalMode Mode) const
Return true if it's known this can never be interpreted as a zero.
bool isKnownNeverPosInfinity() const
Return true if it's known this can never be +infinity.
SimplifyQuery getWithInstruction(const Instruction *I) const