#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-codegenprepare"
43 "amdgpu-codegenprepare-widen-constant-loads",
44 cl::desc(
"Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
49 "amdgpu-codegenprepare-widen-16-bit-ops",
50 cl::desc(
"Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
55 BreakLargePHIs(
"amdgpu-codegenprepare-break-large-phis",
56 cl::desc(
"Break large PHI nodes for DAGISel"),
60 ForceBreakLargePHIs(
"amdgpu-codegenprepare-force-break-large-phis",
61 cl::desc(
"For testing purposes, always break large "
62 "PHIs even if it isn't profitable."),
66 "amdgpu-codegenprepare-break-large-phis-threshold",
67 cl::desc(
"Minimum type size in bits for breaking large PHI nodes"),
71 "amdgpu-codegenprepare-mul24",
72 cl::desc(
"Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
78 "amdgpu-codegenprepare-expand-div64",
79 cl::desc(
"Expand 64-bit division in AMDGPUCodeGenPrepare"),
86 "amdgpu-codegenprepare-disable-idiv-expansion",
87 cl::desc(
"Prevent expanding integer division in AMDGPUCodeGenPrepare"),
93 "amdgpu-codegenprepare-disable-fdiv-expansion",
94 cl::desc(
"Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
class AMDGPUCodeGenPrepareImpl
    : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
  bool HasUnsafeFPMath = false;
  bool HasFP32DenormalFlush = false;
  bool FlowChanged = false;
  mutable Function *SqrtF32 = nullptr;
  mutable Function *LdexpF32 = nullptr;
  bool canBreakPHINode(const PHINode &I);

  unsigned getBaseElementBitWidth(const Type *T) const;

  bool needsPromotionToI32(const Type *T) const;

  bool isLegalFloatingTy(const Type *T) const;

  bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
    return HasFP32DenormalFlush ||

  bool promoteUniformOpToI32(ICmpInst &I) const;

  unsigned numBitsUnsigned(Value *Op) const;

  unsigned numBitsSigned(Value *Op) const;

  int getDivNumBits(BinaryOperator &I, Value *Num, Value *Den,
                    unsigned AtLeast, bool Signed) const;

  Value *expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I, Value *Num,
                        Value *Den, bool IsDiv, bool IsSigned) const;

  Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
                            Value *Num, Value *Den, unsigned DivBits,
                            bool IsDiv, bool IsSigned) const;

  bool canWidenScalarExtLoad(LoadInst &I) const;

  Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den,
                              float ReqdAccuracy) const;

  Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den,
                          FastMathFlags DivFMF, FastMathFlags SqrtFMF,
                          Value *RsqOp, const Instruction *FDivInst,
                          float ReqdAccuracy) const;

  std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
                                              Value *Src) const;

  Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src,
                         bool IsNegative) const;
  AMDGPUCodeGenPrepareImpl Impl;

    if (!ExpandDiv64InIR)
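// Walk every block and instruction, dispatching to the visit* overrides.
// Visitors may erase or split blocks, so the loop tracks the next block and
// checks whether the current instruction's parent changed.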
bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
  BreakPhiNodesCache.clear();
  bool MadeChange = false;

    NextBB = std::next(FI);

      MadeChange |= visit(*I);

      if (NextInstBB != BB) {
unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepareImpl::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
         I.getOpcode() == Instruction::SDiv ||
         I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
         cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}
bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const {

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {

    if (ST->hasVOP3PInsts())

    return needsPromotionToI32(VT->getElementType());

bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:

  case Instruction::Mul:
    return I.hasNoUnsignedWrap();

static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:

  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();

  int TySize = DL.getTypeSizeInBits(Ty);
  Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);

  return I.isSimple() && TySize < 32 && Alignment >= 4 && UA->isUniform(&I);
}
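// Promote a uniform sub-32-bit binary operation to i32: sign- or zero-extend
// the operands, redo the operation in 32 bits (carrying over nsw/nuw/exact
// where legal), then truncate back to the original type. Division and
// remainder opcodes are skipped here.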
bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv ||
      I.getOpcode() == Instruction::SRem ||
      I.getOpcode() == Instruction::URem)

  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);

    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
      Inst->setHasNoSignedWrap();

      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);

    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);

  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);

    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);

  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());

  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {

unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const {
  auto *VT = dyn_cast<FixedVectorType>(V->getType());

  for (int I = 0, E = VT->getNumElements(); I != E; ++I)

  for (int I = 0, E = Values.size(); I != E; ++I)
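// If both operands of a non-uniform 32-bit multiply are known to fit in
// 24 bits, rewrite it (element by element for vectors) using the
// amdgcn.mul.i24 / amdgcn.mul.u24 intrinsics.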
bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
  if (I.getOpcode() != Instruction::Mul)

  Type *Ty = I.getType();

  if (Size <= 16 && ST->has16BitInsts())

  if (UA->isUniform(&I))

  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  unsigned LHSBits = 0, RHSBits = 0;
  bool IsSigned = false;

  if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&
      (RHSBits = numBitsUnsigned(RHS)) <= 24) {

  } else if (ST->hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&
             (RHSBits = numBitsSigned(RHS)) <= 24) {

  Type *DstTy = LHSVals[0]->getType();

  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
    Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty)
                          : Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
    Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty)
                          : Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);

        IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;

    Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
                      : Builder.CreateZExtOrTrunc(Result, DstTy);

  I.replaceAllUsesWith(NewVal);
  if (SelectInst *Sel = dyn_cast<SelectInst>(V))

  if ((Cast = dyn_cast<CastInst>(V))) {

bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {

  if (!CBO || !CT || !CF)

  if (!FoldedT || isa<ConstantExpr>(FoldedT))

  if (!FoldedF || isa<ConstantExpr>(FoldedF))

  if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
    Builder.setFastMathFlags(FPOp->getFastMathFlags());
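// frexp-based helpers for the division/rsq expansions below: splitting the
// input into mantissa and exponent and rescaling with ldexp afterwards is
// what keeps the 1-ulp rcp expansion usable when FP32 denormals are flushed.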
std::pair<Value *, Value *>
AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
                                          Value *Src) const {
  Type *Ty = Src->getType();

  return {FrexpMant, FrexpExp};
}

Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
                                                 Value *Src,
                                                 bool IsNegative) const {

  auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);

  return Builder.CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
}
  if (HasFP32DenormalFlush && ST->hasFractBug() && !ST->hasFastFMAF32() &&

  auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS);

  auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS);

  Type *Ty = Src->getType();

  Value *InputScaleFactor =

  Value *OutputScaleFactor =

  return Builder.CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});

  Type *Ty = Src->getType();

  return Builder.CreateFMul(Rsq, OutputScaleFactor);
bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,

  return SqrtFMF.approxFunc() || HasUnsafeFPMath ||

Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(

  const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num);

  bool IsNegative = false;

      canIgnoreDenormalInput(Den, CtxI)) {

  if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
    bool IsNegative = false;

    if (HasFP32DenormalFlush || FMF.approxFunc()) {

    return emitRcpIEEE1ULP(Builder, Src, IsNegative);

  if (HasFP32DenormalFlush || FMF.approxFunc()) {

  Value *Recip = emitRcpIEEE1ULP(Builder, Den, false);
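// amdgcn.fdiv.fast is only emitted when roughly 2.5 ulp of error is
// acceptable and, unless the numerator is exactly +/-1.0, when FP32
// denormals are flushed.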
Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(

  if (ReqdAccuracy < 2.5f)

  bool NumIsOne = false;
  if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
    if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))

  if (!HasFP32DenormalFlush && !NumIsOne)

  return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {}, {Num, Den});
}

Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
    float ReqdDivAccuracy) const {

      optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);

  Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);

  Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);

  return emitFrexpDiv(Builder, Num, Den, DivFMF);
  if (DisableFDivExpand)

  Value *RsqOp = nullptr;
  auto *DenII = dyn_cast<IntrinsicInst>(Den);
  if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
      DenII->hasOneUse()) {
    const auto *SqrtOp = cast<FPMathOperator>(DenII);

    if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF))

  const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc();
  if (!RsqOp && AllowInaccurateRcp)

  if (ReqdAccuracy < 1.0f)

  for (int I = 0, E = NumVals.size(); I != E; ++I) {
    Value *NumElt = NumVals[I];
    Value *DenElt = DenVals[I];
    Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;

        visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
                         cast<Instruction>(FPOp), ReqdAccuracy);

    if (auto *NewEltInst = dyn_cast<Instruction>(NewElt))
      NewEltInst->copyMetadata(FDiv);

    ResultVals[I] = NewElt;
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");

  return std::pair(Lo, Hi);
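// Bound the number of bits a division really needs from the operands' known
// sign/leading bits; callers use the result to pick the 24-bit or 32-bit
// expansion and bail out when no narrowing is possible.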
int AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
                                            Value *Den, unsigned AtLeast,
                                            bool IsSigned) const {

  if (LHSSignBits < AtLeast)

  if (RHSSignBits < AtLeast)

  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);

Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
                                                BinaryOperator &I, Value *Num,
                                                Value *Den, bool IsDiv,
                                                bool IsSigned) const {
  int DivBits = getDivNumBits(I, Num, Den, 9, IsSigned);

  return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
}
Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
    unsigned DivBits, bool IsDiv, bool IsSigned) const {

  auto FMAD = !ST->hasMadMacF32Insts()

                                  {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);

  if (DivBits != 0 && DivBits < 32) {

      int InRegBits = 32 - DivBits;

      Res = Builder.CreateShl(Res, InRegBits);

          = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
      Res = Builder.CreateAnd(Res, TruncMask);
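// Divisions the selector already handles well (e.g. constant divisors of at
// most 32 bits, or shift-of-constant / power-of-two style denominators) are
// left alone rather than expanded in IR.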
bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,

  if (Constant *C = dyn_cast<Constant>(Den)) {

    if (C->getType()->getScalarSizeInBits() <= 32)

    if (BinOpDen->getOpcode() == Instruction::Shl &&
        isa<Constant>(BinOpDen->getOperand(0)) &&

  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
         Opc == Instruction::SRem || Opc == Instruction::SDiv);

  if (divHasSpecialOptimization(I, X, Y))

  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;

  Type *Ty = X->getType();

  if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {

  Value *Sign = nullptr;

  Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;
  if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))

  bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
  bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;

  int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
  if (NumDivBits == -1)

  Value *Narrowed = nullptr;
  if (NumDivBits <= 24) {
    Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,

  } else if (NumDivBits <= 32) {
    Narrowed = expandDivRem32(Builder, I, Num, Den);
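// 64-bit divisions that could not be narrowed are expanded late, presumably
// through the generic expandDivisionUpTo64Bits / expandRemainderUpTo64Bits
// helpers.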
void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {

  if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {

  if (Opc == Instruction::URem || Opc == Instruction::SRem) {
bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
  if (foldBinOpIntoSelect(I))

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      UA->isUniform(&I) && promoteUniformOpToI32(I))

  if (UseMul24Intrin && replaceMulWithMul24(I))

  bool Changed = false;

  Type *Ty = I.getType();
  Value *NewDiv = nullptr;

  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
       Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
      !DisableIDivExpand) {
    Value *Num = I.getOperand(0);
    Value *Den = I.getOperand(1);

    if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {

      for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {

        if (ScalarSize <= 32) {
          NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);

            NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);

          NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);

            NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
            Div64ToExpand.push_back(cast<BinaryOperator>(NewElt));

      if (ScalarSize <= 32)
        NewDiv = expandDivRem32(Builder, I, Num, Den);

        NewDiv = shrinkDivRem64(Builder, I, Num, Den);

      I.replaceAllUsesWith(NewDiv);
      I.eraseFromParent();

  if (ExpandDiv64InIR) {

      expandDivRem64(*Div);
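// Widen uniform sub-dword constant-address-space loads to 32 bits; any
// !range metadata on the original load is dropped when its lower bound is
// zero.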
bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {

      canWidenScalarExtLoad(I)) {

    if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {

          mdconst::extract<ConstantInt>(Range->getOperand(0));

      if (Lower->isNullValue()) {
        WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);

    I.replaceAllUsesWith(ValOrig);
    I.eraseFromParent();
bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&

    Changed |= promoteUniformOpToI32(I);
bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType())) {
    if (UA->isUniform(&I))
      return promoteUniformOpToI32(I);

  auto *IITrue = dyn_cast<IntrinsicInst>(TrueVal);
  auto *IIFalse = dyn_cast<IntrinsicInst>(FalseVal);

  Value *Fract = nullptr;
  if (Pred == FCmpInst::FCMP_UNO && TrueVal == CmpVal && IIFalse &&
      CmpVal == matchFractPat(*IIFalse)) {

    Fract = applyFractPat(Builder, CmpVal);
  } else if (Pred == FCmpInst::FCMP_ORD && FalseVal == CmpVal && IITrue &&
             CmpVal == matchFractPat(*IITrue)) {

    Fract = applyFractPat(Builder, CmpVal);

  I.replaceAllUsesWith(Fract);
static bool areInSameBB(const Value *A, const Value *B) {
  const auto *IA = dyn_cast<Instruction>(A);
  const auto *IB = dyn_cast<Instruction>(B);
  return IA && IB && IA->getParent() == IB->getParent();
}

static bool isInterestingPHIIncomingValue(const Value *V) {
  const auto *FVT = dyn_cast<FixedVectorType>(V->getType());

  const Value *CurVal = V;

  BitVector EltsCovered(FVT->getNumElements());
  while (const auto *IE = dyn_cast<InsertElementInst>(CurVal)) {
    const auto *Idx = dyn_cast<ConstantInt>(IE->getOperand(2));

    if (!Idx || Idx->getSExtValue() >= FVT->getNumElements())

    const auto *VecSrc = IE->getOperand(0);

    if (isa<Instruction>(VecSrc) && !areInSameBB(VecSrc, IE))

    EltsCovered.set(Idx->getSExtValue());

    if (EltsCovered.all())

  if (isa<Constant>(CurVal))

  if (const auto *SV = dyn_cast<ShuffleVectorInst>(CurVal)) {
    return isa<Constant>(SV->getOperand(1)) ||
static void collectPHINodes(const PHINode &I,
                            SmallPtrSet<const PHINode *, 8> &SeenPHIs) {
  const auto [It, Inserted] = SeenPHIs.insert(&I);

  for (const Value *Inc : I.incoming_values()) {
    if (const auto *PhiInc = dyn_cast<PHINode>(Inc))

  for (const User *U : I.users()) {
    if (const auto *PhiU = dyn_cast<PHINode>(U))
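// Decide (and cache) whether breaking this PHI, together with the PHIs it is
// connected to, looks profitable; roughly two thirds of the connected PHIs
// must have interesting incoming values for the whole set to be broken.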
bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {

  if (const auto It = BreakPhiNodesCache.find(&I);
      It != BreakPhiNodesCache.end())

  for (const PHINode *WLP : WorkList) {
    assert(BreakPhiNodesCache.count(WLP) == 0);

  const auto Threshold = (alignTo(WorkList.size() * 2, 3) / 3);
  unsigned NumBreakablePHIs = 0;
  bool CanBreak = false;
  for (const PHINode *Cur : WorkList) {

    if (++NumBreakablePHIs >= Threshold) {

  for (const PHINode *Cur : WorkList)
    BreakPhiNodesCache[Cur] = CanBreak;
    Value *&Res = SlicedVals[{BB, Inc}];

    if (Instruction *IncInst = dyn_cast<Instruction>(Inc))
      B.SetCurrentDebugLocation(IncInst->getDebugLoc());

      Res = B.CreateShuffleVector(Inc, Mask, NewValName);

      Res = B.CreateExtractElement(Inc, Idx, NewValName);
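// Break a large vector PHI into 32-bit-sized slices: build one new PHI per
// slice, then reassemble the original vector with insertelement/insertvector
// right after the PHIs and replace all uses of the original node.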
bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {

      DL->getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)

  if (!ForceBreakLargePHIs && !canBreakPHINode(I))

  std::vector<VectorSlice> Slices;

  const unsigned EltSize = DL->getTypeSizeInBits(EltTy);

  if (EltSize == 8 || EltSize == 16) {
    const unsigned SubVecSize = (32 / EltSize);

      Slices.emplace_back(SubVecTy, Idx, SubVecSize);

  for (; Idx < NumElts; ++Idx)
    Slices.emplace_back(EltTy, Idx, 1);

  assert(Slices.size() > 1);

  B.SetCurrentDebugLocation(I.getDebugLoc());

  unsigned IncNameSuffix = 0;

    B.SetInsertPoint(I.getParent()->getFirstNonPHI());
    S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());

      S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx),
                                           "largephi.extractslice" +
                                               std::to_string(IncNameSuffix++)),

  unsigned NameSuffix = 0;

    const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);

      B.CreateInsertVector(FVT, Vec, S.NewPHI, B.getInt64(S.Idx), ValName);

      Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);

  I.replaceAllUsesWith(Vec);
  I.eraseFromParent();
bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  case Intrinsic::minnum:
    return visitMinNum(I);
  case Intrinsic::sqrt:
    return visitSqrt(I);

bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&

    Changed |= promoteUniformBitreverseToI32(I);
  if (ST->hasFractBug())

  if (I.getIntrinsicID() != Intrinsic::minnum)

  Type *Ty = I.getType();

  Value *Arg0 = I.getArgOperand(0);
  Value *Arg1 = I.getArgOperand(1);

  One.convert(C->getSemantics(), APFloat::rmNearestTiesToEven, &LosesInfo);

                       m_Intrinsic<Intrinsic::floor>(m_Deferred(FloorSrc)))))

  for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {

  Value *FractArg = matchFractPat(I);

  Value *Fract = applyFractPat(Builder, FractArg);

  I.replaceAllUsesWith(Fract);
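// Lower f32 sqrt either to the target sqrt intrinsic, when the input can be
// treated as denormal-flushed (DAZ), or to a scaled expansion that should be
// good to about 2 ulp.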
bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {

  if (ReqdAccuracy < 1.0f)

  if (FDiv && FDiv->getOpcode() == Instruction::FDiv &&
      FDiv->getFPAccuracy() >= 1.0f &&

  bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);

  if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)

  for (int I = 0, E = SrcVals.size(); I != E; ++I) {

      ResultVals[I] = Builder.CreateCall(getSqrtF32(), SrcVals[I]);

      ResultVals[I] = emitSqrtIEEE2ULP(Builder, SrcVals[I], SqrtFMF);
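// Legacy pass-manager wrapper: doInitialization caches module-level state and
// runOnFunction wires up the analyses the Impl needs before running it.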
bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {

  Impl.DL = &Impl.Mod->getDataLayout();
  Impl.SqrtF32 = nullptr;
  Impl.LdexpF32 = nullptr;

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();

  Impl.TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);

  Impl.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  Impl.UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  Impl.DT = DTWP ? &DTWP->getDomTree() : nullptr;

  Impl.HasFP32DenormalFlush =

  AMDGPUCodeGenPrepareImpl Impl;
  Impl.Mod = F.getParent();
  Impl.DL = &Impl.Mod->getDataLayout();

  Impl.HasFP32DenormalFlush =

  if (!Impl.FlowChanged)

                    "AMDGPU IR optimizations", false, false)

char AMDGPUCodeGenPrepare::ID = 0;

  return new AMDGPUCodeGenPrepare();