27#include "llvm/IR/IntrinsicsAMDGPU.h"
35#define DEBUG_TYPE "amdgpu-codegenprepare"
43 "amdgpu-codegenprepare-widen-constant-loads",
44 cl::desc(
"Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
49 "amdgpu-codegenprepare-widen-16-bit-ops",
50 cl::desc(
"Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
55 BreakLargePHIs(
"amdgpu-codegenprepare-break-large-phis",
56 cl::desc(
"Break large PHI nodes for DAGISel"),
60 ForceBreakLargePHIs(
"amdgpu-codegenprepare-force-break-large-phis",
61 cl::desc(
"For testing purposes, always break large "
62 "PHIs even if it isn't profitable."),
66 "amdgpu-codegenprepare-break-large-phis-threshold",
67 cl::desc(
"Minimum type size in bits for breaking large PHI nodes"),
71 "amdgpu-codegenprepare-mul24",
72 cl::desc(
"Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
78 "amdgpu-codegenprepare-expand-div64",
79 cl::desc(
"Expand 64-bit division in AMDGPUCodeGenPrepare"),
86 "amdgpu-codegenprepare-disable-idiv-expansion",
87 cl::desc(
"Prevent expanding integer division in AMDGPUCodeGenPrepare"),
93 "amdgpu-codegenprepare-disable-fdiv-expansion",
94 cl::desc(
"Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
98class AMDGPUCodeGenPrepareImpl
99 :
public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
109 bool HasUnsafeFPMath =
false;
110 bool HasFP32DenormalFlush =
false;
111 bool FlowChanged =
false;
112 mutable Function *SqrtF32 =
nullptr;
113 mutable Function *LdexpF32 =
nullptr;
137 bool canBreakPHINode(
const PHINode &
I);
144 unsigned getBaseElementBitWidth(
const Type *
T)
const;
161 bool needsPromotionToI32(
const Type *
T)
const;
164 bool isLegalFloatingTy(
const Type *
T)
const;
173 bool canIgnoreDenormalInput(
const Value *V,
const Instruction *CtxI)
const {
174 return HasFP32DenormalFlush ||
198 bool promoteUniformOpToI32(
ICmpInst &
I)
const;
227 unsigned numBitsUnsigned(
Value *
Op)
const;
232 unsigned numBitsSigned(
Value *
Op)
const;
246 unsigned AtLeast,
bool Signed)
const;
251 bool IsDiv,
bool IsSigned)
const;
255 bool IsDiv,
bool IsSigned)
const;
273 bool canWidenScalarExtLoad(
LoadInst &
I)
const;
288 float ReqdAccuracy)
const;
293 float ReqdAccuracy)
const;
295 std::pair<Value *, Value *> getFrexpResults(
IRBuilder<> &Builder,
299 bool IsNegative)
const;
325 AMDGPUCodeGenPrepareImpl Impl;
338 if (!ExpandDiv64InIR)
348bool AMDGPUCodeGenPrepareImpl::run(
Function &
F) {
349 BreakPhiNodesCache.clear();
350 bool MadeChange =
false;
355 NextBB = std::next(FI);
362 MadeChange |= visit(*
I);
366 if (NextInstBB != BB) {
377unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(
const Type *
T)
const {
378 assert(needsPromotionToI32(
T) &&
"T does not need promotion to i32");
380 if (
T->isIntegerTy())
381 return T->getIntegerBitWidth();
382 return cast<VectorType>(
T)->getElementType()->getIntegerBitWidth();
386 assert(needsPromotionToI32(
T) &&
"T does not need promotion to i32");
388 if (
T->isIntegerTy())
389 return B.getInt32Ty();
393bool AMDGPUCodeGenPrepareImpl::isSigned(
const BinaryOperator &
I)
const {
394 return I.getOpcode() == Instruction::AShr ||
395 I.getOpcode() == Instruction::SDiv ||
I.getOpcode() == Instruction::SRem;
398bool AMDGPUCodeGenPrepareImpl::isSigned(
const SelectInst &
I)
const {
399 return isa<ICmpInst>(
I.getOperand(0)) ?
400 cast<ICmpInst>(
I.getOperand(0))->isSigned() :
false;
403bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(
const Type *
T)
const {
411 if (
const VectorType *VT = dyn_cast<VectorType>(
T)) {
414 if (
ST->hasVOP3PInsts())
417 return needsPromotionToI32(VT->getElementType());
423bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(
const Type *Ty)
const {
430 switch (
I.getOpcode()) {
431 case Instruction::Shl:
432 case Instruction::Add:
433 case Instruction::Sub:
435 case Instruction::Mul:
436 return I.hasNoUnsignedWrap();
444 switch (
I.getOpcode()) {
445 case Instruction::Shl:
446 case Instruction::Add:
447 case Instruction::Mul:
449 case Instruction::Sub:
450 return I.hasNoUnsignedWrap();
456bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(
LoadInst &
I)
const {
457 Type *Ty =
I.getType();
459 int TySize =
DL.getTypeSizeInBits(Ty);
460 Align Alignment =
DL.getValueOrABITypeAlignment(
I.getAlign(), Ty);
462 return I.isSimple() && TySize < 32 && Alignment >= 4 && UA->isUniform(&
I);
465bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(
BinaryOperator &
I)
const {
466 assert(needsPromotionToI32(
I.getType()) &&
467 "I does not need promotion to i32");
469 if (
I.getOpcode() == Instruction::SDiv ||
470 I.getOpcode() == Instruction::UDiv ||
471 I.getOpcode() == Instruction::SRem ||
472 I.getOpcode() == Instruction::URem)
476 Builder.SetCurrentDebugLocation(
I.getDebugLoc());
478 Type *I32Ty = getI32Ty(Builder,
I.getType());
479 Value *ExtOp0 =
nullptr;
480 Value *ExtOp1 =
nullptr;
481 Value *ExtRes =
nullptr;
482 Value *TruncRes =
nullptr;
485 ExtOp0 = Builder.CreateSExt(
I.getOperand(0), I32Ty);
486 ExtOp1 = Builder.CreateSExt(
I.getOperand(1), I32Ty);
488 ExtOp0 = Builder.CreateZExt(
I.getOperand(0), I32Ty);
489 ExtOp1 = Builder.CreateZExt(
I.getOperand(1), I32Ty);
492 ExtRes = Builder.CreateBinOp(
I.getOpcode(), ExtOp0, ExtOp1);
493 if (
Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
495 Inst->setHasNoSignedWrap();
498 Inst->setHasNoUnsignedWrap();
500 if (
const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&
I))
501 Inst->setIsExact(ExactOp->isExact());
504 TruncRes = Builder.CreateTrunc(ExtRes,
I.getType());
506 I.replaceAllUsesWith(TruncRes);
512bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(
ICmpInst &
I)
const {
513 assert(needsPromotionToI32(
I.getOperand(0)->getType()) &&
514 "I does not need promotion to i32");
517 Builder.SetCurrentDebugLocation(
I.getDebugLoc());
519 Type *I32Ty = getI32Ty(Builder,
I.getOperand(0)->getType());
520 Value *ExtOp0 =
nullptr;
521 Value *ExtOp1 =
nullptr;
522 Value *NewICmp =
nullptr;
525 ExtOp0 = Builder.CreateSExt(
I.getOperand(0), I32Ty);
526 ExtOp1 = Builder.CreateSExt(
I.getOperand(1), I32Ty);
528 ExtOp0 = Builder.CreateZExt(
I.getOperand(0), I32Ty);
529 ExtOp1 = Builder.CreateZExt(
I.getOperand(1), I32Ty);
531 NewICmp = Builder.CreateICmp(
I.getPredicate(), ExtOp0, ExtOp1);
533 I.replaceAllUsesWith(NewICmp);
539bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(
SelectInst &
I)
const {
540 assert(needsPromotionToI32(
I.getType()) &&
541 "I does not need promotion to i32");
544 Builder.SetCurrentDebugLocation(
I.getDebugLoc());
546 Type *I32Ty = getI32Ty(Builder,
I.getType());
547 Value *ExtOp1 =
nullptr;
548 Value *ExtOp2 =
nullptr;
549 Value *ExtRes =
nullptr;
550 Value *TruncRes =
nullptr;
553 ExtOp1 = Builder.CreateSExt(
I.getOperand(1), I32Ty);
554 ExtOp2 = Builder.CreateSExt(
I.getOperand(2), I32Ty);
556 ExtOp1 = Builder.CreateZExt(
I.getOperand(1), I32Ty);
557 ExtOp2 = Builder.CreateZExt(
I.getOperand(2), I32Ty);
559 ExtRes = Builder.CreateSelect(
I.getOperand(0), ExtOp1, ExtOp2);
560 TruncRes = Builder.CreateTrunc(ExtRes,
I.getType());
562 I.replaceAllUsesWith(TruncRes);
568bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32(
570 assert(
I.getIntrinsicID() == Intrinsic::bitreverse &&
571 "I must be bitreverse intrinsic");
572 assert(needsPromotionToI32(
I.getType()) &&
573 "I does not need promotion to i32");
576 Builder.SetCurrentDebugLocation(
I.getDebugLoc());
578 Type *I32Ty = getI32Ty(Builder,
I.getType());
581 Value *ExtOp = Builder.CreateZExt(
I.getOperand(0), I32Ty);
582 Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
584 Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(
I.getType()));
586 Builder.CreateTrunc(LShrOp,
I.getType());
588 I.replaceAllUsesWith(TruncRes);
594unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(
Value *
Op)
const {
598unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(
Value *
Op)
const {
604 auto *VT = dyn_cast<FixedVectorType>(V->getType());
610 for (
int I = 0,
E = VT->getNumElements();
I !=
E; ++
I)
623 for (
int I = 0,
E = Values.
size();
I !=
E; ++
I)
629bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(
BinaryOperator &
I)
const {
630 if (
I.getOpcode() != Instruction::Mul)
633 Type *Ty =
I.getType();
635 if (Size <= 16 && ST->has16BitInsts())
639 if (UA->isUniform(&
I))
645 Builder.SetCurrentDebugLocation(
I.getDebugLoc());
647 unsigned LHSBits = 0, RHSBits = 0;
648 bool IsSigned =
false;
650 if (
ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&
651 (RHSBits = numBitsUnsigned(RHS)) <= 24) {
654 }
else if (
ST->hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&
655 (RHSBits = numBitsSigned(RHS)) <= 24) {
669 Type *DstTy = LHSVals[0]->getType();
671 for (
int I = 0,
E = LHSVals.
size();
I !=
E; ++
I) {
672 Value *
LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[
I], I32Ty)
673 : Builder.CreateZExtOrTrunc(LHSVals[
I], I32Ty);
674 Value *
RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[
I], I32Ty)
675 : Builder.CreateZExtOrTrunc(RHSVals[
I], I32Ty);
677 IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
679 Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
680 : Builder.CreateZExtOrTrunc(Result, DstTy);
686 I.replaceAllUsesWith(NewVal);
696 if (
SelectInst *Sel = dyn_cast<SelectInst>(V))
699 if ((Cast = dyn_cast<CastInst>(V))) {
707bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(
BinaryOperator &BO)
const {
728 if (!CBO || !CT || !CF)
743 if (!FoldedT || isa<ConstantExpr>(FoldedT))
749 if (!FoldedF || isa<ConstantExpr>(FoldedF))
754 if (
const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
755 Builder.setFastMathFlags(FPOp->getFastMathFlags());
768std::pair<Value *, Value *>
769AMDGPUCodeGenPrepareImpl::getFrexpResults(
IRBuilder<> &Builder,
771 Type *Ty = Src->getType();
785 return {FrexpMant, FrexpExp};
791 bool IsNegative)
const {
806 auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
809 return Builder.
CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
819 if (HasFP32DenormalFlush &&
ST->hasFractBug() && !
ST->hasFastFMAF32() &&
825 auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS);
830 auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS);
843 Type *Ty = Src->getType();
847 Builder.
CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
850 Value *InputScaleFactor =
857 Value *OutputScaleFactor =
859 return Builder.
CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});
870 Type *Ty = Src->getType();
874 Builder.
CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
875 Constant *One = ConstantFP::get(Ty, 1.0);
876 Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
878 ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);
885 NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);
887 return Builder.
CreateFMul(Rsq, OutputScaleFactor);
890bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(
const FPMathOperator *SqrtOp,
898 return SqrtFMF.
approxFunc() || HasUnsafeFPMath ||
902Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
911 const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num);
917 bool IsNegative =
false;
926 canIgnoreDenormalInput(Den, CtxI)) {
953 if (
const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
954 bool IsNegative =
false;
959 if (HasFP32DenormalFlush || FMF.
approxFunc()) {
980 return emitRcpIEEE1ULP(Builder, Src, IsNegative);
989 if (HasFP32DenormalFlush || FMF.
approxFunc()) {
994 Value *Recip = emitRcpIEEE1ULP(Builder, Den,
false);
1008Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
1011 if (ReqdAccuracy < 2.5f)
1017 bool NumIsOne =
false;
1018 if (
const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
1019 if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
1027 if (!HasFP32DenormalFlush && !NumIsOne)
1030 return Builder.
CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {}, {Num, Den});
1033Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
1036 float ReqdDivAccuracy)
const {
1039 optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);
1044 Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);
1052 Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);
1056 return emitFrexpDiv(Builder, Num, Den, DivFMF);
1075 if (DisableFDivExpand)
1094 Value *RsqOp =
nullptr;
1095 auto *DenII = dyn_cast<IntrinsicInst>(Den);
1096 if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
1097 DenII->hasOneUse()) {
1098 const auto *SqrtOp = cast<FPMathOperator>(DenII);
1100 if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF))
1113 const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.
approxFunc();
1114 if (!RsqOp && AllowInaccurateRcp)
1118 if (ReqdAccuracy < 1.0f)
1135 for (
int I = 0,
E = NumVals.
size();
I !=
E; ++
I) {
1136 Value *NumElt = NumVals[
I];
1137 Value *DenElt = DenVals[
I];
1138 Value *RsqDenElt = RsqOp ? RsqDenVals[
I] :
nullptr;
1141 visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
1142 cast<Instruction>(FPOp), ReqdAccuracy);
1149 if (
auto *NewEltInst = dyn_cast<Instruction>(NewElt))
1150 NewEltInst->copyMetadata(FDiv);
1153 ResultVals[
I] = NewElt;
1168 Attribute Attr =
F.getFnAttribute(
"unsafe-fp-math");
1183 return std::pair(
Lo,
Hi);
1194 Value *Den,
unsigned AtLeast,
1195 bool IsSigned)
const {
1198 if (LHSSignBits < AtLeast)
1202 if (RHSSignBits < AtLeast)
1205 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1216 Value *Den,
bool IsDiv,
1217 bool IsSigned)
const {
1220 unsigned AtLeast = (SSBits <= 24) ? 0 : (SSBits - 24 + IsSigned);
1221 int DivBits = getDivNumBits(
I, Num, Den, AtLeast, IsSigned);
1224 return expandDivRem24Impl(Builder,
I, Num, Den, DivBits, IsDiv, IsSigned);
1227Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
1229 unsigned DivBits,
bool IsDiv,
bool IsSigned)
const {
1276 auto FMAD = !
ST->hasMadMacF32Insts()
1280 {FQNeg->
getType()}, {FQNeg, FB, FA}, FQ);
1308 if (DivBits != 0 && DivBits < 32) {
1311 int InRegBits = 32 - DivBits;
1313 Res = Builder.
CreateShl(Res, InRegBits);
1317 = Builder.
getInt32((UINT64_C(1) << DivBits) - 1);
1318 Res = Builder.
CreateAnd(Res, TruncMask);
1329bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(
BinaryOperator &
I,
1332 if (
Constant *
C = dyn_cast<Constant>(Den)) {
1335 if (
C->getType()->getScalarSizeInBits() <= 32)
1351 if (BinOpDen->getOpcode() == Instruction::Shl &&
1352 isa<Constant>(BinOpDen->getOperand(0)) &&
1376 assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
1377 Opc == Instruction::SRem || Opc == Instruction::SDiv);
1383 if (divHasSpecialOptimization(
I,
X,
Y))
1386 bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
1387 bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
1389 Type *Ty =
X->getType();
1403 if (
Value *Res = expandDivRem24(Builder,
I,
X,
Y, IsDiv, IsSigned)) {
1411 Value *Sign =
nullptr;
1416 Sign = IsDiv ? Builder.
CreateXor(SignX, SignY) : SignX;
1460 Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast<float>(0x4F7FFFFE));
1500 if (!ExpandDiv64InIR && divHasSpecialOptimization(
I, Num, Den))
1505 bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
1506 bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
1508 int NumDivBits = getDivNumBits(
I, Num, Den, 32, IsSigned);
1509 if (NumDivBits == -1)
1512 Value *Narrowed =
nullptr;
1513 if (NumDivBits <= 24) {
1514 Narrowed = expandDivRem24Impl(Builder,
I, Num, Den, NumDivBits,
1516 }
else if (NumDivBits <= 32) {
1517 Narrowed = expandDivRem32(Builder,
I, Num, Den);
1528void AMDGPUCodeGenPrepareImpl::expandDivRem64(
BinaryOperator &
I)
const {
1531 if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
1536 if (Opc == Instruction::URem || Opc == Instruction::SRem) {
1544bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(
BinaryOperator &
I) {
1545 if (foldBinOpIntoSelect(
I))
1548 if (
ST->has16BitInsts() && needsPromotionToI32(
I.getType()) &&
1549 UA->isUniform(&
I) && promoteUniformOpToI32(
I))
1552 if (UseMul24Intrin && replaceMulWithMul24(
I))
1555 bool Changed =
false;
1557 Type *Ty =
I.getType();
1558 Value *NewDiv =
nullptr;
1563 if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
1564 Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
1566 !DisableIDivExpand) {
1567 Value *Num =
I.getOperand(0);
1568 Value *Den =
I.getOperand(1);
1572 if (
auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1575 for (
unsigned N = 0,
E = VT->getNumElements();
N !=
E; ++
N) {
1580 if (ScalarSize <= 32) {
1581 NewElt = expandDivRem32(Builder,
I, NumEltN, DenEltN);
1583 NewElt = Builder.
CreateBinOp(Opc, NumEltN, DenEltN);
1587 NewElt = shrinkDivRem64(Builder,
I, NumEltN, DenEltN);
1592 NewElt = Builder.
CreateBinOp(Opc, NumEltN, DenEltN);
1593 Div64ToExpand.
push_back(cast<BinaryOperator>(NewElt));
1600 if (ScalarSize <= 32)
1601 NewDiv = expandDivRem32(Builder,
I, Num, Den);
1603 NewDiv = shrinkDivRem64(Builder,
I, Num, Den);
1610 I.replaceAllUsesWith(NewDiv);
1611 I.eraseFromParent();
1616 if (ExpandDiv64InIR) {
1619 expandDivRem64(*Div);
1628bool AMDGPUCodeGenPrepareImpl::visitLoadInst(
LoadInst &
I) {
1634 canWidenScalarExtLoad(
I)) {
1644 if (
auto *Range = WidenLoad->
getMetadata(LLVMContext::MD_range)) {
1646 mdconst::extract<ConstantInt>(
Range->getOperand(0));
1648 if (
Lower->isNullValue()) {
1649 WidenLoad->
setMetadata(LLVMContext::MD_range,
nullptr);
1666 I.replaceAllUsesWith(ValOrig);
1667 I.eraseFromParent();
1674bool AMDGPUCodeGenPrepareImpl::visitICmpInst(
ICmpInst &
I) {
1675 bool Changed =
false;
1677 if (
ST->has16BitInsts() && needsPromotionToI32(
I.getOperand(0)->getType()) &&
1679 Changed |= promoteUniformOpToI32(
I);
1684bool AMDGPUCodeGenPrepareImpl::visitSelectInst(
SelectInst &
I) {
1691 if (
ST->has16BitInsts() && needsPromotionToI32(
I.getType())) {
1692 if (UA->isUniform(&
I))
1693 return promoteUniformOpToI32(
I);
1708 auto *IITrue = dyn_cast<IntrinsicInst>(TrueVal);
1709 auto *IIFalse = dyn_cast<IntrinsicInst>(FalseVal);
1711 Value *Fract =
nullptr;
1712 if (Pred == FCmpInst::FCMP_UNO && TrueVal == CmpVal && IIFalse &&
1713 CmpVal == matchFractPat(*IIFalse)) {
1715 Fract = applyFractPat(Builder, CmpVal);
1716 }
else if (Pred == FCmpInst::FCMP_ORD && FalseVal == CmpVal && IITrue &&
1717 CmpVal == matchFractPat(*IITrue)) {
1719 Fract = applyFractPat(Builder, CmpVal);
1724 I.replaceAllUsesWith(Fract);
1730 const auto *IA = dyn_cast<Instruction>(
A);
1731 const auto *IB = dyn_cast<Instruction>(
B);
1732 return IA && IB && IA->getParent() == IB->getParent();
1738 const auto *FVT = dyn_cast<FixedVectorType>(V->getType());
1742 const Value *CurVal = V;
1745 BitVector EltsCovered(FVT->getNumElements());
1746 while (
const auto *IE = dyn_cast<InsertElementInst>(CurVal)) {
1747 const auto *
Idx = dyn_cast<ConstantInt>(IE->getOperand(2));
1752 if (!
Idx ||
Idx->getSExtValue() >= FVT->getNumElements())
1755 const auto *VecSrc = IE->getOperand(0);
1760 if (isa<Instruction>(VecSrc) && !
areInSameBB(VecSrc, IE))
1764 EltsCovered.
set(
Idx->getSExtValue());
1767 if (EltsCovered.
all())
1776 if (isa<Constant>(CurVal))
1783 if (
const auto *SV = dyn_cast<ShuffleVectorInst>(CurVal)) {
1784 return isa<Constant>(SV->getOperand(1)) ||
1794 const auto [It, Inserted] = SeenPHIs.
insert(&
I);
1798 for (
const Value *Inc :
I.incoming_values()) {
1799 if (
const auto *PhiInc = dyn_cast<PHINode>(Inc))
1803 for (
const User *U :
I.users()) {
1804 if (
const auto *PhiU = dyn_cast<PHINode>(U))
1809bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(
const PHINode &
I) {
1811 if (
const auto It = BreakPhiNodesCache.find(&
I);
1812 It != BreakPhiNodesCache.end())
1827 for (
const PHINode *WLP : WorkList) {
1828 assert(BreakPhiNodesCache.count(WLP) == 0);
1843 const auto Threshold = (
alignTo(WorkList.size() * 2, 3) / 3);
1844 unsigned NumBreakablePHIs = 0;
1845 bool CanBreak =
false;
1846 for (
const PHINode *Cur : WorkList) {
1854 if (++NumBreakablePHIs >= Threshold) {
1861 for (
const PHINode *Cur : WorkList)
1862 BreakPhiNodesCache[Cur] = CanBreak;
1911 Value *&Res = SlicedVals[{BB, Inc}];
1916 if (
Instruction *IncInst = dyn_cast<Instruction>(Inc))
1917 B.SetCurrentDebugLocation(IncInst->getDebugLoc());
1923 Res =
B.CreateShuffleVector(Inc, Mask, NewValName);
1925 Res =
B.CreateExtractElement(Inc,
Idx, NewValName);
1934bool AMDGPUCodeGenPrepareImpl::visitPHINode(
PHINode &
I) {
1950 DL->getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)
1953 if (!ForceBreakLargePHIs && !canBreakPHINode(
I))
1956 std::vector<VectorSlice> Slices;
1963 const unsigned EltSize =
DL->getTypeSizeInBits(EltTy);
1965 if (EltSize == 8 || EltSize == 16) {
1966 const unsigned SubVecSize = (32 / EltSize);
1970 Slices.emplace_back(SubVecTy,
Idx, SubVecSize);
1974 for (;
Idx < NumElts; ++
Idx)
1975 Slices.emplace_back(EltTy,
Idx, 1);
1978 assert(Slices.size() > 1);
1984 B.SetCurrentDebugLocation(
I.getDebugLoc());
1986 unsigned IncNameSuffix = 0;
1990 B.SetInsertPoint(
I.getParent()->getFirstNonPHI());
1991 S.NewPHI =
B.CreatePHI(S.Ty,
I.getNumIncomingValues());
1994 S.NewPHI->addIncoming(S.getSlicedVal(BB,
I.getIncomingValue(
Idx),
1995 "largephi.extractslice" +
1996 std::to_string(IncNameSuffix++)),
2003 unsigned NameSuffix = 0;
2005 const auto ValName =
"largephi.insertslice" + std::to_string(NameSuffix++);
2008 B.CreateInsertVector(FVT, Vec, S.NewPHI,
B.getInt64(S.Idx), ValName);
2010 Vec =
B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
2013 I.replaceAllUsesWith(Vec);
2014 I.eraseFromParent();
2028 if (isa<BlockAddress>(V) || isa<GlobalValue>(V) || isa<AllocaInst>(V))
2032 if (
const auto *Arg = dyn_cast<Argument>(V); Arg && Arg->hasNonNullAttr())
2044 const auto NullVal =
TM.getNullPointerValue(AS);
2045 assert((NullVal == 0 || NullVal == -1) &&
2046 "don't know how to check for this null value!");
2047 return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
2054 if (
I.getType()->isVectorTy())
2059 const unsigned SrcAS =
I.getSrcAddressSpace();
2060 const unsigned DstAS =
I.getDestAddressSpace();
2062 bool CanLower =
false;
2080 auto *Intrin =
B.CreateIntrinsic(
2081 I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
2082 I.replaceAllUsesWith(Intrin);
2083 I.eraseFromParent();
2087bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(
IntrinsicInst &
I) {
2088 switch (
I.getIntrinsicID()) {
2089 case Intrinsic::bitreverse:
2090 return visitBitreverseIntrinsicInst(
I);
2091 case Intrinsic::minnum:
2092 return visitMinNum(
I);
2093 case Intrinsic::sqrt:
2094 return visitSqrt(
I);
2100bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(
IntrinsicInst &
I) {
2101 bool Changed =
false;
2103 if (
ST->has16BitInsts() && needsPromotionToI32(
I.getType()) &&
2105 Changed |= promoteUniformBitreverseToI32(
I);
2116 if (
ST->hasFractBug())
2119 if (
I.getIntrinsicID() != Intrinsic::minnum)
2122 Type *Ty =
I.getType();
2126 Value *Arg0 =
I.getArgOperand(0);
2127 Value *Arg1 =
I.getArgOperand(1);
2135 One.convert(
C->getSemantics(), APFloat::rmNearestTiesToEven, &LosesInfo);
2144 m_Intrinsic<Intrinsic::floor>(
m_Deferred(FloorSrc)))))
2157 for (
unsigned I = 0,
E = FractVals.
size();
I !=
E; ++
I) {
2166 Value *FractArg = matchFractPat(
I);
2172 if (!
I.hasNoNaNs() &&
2181 Value *Fract = applyFractPat(Builder, FractArg);
2183 I.replaceAllUsesWith(Fract);
2195bool AMDGPUCodeGenPrepareImpl::visitSqrt(
IntrinsicInst &Sqrt) {
2211 if (ReqdAccuracy < 1.0f)
2219 if (FDiv && FDiv->
getOpcode() == Instruction::FDiv &&
2220 FDiv->getFPAccuracy() >= 1.0f &&
2227 bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);
2231 if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
2239 for (
int I = 0,
E = SrcVals.
size();
I !=
E; ++
I) {
2241 ResultVals[
I] = Builder.
CreateCall(getSqrtF32(), SrcVals[
I]);
2243 ResultVals[
I] = emitSqrtIEEE2ULP(Builder, SrcVals[
I], SqrtFMF);
2253bool AMDGPUCodeGenPrepare::doInitialization(
Module &M) {
2255 Impl.DL = &Impl.Mod->getDataLayout();
2256 Impl.SqrtF32 =
nullptr;
2257 Impl.LdexpF32 =
nullptr;
2261bool AMDGPUCodeGenPrepare::runOnFunction(
Function &
F) {
2262 if (skipFunction(
F))
2265 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
2271 Impl.TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
F);
2273 Impl.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
F);
2274 Impl.UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
2275 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
2276 Impl.DT = DTWP ? &DTWP->getDomTree() :
nullptr;
2279 Impl.HasFP32DenormalFlush =
2286 AMDGPUCodeGenPrepareImpl Impl;
2287 Impl.Mod =
F.getParent();
2288 Impl.DL = &Impl.Mod->getDataLayout();
2297 Impl.HasFP32DenormalFlush =
2300 if (!Impl.FlowChanged)
2306 "AMDGPU IR optimizations",
false,
false)
2313char AMDGPUCodeGenPrepare::
ID = 0;
2316 return new AMDGPUCodeGenPrepare();
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool promotedOpIsNSW(const Instruction &I)
static Value * insertValues(IRBuilder<> &Builder, Type *Ty, SmallVectorImpl< Value * > &Values)
static bool promotedOpIsNUW(const Instruction &I)
static bool isOneOrNegOne(const Value *Val)
static void extractValues(IRBuilder<> &Builder, SmallVectorImpl< Value * > &Values, Value *V)
static Value * getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS)
static bool isInterestingPHIIncomingValue(const Value *V)
static SelectInst * findSelectThroughCast(Value *V, CastInst *&Cast)
static std::pair< Value *, Value * > getMul64(IRBuilder<> &Builder, Value *LHS, Value *RHS)
static bool hasUnsafeFPMath(const Function &F)
static Value * emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src, bool IsNegative)
Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
static Value * getSign32(Value *V, IRBuilder<> &Builder, const DataLayout *DL)
static void collectPHINodes(const PHINode &I, SmallPtrSet< const PHINode *, 8 > &SeenPHIs)
static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL, const AMDGPUTargetMachine &TM, unsigned AS)
static bool areInSameBB(const Value *A, const Value *B)
static cl::opt< bool > WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads", cl::desc("Widen sub-dword constant address space loads in " "AMDGPULateCodeGenPrepare"), cl::ReallyHidden, cl::init(true))
The AMDGPU TargetMachine interface definition for hw codegen targets.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Legalize the Machine IR a function s Machine IR
Generic memory optimizations
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
FunctionAnalysisManager FAM
const char LLVMTargetMachineRef TM
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static cl::opt< cl::boolOrDefault > EnableGlobalISelOption("global-isel", cl::Hidden, cl::desc("Enable the \"global\" instruction selector"))
Target-Independent Code Generator Pass Configuration Options pass.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
support::ulittle16_t & Lo
support::ulittle16_t & Hi
Helper class for "break large PHIs" (visitPHINode).
VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
Value * getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName)
Slice Inc according to the information contained within this slice.
PreservedAnalyses run(Function &, FunctionAnalysisManager &)
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
This class represents a conversion between pointers from one address space to another.
A container for analyses that lazily runs them and caches their results.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesAll()
Set by analyses that do not transform their input at all.
A function analysis which provides an AssumptionCache.
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::iterator iterator
Instruction iterators...
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
BinaryOps getOpcode() const
bool all() const
all - Returns true if all bits are set.
Represents analyses that only rely on functions' control flow.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Instruction::CastOps getOpcode() const
Return the opcode of this CastInst.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
ConstantFP - Floating Point Values [float, double].
bool isExactlyValue(const APFloat &V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
This is the shared class of boolean and integer constants.
This is an important base class in LLVM.
static Constant * getAllOnesValue(Type *Ty)
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Analysis pass which computes a DominatorTree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Utility class for floating point operations which can have information about relaxed accuracy require...
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
float getFPAccuracy() const
Get the maximum error permitted by this operation in ULPs.
Convenience struct for specifying and reasoning about fast-math flags.
void setFast(bool B=true)
bool allowReciprocal() const
void setNoNaNs(bool B=true)
bool allowContract() const
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
FunctionPass class - This class is used to implement most global optimizations.
virtual bool runOnFunction(Function &F)=0
runOnFunction - Virtual method overridden by subclasses to do the per-function processing of the pass.
BasicBlockListType::iterator iterator
This instruction compares its operands according to the predicate given to the constructor.
CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="")
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Value * CreateFDiv(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateSIToFP(Value *V, Type *DestTy, const Twine &Name="")
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFPToUI(Value *V, Type *DestTy, const Twine &Name="")
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Value * CreateFCmpOLT(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Value * CreateUIToFP(Value *V, Type *DestTy, const Twine &Name="")
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
FastMathFlags getFastMathFlags() const
Get the flags to be applied to created floating point ops.
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateFNeg(Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a SExt or Trunc from the integer value V to DestTy.
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Value * CreateFCmpOGE(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateFPToSI(Value *V, Type *DestTy, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Base class for instruction visitors.
RetTy visitIntrinsicInst(IntrinsicInst &I)
RetTy visitPHINode(PHINode &I)
RetTy visitAddrSpaceCastInst(AddrSpaceCastInst &I)
RetTy visitBinaryOperator(BinaryOperator &I)
RetTy visitICmpInst(ICmpInst &I)
RetTy visitSelectInst(SelectInst &I)
void visitInstruction(Instruction &I)
RetTy visitLoadInst(LoadInst &I)
void copyFastMathFlags(FastMathFlags FMF)
Convenience function for transferring all fast-math flag values to this instruction,...
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
const BasicBlock * getParent() const
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
An instruction for reading from memory.
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
A Module instance is used to store all the information related to an LLVM module.
LLVMContext & getContext() const
Get the global data context.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overridden by passes that need analysis information to do t...
virtual bool doInitialization(Module &)
doInitialization - Virtual method overridden by subclasses to do any necessary initialization before ...
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserveSet()
Mark an analysis set as preserved.
This class represents the LLVM 'select' instruction.
const Value * getFalseValue() const
const Value * getCondition() const
const Value * getTrueValue() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
The instances of the Type class are immutable: once they are created, they are never changed.
const fltSemantics & getFltSemantics() const
bool isVectorTy() const
True if this is an instance of VectorType.
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
static IntegerType * getInt32Ty(LLVMContext &C)
static Type * getFloatTy(LLVMContext &C)
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
BinaryOp_match< LHS, RHS, Instruction::FSub > m_FSub(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
CmpClass_match< LHS, RHS, FCmpInst, FCmpInst::Predicate > m_FCmp(FCmpInst::Predicate &Pred, const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
cstfp_pred_ty< is_nonnan > m_NonNaN()
Match a non-NaN FP constant.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
apfloat_match m_APFloat(const APFloat *&Res)
Match a ConstantFP or splatted ConstantVector, binding the specified pointer to the contained APFloat...
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
bool expandRemainderUpTo64Bits(BinaryOperator *Rem)
Generate code to calculate the remainder of two integers, replacing Rem with the generated code.
bool isKnownToBeAPowerOfTwo(const Value *V, const DataLayout &DL, bool OrZero=false, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return true if the given value is known to have exactly one bit set when defined.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
bool expandDivisionUpTo64Bits(BinaryOperator *Div)
Generate code to divide two integers, replacing Div with the generated code.
FPClassTest
Floating-point class tests, supported by 'is_fpclass' intrinsic.
Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
Constant * ConstantFoldBinaryOpOperands(unsigned Opcode, Constant *LHS, Constant *RHS, const DataLayout &DL)
Attempt to constant fold a binary operation with the specified operands.
void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, LoopInfo *LI=nullptr, unsigned MaxLookup=6)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
FunctionPass * createAMDGPUCodeGenPreparePass()
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
void initializeAMDGPUCodeGenPreparePass(PassRegistry &)
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value and is Skew mod Align.
bool isKnownNeverNaN(const Value *V, unsigned Depth, const SimplifyQuery &SQ)
Return true if the floating-point scalar value is not a NaN or if the floating-point vector value has...
KnownFPClass computeKnownFPClass(const Value *V, const APInt &DemandedElts, FPClassTest InterestedClasses, unsigned Depth, const SimplifyQuery &SQ)
Determine which floating-point classes are valid for V, and return them in KnownFPClass bit sets.
unsigned ComputeMaxSignificantBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr)
Get the upper bound on bit size for this Value Op as a signed integer.
CGPassBuilderOption getCGPassBuilderOption()
This struct is a compact representation of a valid (non-zero power of two) alignment.
static constexpr DenormalMode getPreserveSign()
bool isNonNegative() const
Returns true if this value is known to be non-negative.
bool isNegative() const
Returns true if this value is known to be negative.
bool isKnownNeverSubnormal() const
Return true if it's known this can never be a subnormal.