#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-codegenprepare"
39 "amdgpu-codegenprepare-widen-constant-loads",
40 cl::desc(
"Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
45 "amdgpu-codegenprepare-widen-16-bit-ops",
46 cl::desc(
"Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
51 "amdgpu-codegenprepare-mul24",
52 cl::desc(
"Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
58 "amdgpu-codegenprepare-expand-div64",
59 cl::desc(
"Expand 64-bit division in AMDGPUCodeGenPrepare"),
66 "amdgpu-codegenprepare-disable-idiv-expansion",
67 cl::desc(
"Prevent expanding integer division in AMDGPUCodeGenPrepare"),
  bool HasUnsafeFPMath = false;
  bool HasFP32Denormals = false;

  unsigned getBaseElementBitWidth(const Type *T) const;
  bool needsPromotionToI32(const Type *T) const;
  bool promoteUniformOpToI32(ICmpInst &I) const;
  unsigned numBitsUnsigned(Value *Op) const;
  unsigned numBitsSigned(Value *Op) const;
  int getDivNumBits(BinaryOperator &I, Value *Num, Value *Den,
                    unsigned AtLeast, bool Signed) const;
  Value *expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I, Value *Num,
                        Value *Den, bool IsDiv, bool IsSigned) const;
  Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
                            Value *Num, Value *Den, unsigned DivBits,
                            bool IsDiv, bool IsSigned) const;
  bool canWidenScalarExtLoad(LoadInst &I) const;
    // Division expansion in IR creates new blocks, so analyses are only
    // preserved when that expansion is disabled.
    if (!ExpandDiv64InIR)
      AU.setPreservesAll();
unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return FixedVectorType::get(B.getInt32Ty(),
                              cast<FixedVectorType>(T)->getNumElements());
}
bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
         I.getOpcode() == Instruction::SDiv ||
         I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
         cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}
bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  if (!Widen16BitOps)
    return false;

  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // Packed 16-bit operations stay 16-bit on targets with VOP3P instructions.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}
bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  const DataLayout &DL = Mod->getDataLayout();
  int TySize = DL.getTypeSizeInBits(Ty);
  Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);

  return I.isSimple() && TySize < 32 && Alignment >= 4 && UA->isUniform(&I);
}
bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv ||
      I.getOpcode() == Instruction::SRem ||
      I.getOpcode() == Instruction::URem)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}
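// Roughly, the promotion rewrites a uniform 16-bit operation into its 32-bit
// form so it can be selected to a scalar (SALU) instruction, e.g. (modulo
// value names):
//   %r = add i16 %a, %b
// becomes
//   %a32 = zext i16 %a to i32
//   %b32 = zext i16 %b to i32
//   %r32 = add nuw i32 %a32, %b32
//   %r   = trunc i32 %r32 to i16
// The nsw/nuw/exact flags are only re-attached when promotedOpIsNSW /
// promotedOpIsNUW say the promoted form preserves them.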
bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}
bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }

  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}
bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}
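// Worked example of the shift correction above: bitreversing an i16 by
// zero-extending to i32 and calling the i32 bitreverse leaves the interesting
// bits in the high half of the result, so it is shifted right by
// 32 - 16 = 16 before truncating back to i16.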
unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op) const {
  return computeKnownBits(Op, *DL, 0, AC).countMaxActiveBits();
}

unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op) const {
  return ComputeMaxSignificantBits(Op, *DL, 0, AC);
}
static void extractValues(IRBuilder<> &Builder,
                          SmallVectorImpl<Value *> &Values, Value *V) {
  auto *VT = dyn_cast<FixedVectorType>(V->getType());
  if (!VT) {
    Values.push_back(V);
    return;
  }

  for (int I = 0, E = VT->getNumElements(); I != E; ++I)
    Values.push_back(Builder.CreateExtractElement(V, I));
}

static Value *insertValues(IRBuilder<> &Builder, Type *Ty,
                           SmallVectorImpl<Value *> &Values) {
  if (!Ty->isVectorTy()) {
    assert(Values.size() == 1);
    return Values[0];
  }

  Value *NewVal = PoisonValue::get(Ty);
  for (int I = 0, E = Values.size(); I != E; ++I)
    NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);

  return NewVal;
}
static Value *getMul24(IRBuilder<> &Builder, Value *LHS, Value *RHS,
                       unsigned Size, unsigned NumBits, bool IsSigned) {
  if (Size <= 32 || NumBits <= 32) {
    Intrinsic::ID ID =
        IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
    return Builder.CreateIntrinsic(ID, {}, {LHS, RHS});
  }

  Intrinsic::ID LoID =
      IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
  Intrinsic::ID HiID =
      IsSigned ? Intrinsic::amdgcn_mulhi_i24 : Intrinsic::amdgcn_mulhi_u24;

  Value *Lo = Builder.CreateIntrinsic(LoID, {}, {LHS, RHS});
  Value *Hi = Builder.CreateIntrinsic(HiID, {}, {LHS, RHS});

  IntegerType *I64Ty = Builder.getInt64Ty();
  Lo = Builder.CreateZExt(Lo, I64Ty);
  Hi = Builder.CreateZExt(Hi, I64Ty);
  return Builder.CreateOr(Lo, Builder.CreateShl(Hi, 32));
}
bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
  if (I.getOpcode() != Instruction::Mul)
    return false;

  Type *Ty = I.getType();
  unsigned Size = Ty->getScalarSizeInBits();
  if (Size <= 16 && ST->has16BitInsts())
    return false;

  // Prefer scalar if this could be s_mul_i32.
  if (UA->isUniform(&I))
    return false;

  Value *LHS = I.getOperand(0);
  Value *RHS = I.getOperand(1);
  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  unsigned LHSBits = 0, RHSBits = 0;
  bool IsSigned = false;

  if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&
      (RHSBits = numBitsUnsigned(RHS)) <= 24) {
    IsSigned = false;
  } else if (ST->hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&
             (RHSBits = numBitsSigned(RHS)) <= 24) {
    IsSigned = true;
  } else
    return false;

  SmallVector<Value *, 4> LHSVals;
  SmallVector<Value *, 4> RHSVals;
  SmallVector<Value *, 4> ResultVals;
  extractValues(Builder, LHSVals, LHS);
  extractValues(Builder, RHSVals, RHS);

  IntegerType *I32Ty = Builder.getInt32Ty();
  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
    Value *LHS, *RHS;
    if (IsSigned) {
      LHS = Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty);
      RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty);
    } else {
      LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
      RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
    }

    Value *Result =
        getMul24(Builder, LHS, RHS, Size, LHSBits + RHSBits, IsSigned);

    if (IsSigned) {
      ResultVals.push_back(
          Builder.CreateSExtOrTrunc(Result, LHSVals[I]->getType()));
    } else {
      ResultVals.push_back(
          Builder.CreateZExtOrTrunc(Result, LHSVals[I]->getType()));
    }
  }

  Value *NewVal = insertValues(Builder, Ty, ResultVals);
  NewVal->takeName(&I);
  I.replaceAllUsesWith(NewVal);
  I.eraseFromParent();

  return true;
}
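// The 24-bit multiply intrinsics map to the full-rate v_mul_u32_u24 /
// v_mul_i32_i24 (and matching mulhi) instructions, which only consume the low
// 24 bits of each operand. The numBitsUnsigned / numBitsSigned checks above
// guarantee both operands already fit in 24 bits, so the rewrite is
// value-preserving; for products wider than 32 bits getMul24 stitches the lo
// and hi halves together.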
static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
  if (SelectInst *Sel = dyn_cast<SelectInst>(V))
    return Sel;

  if ((Cast = dyn_cast<CastInst>(V))) {
    if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
      return Sel;
  }

  return nullptr;
}

bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
  // The select's arms and the other binop operand must all be constants...
  if (!CBO || !CT || !CF)
    return false;

  // ...and the binop must constant-fold on both arms without leaving a
  // constant expression behind.
  if (!FoldedT || isa<ConstantExpr>(FoldedT))
    return false;

  if (!FoldedF || isa<ConstantExpr>(FoldedF))
    return false;

  if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
    Builder.setFastMathFlags(FPOp->getFastMathFlags());
static Value *optimizeWithRcp(Value *Num, Value *Den, bool AllowInaccurateRcp,
                              bool RcpIsAccurate, IRBuilder<> &Builder,
                              Module *Mod) {
  if (!AllowInaccurateRcp && !RcpIsAccurate)
    return nullptr;

  Type *Ty = Den->getType();
  if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
    if (AllowInaccurateRcp || RcpIsAccurate) {
      // 1.0 / x -> rcp(x)
      if (CLHS->isExactlyValue(1.0)) {
        Function *Decl =
            Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, Ty);
        return Builder.CreateCall(Decl, { Den });
      }

      // -1.0 / x -> rcp(fneg x)
      if (CLHS->isExactlyValue(-1.0)) {
        Function *Decl =
            Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, Ty);
        Value *FNeg = Builder.CreateFNeg(Den);
        return Builder.CreateCall(Decl, { FNeg });
      }
    }
  }

  // x / y -> x * (1.0 / y)
  if (AllowInaccurateRcp) {
    Function *Decl =
        Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, Ty);
    Value *Recip = Builder.CreateCall(Decl, { Den });
    return Builder.CreateFMul(Num, Recip);
  }
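// Summary of the cases above: 1.0/x and -1.0/x become a single
// llvm.amdgcn.rcp call whenever rcp is either allowed to be inaccurate or is
// accurate enough for the requested precision, while a general x/y is only
// rewritten as x * rcp(y) when inaccurate rcp is explicitly allowed
// (unsafe-fp-math or the afn fast-math flag).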
static Value *optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy,
                                   bool HasDenormals, IRBuilder<> &Builder,
                                   Module *Mod) {
  // fdiv.fast can achieve 2.5 ULP accuracy.
  if (ReqdAccuracy < 2.5f)
    return nullptr;

  bool NumIsOne = false;
  if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
    if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
      NumIsOne = true;
  }

  // fdiv.fast does not support denormals, but 1.0 / x is always fine.
  if (HasDenormals && !NumIsOne)
    return nullptr;

  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
  return Builder.CreateCall(Decl, { Num, Den });
}
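// llvm.amdgcn.fdiv.fast is only about 2.5 ULP accurate and does not handle
// f32 denormal inputs, so it is used only when the !fpmath metadata asks for
// 2.5 ULP or worse and either denormals are flushed or the numerator is
// +/-1.0.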
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType()->getScalarType();

  // The f64 rcp/rsq approximations are too inaccurate; handle f64 in codegen.
  if (Ty->isDoubleTy())
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  const float ReqdAccuracy = FPOp->getFPAccuracy();

  FastMathFlags FMF = FPOp->getFastMathFlags();
  const bool AllowInaccurateRcp = HasUnsafeFPMath || FMF.approxFunc();

  // rcp_f16 is accurate for !fpmath >= 1.0ulp; rcp_f32 only when denormals
  // are flushed.
  const bool RcpIsAccurate = (Ty->isHalfTy() && ReqdAccuracy >= 1.0f) ||
                             (Ty->isFloatTy() && !HasFP32Denormals &&
                              ReqdAccuracy >= 1.0f);

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;
  if (auto *VT = dyn_cast<FixedVectorType>(FDiv.getType())) {
    NewFDiv = PoisonValue::get(VT);

    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);

      // Try rcp first, then fdiv.fast, otherwise keep a plain fdiv.
      Value *NewElt = optimizeWithRcp(NumEltI, DenEltI, AllowInaccurateRcp,
                                      RcpIsAccurate, Builder, Mod);
      if (!NewElt)
        NewElt = optimizeWithFDivFast(NumEltI, DenEltI, ReqdAccuracy,
                                      HasFP32Denormals, Builder, Mod);
      if (!NewElt)
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  }
bool AMDGPUCodeGenPrepare::visitXor(BinaryOperator &I) {
  // Fold: xor (llvm.amdgcn.class x, mask), -1 -> llvm.amdgcn.class x, ~mask.
  IntrinsicInst *IntrinsicCall = dyn_cast<IntrinsicInst>(I.getOperand(0));
  ConstantInt *RHS = dyn_cast<ConstantInt>(I.getOperand(1));
  if (!RHS || !IntrinsicCall || RHS->getSExtValue() != -1)
    return visitBinaryOperator(I);

  if (IntrinsicCall->getIntrinsicID() != Intrinsic::amdgcn_class ||
      !IntrinsicCall->hasOneUse())
    return visitBinaryOperator(I);

  // The mask operand must be a constant so it can be inverted.
  ConstantInt *Arg = dyn_cast<ConstantInt>(IntrinsicCall->getOperand(1));
  if (!Arg)
    return visitBinaryOperator(I);

  IntrinsicCall->setOperand(
      1, ConstantInt::get(Arg->getType(), Arg->getZExtValue() ^ 0x3ff));
  I.replaceAllUsesWith(IntrinsicCall);
  I.eraseFromParent();
  return true;
}
static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsBool();
}

static std::pair<Value *, Value *> getMul64(IRBuilder<> &Builder,
                                            Value *LHS, Value *RHS) {
  return std::pair(Lo, Hi);
}

int AMDGPUCodeGenPrepare::getDivNumBits(BinaryOperator &I, Value *Num,
                                        Value *Den, unsigned AtLeast,
                                        bool IsSigned) const {
  unsigned LHSSignBits = ComputeNumSignBits(Num, *DL, 0, AC, &I);
  if (LHSSignBits < AtLeast)
    return -1;

  unsigned RHSSignBits = ComputeNumSignBits(Den, *DL, 0, AC, &I);
  if (RHSSignBits < AtLeast)
    return -1;

  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits;
  if (IsSigned)
    ++DivBits;
  return DivBits;
}
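// Example of the estimate above: for a 32-bit udiv whose operands are both
// zero-extended from i16, ComputeNumSignBits reports at least 16 redundant
// leading bits, so DivBits is at most 16 and the narrower expansion applies.
// AtLeast = 9 keeps the 24-bit path honest for 32-bit types, while
// shrinkDivRem64 passes AtLeast = 32 so a 64-bit divide is only narrowed when
// roughly half of its bits are known to be redundant.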
Value *AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
                                            BinaryOperator &I, Value *Num,
                                            Value *Den, bool IsDiv,
                                            bool IsSigned) const {
  int DivBits = getDivNumBits(I, Num, Den, 9, IsSigned);
  if (DivBits == -1)
    return nullptr;
  return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
}
Value *AMDGPUCodeGenPrepare::expandDivRem24Impl(IRBuilder<> &Builder,
                                                BinaryOperator &I, Value *Num,
                                                Value *Den, unsigned DivBits,
                                                bool IsDiv, bool IsSigned) const {
  Type *I32Ty = Builder.getInt32Ty();
  Num = Builder.CreateTrunc(Num, I32Ty);
  Den = Builder.CreateTrunc(Den, I32Ty);

  Type *F32Ty = Builder.getFloatTy();
  ConstantInt *One = Builder.getInt32(1);
  Value *JQ = One;

  if (IsSigned) {
    // jq = ((num ^ den) >> 30) | 1, i.e. +/-1 matching the quotient's sign.
    JQ = Builder.CreateXor(Num, Den);
    JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));
    JQ = Builder.CreateOr(JQ, One);
  }

  Value *IA = Num;
  Value *IB = Den;

  Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
                       : Builder.CreateUIToFP(IA, F32Ty);
  Value *FB = IsSigned ? Builder.CreateSIToFP(IB, F32Ty)
                       : Builder.CreateUIToFP(IB, F32Ty);

  Function *RcpDecl =
      Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, F32Ty);
  Value *RCP = Builder.CreateCall(RcpDecl, { FB });
  Value *FQM = Builder.CreateFMul(FA, RCP);

  // fq = trunc(fa * rcp(fb))
  CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
  FQ->copyFastMathFlags(Builder.getFastMathFlags());

  // fr = mad(-fq, fb, fa), the residue of the estimated quotient.
  Value *FQNeg = Builder.CreateFNeg(FQ);
  auto FMAD = !ST->hasMadMacF32Insts()
                  ? Intrinsic::fma
                  : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
  Value *FR = Builder.CreateIntrinsic(FMAD,
                                      {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);

  Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
                       : Builder.CreateFPToUI(FQ, I32Ty);

  FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);
  FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);

  // Bump the quotient by jq if the residue is at least |fb|.
  Value *CV = Builder.CreateFCmpOGE(FR, FB);
  JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));
  Value *Div = Builder.CreateAdd(IQ, JQ);

  Value *Res = Div;
  if (!IsDiv) {
    // Recompute the remainder from the quotient.
    Value *Rem = Builder.CreateMul(Div, Den);
    Res = Builder.CreateSub(Num, Rem);
  }

  if (DivBits != 0 && DivBits < 32) {
    // Extend in register from the number of bits this divide really is.
    if (IsSigned) {
      int InRegBits = 32 - DivBits;
      Res = Builder.CreateShl(Res, InRegBits);
      Res = Builder.CreateAShr(Res, InRegBits);
    } else {
      ConstantInt *TruncMask
        = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
      Res = Builder.CreateAnd(Res, TruncMask);
    }
  }

  return Res;
}
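// The sequence above is the classic f32-reciprocal expansion of a small
// integer divide: q = trunc(float(a) * rcp(float(b))) can be off by one, so
// the residue r = mad(-q, b, a) is compared against |b| and the quotient is
// adjusted by jq (+1 for unsigned, +/-1 for signed) when the estimate came up
// short. This stays exact as long as both operands fit in roughly 24 bits
// (the f32 mantissa), which getDivNumBits / expandDivRem24 establish first.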
bool AMDGPUCodeGenPrepare::divHasSpecialOptimization(BinaryOperator &I,
                                                     Value *Num,
                                                     Value *Den) const {
  if (Constant *C = dyn_cast<Constant>(Den)) {
    // Arbitrary constants get a better expansion as long as a wider mulhi is
    // legal.
    if (C->getType()->getScalarSizeInBits() <= 32)
      return true;
    // Otherwise only powers of two have a better expansion.
    return isKnownToBeAPowerOfTwo(C, *DL, true, 0, AC, &I, DT);
  }

  if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
    // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2.
    if (BinOpDen->getOpcode() == Instruction::Shl &&
        isa<Constant>(BinOpDen->getOperand(0)) &&
        isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), *DL, true, 0, AC, &I,
                               DT))
      return true;
  }

  return false;
}
Value *AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
                                            BinaryOperator &I, Value *X,
                                            Value *Y) const {
  Instruction::BinaryOps Opc = I.getOpcode();
  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
         Opc == Instruction::SRem || Opc == Instruction::SDiv);

  FastMathFlags FMF;
  FMF.setFast();
  Builder.setFastMathFlags(FMF);

  if (divHasSpecialOptimization(I, X, Y))
    return nullptr; // Keep it for later optimization.

  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;

  Type *Ty = X->getType();

  // If the operands are known narrow enough, use the 24-bit float path.
  if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
    return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) :
                      Builder.CreateZExtOrTrunc(Res, Ty);
  }

  Value *Sign = nullptr;
  if (IsSigned) {
    Value *SignX = getSign32(X, Builder, DL);
    Value *SignY = getSign32(Y, Builder, DL);
    // Remainder sign is the same as LHS.
    Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;

    X = Builder.CreateAdd(X, SignX);
    Y = Builder.CreateAdd(Y, SignY);
    X = Builder.CreateXor(X, SignX);
    Y = Builder.CreateXor(Y, SignY);
  }

  // The unsigned 32-bit quotient/remainder (Res) is computed here from a
  // float rcp estimate followed by integer fixup steps.

  if (IsSigned) {
    Res = Builder.CreateXor(Res, Sign);
    Res = Builder.CreateSub(Res, Sign);
  }

  Res = Builder.CreateTrunc(Res, Ty);

  return Res;
}
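// Sign handling above follows the usual |a| / |b| trick: the operands are made
// non-negative before the unsigned expansion, the result sign for sdiv is
// sign(a) ^ sign(b) and for srem it is sign(a), and the final xor-then-subtract
// pair is the standard two's-complement conditional negate (x ^ s) - s, where
// s is either 0 or -1.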
Value *AMDGPUCodeGenPrepare::shrinkDivRem64(IRBuilder<> &Builder,
                                            BinaryOperator &I,
                                            Value *Num, Value *Den) const {
  if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
    return nullptr; // Keep it for later optimization.

  Instruction::BinaryOps Opc = I.getOpcode();
  bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
  bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;

  int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
  if (NumDivBits == -1)
    return nullptr;

  Value *Narrowed = nullptr;
  if (NumDivBits <= 24) {
    Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
                                  IsDiv, IsSigned);
  } else if (NumDivBits <= 32) {
    Narrowed = expandDivRem32(Builder, I, Num, Den);
  }

  if (Narrowed) {
    return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
                      Builder.CreateZExt(Narrowed, Num->getType());
  }

  return nullptr;
}

void AMDGPUCodeGenPrepare::expandDivRem64(BinaryOperator &I) const {
  Instruction::BinaryOps Opc = I.getOpcode();
  if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
    expandDivisionUpTo64Bits(&I);
    return;
  }

  if (Opc == Instruction::URem || Opc == Instruction::SRem) {
    expandRemainderUpTo64Bits(&I);
    return;
  }

  llvm_unreachable("not a division");
}
bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  if (foldBinOpIntoSelect(I))
    return true;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      UA->isUniform(&I) && promoteUniformOpToI32(I))
    return true;

  if (UseMul24Intrin && replaceMulWithMul24(I))
    return true;

  bool Changed = false;
  Instruction::BinaryOps Opc = I.getOpcode();
  Type *Ty = I.getType();
  Value *NewDiv = nullptr;
  unsigned ScalarSize = Ty->getScalarSizeInBits();

  SmallVector<BinaryOperator *, 8> Div64ToExpand;

  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
       Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
      ScalarSize <= 64 &&
      !DisableIDivExpand) {
    Value *Num = I.getOperand(0);
    Value *Den = I.getOperand(1);
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
      NewDiv = PoisonValue::get(VT);

      for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
        Value *NumEltN = Builder.CreateExtractElement(Num, N);
        Value *DenEltN = Builder.CreateExtractElement(Den, N);

        Value *NewElt;
        if (ScalarSize <= 32) {
          NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
          if (!NewElt)
            NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
        } else {
          // See if this 64-bit division can be shrunk to 32/24 bits before
          // producing the general expansion.
          NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
          if (!NewElt) {
            // The general 64-bit expansion introduces control flow, so just
            // insert a scalar copy and defer expanding it.
            NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
            Div64ToExpand.push_back(cast<BinaryOperator>(NewElt));
          }
        }

        NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
      }
    } else {
      if (ScalarSize <= 32)
        NewDiv = expandDivRem32(Builder, I, Num, Den);
      else {
        NewDiv = shrinkDivRem64(Builder, I, Num, Den);
        if (!NewDiv)
          Div64ToExpand.push_back(&I);
      }
    }

    if (NewDiv) {
      I.replaceAllUsesWith(NewDiv);
      I.eraseFromParent();
      Changed = true;
    }
  }

  if (ExpandDiv64InIR) {
    for (BinaryOperator *Div : Div64ToExpand) {
      expandDivRem64(*Div);
      Changed = true;
    }
  }

  return Changed;
}
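// The 64-bit cases that could not be shrunk are only queued and expanded after
// the visit, because expandDivisionUpTo64Bits / expandRemainderUpTo64Bits
// split the block and introduce new control flow. This path is gated by
// amdgpu-codegenprepare-expand-div64; with the flag off (the default) such
// divides are left for the later codegen expansion instead.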
bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
  if (!WidenLoads)
    return false;

  if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
       I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
      canWidenScalarExtLoad(I)) {
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    Type *I32Ty = Builder.getInt32Ty();
    Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
    Value *BitCast = Builder.CreateBitCast(I.getPointerOperand(), PT);
    LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, BitCast);
    WidenLoad->copyMetadata(I);

    // If we have range metadata, we need to convert the type, and not make
    // assumptions about the high bits.
    if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
      ConstantInt *Lower =
          mdconst::extract<ConstantInt>(Range->getOperand(0));

      if (Lower->isNullValue()) {
        WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
      } else {
        Metadata *LowAndHigh[] = {
            ConstantAsMetadata::get(
                ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
            // Don't make assumptions about the high bits.
            ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))};

        WidenLoad->setMetadata(LLVMContext::MD_range,
                               MDNode::get(Mod->getContext(), LowAndHigh));
      }
    }

    int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
    Type *IntNTy = Builder.getIntNTy(TySize);
    Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
    Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
    I.replaceAllUsesWith(ValOrig);
    I.eraseFromParent();
    return true;
  }

  return false;
}
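// Rationale for the widening above: scalar (SMEM) loads from the constant
// address space read whole dwords, so a uniform sub-dword load with at least
// 4-byte alignment (enforced by canWidenScalarExtLoad) can be turned into a
// 32-bit load plus a trunc without touching memory the original access was
// not already allowed to read; the range metadata is dropped or rewritten so
// it makes no claim about the now-unknown high bits.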
bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      UA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      UA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      UA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}
bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
  ST = &TM.getSubtarget<GCNSubtarget>(F);
  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();

  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DT = DTWP ? &DTWP->getDomTree() : nullptr;

  HasUnsafeFPMath = hasUnsafeFPMath(F);
  HasFP32Denormals = Mode.allFP32Denormals();

  bool MadeChange = false;

  Function::iterator NextBB;
  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
    BasicBlock *BB = &*FI;
    NextBB = std::next(FI);

    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
         I = Next) {
      Next = std::next(I);

      MadeChange |= visit(*I);

      if (Next != E) { // Control flow changed
        BasicBlock *NextInstBB = Next->getParent();
        if (NextInstBB != BB) {
          BB = NextInstBB;
          E = BB->end();
          FE = F.end();
        }
      }
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}