LLVM 23.0.0git
AMDGPUCodeGenPrepare.cpp
Go to the documentation of this file.
1//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This pass does misc. AMDGPU optimizations on IR before instruction
11/// selection.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPU.h"
16#include "AMDGPUTargetMachine.h"
18#include "llvm/ADT/SetVector.h"
26#include "llvm/IR/Dominators.h"
27#include "llvm/IR/IRBuilder.h"
28#include "llvm/IR/InstVisitor.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
31#include "llvm/IR/ValueHandle.h"
33#include "llvm/Pass.h"
39
40#define DEBUG_TYPE "amdgpu-codegenprepare"
41
42using namespace llvm;
43using namespace llvm::PatternMatch;
44
45namespace {
46
48 "amdgpu-codegenprepare-widen-constant-loads",
49 cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
51 cl::init(false));
52
// Command-line switch: "Break large PHI nodes for DAGISel".
// NOTE(review): the closing line of this declaration (presumably the default
// initializer) is missing from this listing.
53static cl::opt<bool>
54 BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
55 cl::desc("Break large PHI nodes for DAGISel"),
57
// Testing-only switch that forces large PHIs to be broken even when that is
// not considered profitable.
// NOTE(review): the closing line of this declaration is missing from this
// listing.
58static cl::opt<bool>
59 ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
60 cl::desc("For testing purposes, always break large "
61 "PHIs even if it isn't profitable."),
63
// Minimum type size, in bits, for a PHI node to count as "large".
// NOTE(review): the closing line of this declaration (and so the default
// threshold value) is missing from this listing.
64static cl::opt<unsigned> BreakLargePHIsThreshold(
65 "amdgpu-codegenprepare-break-large-phis-threshold",
66 cl::desc("Minimum type size in bits for breaking large PHI nodes"),
68
// Enables introduction of the mul24 intrinsics; on by default.
69static cl::opt<bool> UseMul24Intrin(
70 "amdgpu-codegenprepare-mul24",
71 cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
73 cl::init(true));
74
75// Legalize 64-bit division by using the generic IR expansion.
76static cl::opt<bool> ExpandDiv64InIR(
77 "amdgpu-codegenprepare-expand-div64",
78 cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
80 cl::init(false));
81
82// Leave all division operations as they are. This supersedes ExpandDiv64InIR
83// and is used for testing the legalizer.
84static cl::opt<bool> DisableIDivExpand(
85 "amdgpu-codegenprepare-disable-idiv-expansion",
86 cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
88 cl::init(false));
89
90// Disable processing of fdiv so we can better test the backend implementations.
91static cl::opt<bool> DisableFDivExpand(
92 "amdgpu-codegenprepare-disable-fdiv-expansion",
93 cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
95 cl::init(false));
96
/// Per-function implementation of the pass. It is an InstVisitor whose visit
/// methods return a bool; run() ORs those results into its return value.
97class AMDGPUCodeGenPrepareImpl
98 : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
99public:
100 Function &F;
101 const GCNSubtarget &ST;
102 const AMDGPUTargetMachine &TM;
103 const TargetLibraryInfo *TLI;
104 const UniformityInfo &UA;
105 const DataLayout &DL;
 // Query object built from DL, TLI, DT and AC in the constructor.
106 SimplifyQuery SQ;
107 const bool HasFP32DenormalFlush;
108 bool FlowChanged = false;
 // Lazily filled caches returned by getSqrtF32() / getLdexpF32().
109 mutable Function *SqrtF32 = nullptr;
110 mutable Function *LdexpF32 = nullptr;
 // Values queued by the transforms once they have been replaced; run()
 // drains this list after visiting the function.
111 mutable SmallVector<WeakVH> DeadVals;
112
 // Cleared at the start of every run().
113 DenseMap<const PHINode *, bool> BreakPhiNodesCache;
114
115 AMDGPUCodeGenPrepareImpl(Function &F, const AMDGPUTargetMachine &TM,
116 const TargetLibraryInfo *TLI, AssumptionCache *AC,
117 const DominatorTree *DT, const UniformityInfo &UA)
118 : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), UA(UA),
119 DL(F.getDataLayout()), SQ(DL, TLI, DT, AC),
120 HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals ==
 // NOTE(review): the rest of this initializer and the constructor body are
 // missing from this listing.
122
 /// Return the cached SqrtF32 function if it has already been set.
123 Function *getSqrtF32() const {
124 if (SqrtF32)
125 return SqrtF32;
126
127 LLVMContext &Ctx = F.getContext();
 // NOTE(review): the line that assigns SqrtF32 is missing from this listing;
 // only the argument list of that call survives below.
129 F.getParent(), Intrinsic::amdgcn_sqrt, {Type::getFloatTy(Ctx)});
130 return SqrtF32;
131 }
132
 /// Return the cached LdexpF32 function if it has already been set.
133 Function *getLdexpF32() const {
134 if (LdexpF32)
135 return LdexpF32;
136
137 LLVMContext &Ctx = F.getContext();
 // NOTE(review): the line that assigns LdexpF32 is missing from this listing;
 // only the argument list of that call survives below.
139 F.getParent(), Intrinsic::ldexp,
140 {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
141 return LdexpF32;
142 }
143
144 bool canBreakPHINode(const PHINode &I);
145
146 /// Return true if \p T is a legal scalar floating point type.
147 bool isLegalFloatingTy(const Type *T) const;
148
149 /// Wrapper to pass all the arguments to computeKnownFPClass
 // NOTE(review): the first line of this member's signature is missing from
 // this listing. The body forwards V and Interested together with SQ
 // specialised to CtxI.
151 const Instruction *CtxI) const {
152 return llvm::computeKnownFPClass(V, Interested,
153 SQ.getWithInstruction(CtxI));
154 }
155
 /// Always true when HasFP32DenormalFlush is set.
156 bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
157 return HasFP32DenormalFlush ||
 // NOTE(review): the right-hand operand of this `||` is missing from this
 // listing.
159 }
160
161 /// \returns The minimum number of bits needed to store the value of \p Op as an
162 /// unsigned integer. Truncating to this size and then zero-extending to
163 /// the original will not change the value.
164 unsigned numBitsUnsigned(Value *Op, const Instruction *CtxI) const;
165
166 /// \returns The minimum number of bits needed to store the value of \p Op as a
167 /// signed integer. Truncating to this size and then sign-extending to
168 /// the original size will not change the value.
169 unsigned numBitsSigned(Value *Op, const Instruction *CtxI) const;
170
171 /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
172 /// SelectionDAG has an issue where an and asserting the bits are known
173 bool replaceMulWithMul24(BinaryOperator &I) const;
174
175 /// Perform same function as equivalently named function in DAGCombiner. Since
176 /// we expand some divisions here, we need to perform this before obscuring.
177 bool foldBinOpIntoSelect(BinaryOperator &I) const;
178
179 bool divHasSpecialOptimization(BinaryOperator &I,
180 Value *Num, Value *Den) const;
181 unsigned getDivNumBits(BinaryOperator &I, Value *Num, Value *Den,
182 unsigned MaxDivBits, bool Signed) const;
183
184 /// Expands 24 bit div or rem.
185 Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
186 Value *Num, Value *Den,
187 bool IsDiv, bool IsSigned) const;
188
189 Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
190 Value *Num, Value *Den, unsigned NumBits,
191 bool IsDiv, bool IsSigned) const;
192
193 /// Expands 32 bit div or rem.
194 Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
195 Value *Num, Value *Den) const;
196
197 Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
198 Value *Num, Value *Den) const;
199 void expandDivRem64(BinaryOperator &I) const;
200
201 /// Check whether a scalar load is a candidate for widening.
202 ///
203 /// \details Widening is intended for uniform, small type loads from constant
204 /// memory: widen to a full 32-bits and then truncate the result to allow a scalar
205 /// load instead of a vector load.
206 ///
207 /// \returns True if \p I is a simple, uniform load of fewer than 32 bits with an alignment of at least 4.
208
209 bool canWidenScalarExtLoad(LoadInst &I) const;
210
211 Value *matchFractPatImpl(Value &V, const APFloat &C) const;
212 Value *matchFractPatNanAvoidant(Value &V);
213 Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);
214
 /// True when both the fdiv and the sqrt allow contraction.
215 bool canOptimizeWithRsq(FastMathFlags DivFMF, FastMathFlags SqrtFMF) const;
216
217 Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,
218 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
219 const Instruction *CtxI) const;
220
221 Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den,
222 FastMathFlags FMF, const Instruction *CtxI) const;
223 Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den,
224 float ReqdAccuracy) const;
225
 /// Expand one scalar fdiv element; returns nullptr when no expansion applies.
226 Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den,
227 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
228 Value *RsqOp, const Instruction *FDiv,
229 float ReqdAccuracy) const;
230
 /// Returns the {mantissa, exponent} values produced by frexp of \p Src.
231 std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
232 Value *Src) const;
233
234 Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src,
235 bool IsNegative) const;
236 Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS,
237 FastMathFlags FMF) const;
238 Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
239 FastMathFlags FMF) const;
240 Value *emitRsqF64(IRBuilder<> &Builder, Value *X, FastMathFlags SqrtFMF,
241 FastMathFlags DivFMF, const Instruction *CtxI,
242 bool IsNegative) const;
243
244 CallInst *createWorkitemIdX(IRBuilder<> &B) const;
245 void replaceWithWorkitemIdX(Instruction &I) const;
246 void replaceWithMaskedWorkitemIdX(Instruction &I, unsigned WaveSize) const;
247 bool tryReplaceWithWorkitemId(Instruction &I, unsigned Wave) const;
248
249 bool tryNarrowMathIfNoOverflow(Instruction *I);
250
251public:
252 bool visitFDiv(BinaryOperator &I);
253
 // Default visitor: instructions without a dedicated handler are left alone.
254 bool visitInstruction(Instruction &I) { return false; }
255 bool visitBinaryOperator(BinaryOperator &I);
256 bool visitLoadInst(LoadInst &I);
257 bool visitSelectInst(SelectInst &I);
258 bool visitPHINode(PHINode &I);
259 bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
260
261 bool visitIntrinsicInst(IntrinsicInst &I);
262 bool visitFMinLike(IntrinsicInst &I);
263 bool visitSqrt(IntrinsicInst &I);
264 bool visitLog(FPMathOperator &Log, Intrinsic::ID IID);
265 bool visitMbcntLo(IntrinsicInst &I) const;
266 bool visitMbcntHi(IntrinsicInst &I) const;
267 bool visitVectorReduceAdd(IntrinsicInst &I);
268 bool visitSaturatingAdd(IntrinsicInst &I);
 /// Visit every instruction of F; returns true if any visitor made a change.
269 bool run();
270};
271
/// FunctionPass wrapper around the implementation; its pass name is
/// "AMDGPU IR optimizations".
272class AMDGPUCodeGenPrepare : public FunctionPass {
273public:
274 static char ID;
275 AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
276 void getAnalysisUsage(AnalysisUsage &AU) const override {
 // NOTE(review): the first lines of this body are missing from this listing.
280
281 // FIXME: Division expansion needs to preserve the dominator tree.
 // Everything is reported as preserved unless 64-bit division expansion in
 // IR is enabled.
282 if (!ExpandDiv64InIR)
283 AU.setPreservesAll();
284 }
285 bool runOnFunction(Function &F) override;
286 StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
287};
288
289} // end anonymous namespace
290
/// Visit every instruction of F that is not trivially dead, walking the blocks
/// and the instructions inside each block in reverse order, then drain
/// DeadVals.
///
/// \returns true if any visitor reported a change.
291bool AMDGPUCodeGenPrepareImpl::run() {
292 BreakPhiNodesCache.clear();
293 bool MadeChange = false;
294
295 // Need to use make_early_inc_range because integer division expansion is
296 // handled by Transform/Utils, and it can delete instructions such as the
297 // terminator of the BB.
298 for (BasicBlock &BB : reverse(F)) {
299 for (Instruction &I : make_early_inc_range(reverse(BB))) {
300 if (!isInstructionTriviallyDead(&I, TLI))
301 MadeChange |= visit(I);
302 }
303 }
304
 // DeadVals holds WeakVH handles, so an entry may be null by now; hence
 // dyn_cast_or_null.
305 while (!DeadVals.empty()) {
306 if (auto *I = dyn_cast_or_null<Instruction>(DeadVals.pop_back_val()))
 // NOTE(review): the statement controlled by this `if` is missing from this
 // listing; presumably it deletes the dead instruction - confirm against the
 // real source.
308 }
309
310 return MadeChange;
311}
312
313bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
314 return Ty->isFloatTy() || Ty->isDoubleTy() ||
315 (Ty->isHalfTy() && ST.has16BitInsts());
316}
317
318bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
319 Type *Ty = I.getType();
320 int TySize = DL.getTypeSizeInBits(Ty);
321 Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);
322
323 return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniformAtDef(&I);
324}
325
326unsigned
327AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op,
328 const Instruction *CtxI) const {
329 return computeKnownBits(Op, SQ.getWithInstruction(CtxI)).countMaxActiveBits();
330}
331
332unsigned
333AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op,
334 const Instruction *CtxI) const {
335 return ComputeMaxSignificantBits(Op, SQ.DL, SQ.AC, CtxI, SQ.DT);
336}
337
338static void extractValues(IRBuilder<> &Builder,
339 SmallVectorImpl<Value *> &Values, Value *V) {
340 auto *VT = dyn_cast<FixedVectorType>(V->getType());
341 if (!VT) {
342 Values.push_back(V);
343 return;
344 }
345
346 for (int I = 0, E = VT->getNumElements(); I != E; ++I)
347 Values.push_back(Builder.CreateExtractElement(V, I));
348}
349
// NOTE(review): the first line of this definition (return type, name and
// leading parameter) is missing from this listing; judging by the call in
// replaceMulWithMul24 it is presumably insertValues(Builder, Ty, Values).
//
// Builds a value of type Ty from Values: a non-vector Ty returns the single
// element directly, a vector Ty is assembled lane by lane starting from
// poison.
351 Type *Ty,
352 SmallVectorImpl<Value *> &Values) {
353 if (!Ty->isVectorTy()) {
354 assert(Values.size() == 1);
355 return Values[0];
356 }
357
358 Value *NewVal = PoisonValue::get(Ty);
359 for (int I = 0, E = Values.size(); I != E; ++I)
360 NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);
361
362 return NewVal;
363}
364
/// Try to replace the multiply \p I with a per-element call to
/// llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.i24.
///
/// The replacement is skipped for non-mul opcodes, for element sizes of at
/// most 16 bits on subtargets with 16-bit instructions, for uniform
/// multiplies, and when neither both-unsigned nor both-signed operands are
/// known to fit in 24 bits.
///
/// \returns true if \p I was replaced and queued in DeadVals.
365bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
366 if (I.getOpcode() != Instruction::Mul)
367 return false;
368
369 Type *Ty = I.getType();
370 unsigned Size = Ty->getScalarSizeInBits();
371 if (Size <= 16 && ST.has16BitInsts())
372 return false;
373
374 // Prefer scalar if this could be s_mul_i32
375 if (UA.isUniformAtDef(&I))
376 return false;
377
378 Value *LHS = I.getOperand(0);
379 Value *RHS = I.getOperand(1);
380 IRBuilder<> Builder(&I);
381 Builder.SetCurrentDebugLocation(I.getDebugLoc());
382
 // LHSBits / RHSBits are only read inside the conditions below.
383 unsigned LHSBits = 0, RHSBits = 0;
384 bool IsSigned = false;
385
 // The unsigned form is tried first; the signed form is the fallback.
386 if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(LHS, &I)) <= 24 &&
387 (RHSBits = numBitsUnsigned(RHS, &I)) <= 24) {
388 IsSigned = false;
389
390 } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(LHS, &I)) <= 24 &&
391 (RHSBits = numBitsSigned(RHS, &I)) <= 24) {
392 IsSigned = true;
393
394 } else
395 return false;
396
397 SmallVector<Value *, 4> LHSVals;
398 SmallVector<Value *, 4> RHSVals;
399 SmallVector<Value *, 4> ResultVals;
400 extractValues(Builder, LHSVals, LHS);
401 extractValues(Builder, RHSVals, RHS);
402
403 IntegerType *I32Ty = Builder.getInt32Ty();
404 IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
405 Type *DstTy = LHSVals[0]->getType();
406
 // Per element: extend or truncate both operands to i32, call the intrinsic,
 // then convert the result back to the element type.
407 for (int I = 0, E = LHSVals.size(); I != E; ++I) {
408 Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty)
409 : Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
410 Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty)
411 : Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
 // NOTE(review): the line that declares `ID` is missing from this listing;
 // only its initializer survives below.
413 IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
414 Value *Result = Builder.CreateIntrinsic(ID, {IntrinTy}, {LHS, RHS});
415 Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
416 : Builder.CreateZExtOrTrunc(Result, DstTy);
417 ResultVals.push_back(Result);
418 }
419
420 Value *NewVal = insertValues(Builder, Ty, ResultVals);
421 NewVal->takeName(&I);
422 I.replaceAllUsesWith(NewVal);
 // The original mul is not erased here; it is queued for later cleanup.
423 DeadVals.push_back(&I);
424
425 return true;
426}
427
428// Find a select instruction, which may have been casted. This is mostly to deal
429// with cases where i16 selects were promoted here to i32.
// NOTE(review): the signature line is missing from this listing; judging by the
// callers it is presumably findSelectThroughCast(Value *V, CastInst *&Cast).
//
// Returns V itself when it is a select (Cast is set to null), the select
// feeding V when V is a cast of a select (Cast is set to that cast), and
// nullptr otherwise. When V is a cast of something else, Cast is still left
// pointing at that cast.
431 Cast = nullptr;
432 if (SelectInst *Sel = dyn_cast<SelectInst>(V))
433 return Sel;
434
435 if ((Cast = dyn_cast<CastInst>(V))) {
436 if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
437 return Sel;
438 }
439
440 return nullptr;
441}
442
/// Fold a binary operator whose one operand is a single-use select (possibly
/// behind a single-use cast) and whose other operand is a constant into a new
/// select of the two constant-folded results.
///
/// \returns true if \p BO was replaced; the old binop, cast and select are then
/// queued in DeadVals.
443bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
444 // Don't do this unless the old select is going away. We want to eliminate the
445 // binary operator, not replace a binop with a select.
446 int SelOpNo = 0;
447
448 CastInst *CastOp;
449
450 // TODO: Should probably try to handle some cases with multiple
451 // users. Duplicating the select may be profitable for division.
452 SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
453 if (!Sel || !Sel->hasOneUse()) {
454 SelOpNo = 1;
455 Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
456 }
457
458 if (!Sel || !Sel->hasOneUse())
459 return false;
460
 // NOTE(review): the lines that define CT and CF are missing from this listing;
 // presumably they are the select's true/false values as constants - confirm
 // against the real source.
 // SelOpNo ^ 1 is the index of the operand that is not the select.
463 Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
464 if (!CBO || !CT || !CF)
465 return false;
466
 // When the select was reached through a cast, apply the same cast to both
 // constants first.
467 if (CastOp) {
468 if (!CastOp->hasOneUse())
469 return false;
470 CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), DL);
471 CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), DL);
472 }
473
474 // TODO: Handle special 0/-1 cases DAG combine does, although we only really
475 // need to handle divisions here.
 // Operand order is preserved: the constant goes on the side it originally
 // occupied.
476 Constant *FoldedT =
477 SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, DL)
478 : ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, DL);
479 if (!FoldedT || isa<ConstantExpr>(FoldedT))
480 return false;
481
482 Constant *FoldedF =
483 SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, DL)
484 : ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, DL);
485 if (!FoldedF || isa<ConstantExpr>(FoldedF))
486 return false;
487
488 IRBuilder<> Builder(&BO);
489 Builder.SetCurrentDebugLocation(BO.getDebugLoc());
 // Carry the binop's fast-math flags over to the new select.
490 if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
491 Builder.setFastMathFlags(FPOp->getFastMathFlags());
492
493 Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
494 FoldedT, FoldedF);
495 NewSelect->takeName(&BO);
496 BO.replaceAllUsesWith(NewSelect);
497 DeadVals.push_back(&BO);
498 if (CastOp)
499 DeadVals.push_back(CastOp);
500 DeadVals.push_back(Sel);
501 return true;
502}
503
504std::pair<Value *, Value *>
505AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
506 Value *Src) const {
507 Type *Ty = Src->getType();
508 Value *Frexp = Builder.CreateIntrinsic(Intrinsic::frexp,
509 {Ty, Builder.getInt32Ty()}, Src);
510 Value *FrexpMant = Builder.CreateExtractValue(Frexp, {0});
511
512 // Bypass the bug workaround for the exponent result since it doesn't matter.
513 // TODO: Does the bug workaround even really need to consider the exponent
514 // result? It's unspecified by the spec.
515
516 Value *FrexpExp =
517 ST.hasFractBug()
518 ? Builder.CreateIntrinsic(Intrinsic::amdgcn_frexp_exp,
519 {Builder.getInt32Ty(), Ty}, Src)
520 : Builder.CreateExtractValue(Frexp, {1});
521 return {FrexpMant, FrexpExp};
522}
523
524/// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals.
525Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
526 Value *Src,
527 bool IsNegative) const {
528 // Same as for 1.0, but expand the sign out of the constant.
529 // -1.0 / x -> rcp (fneg x)
530 if (IsNegative)
531 Src = Builder.CreateFNeg(Src);
532
533 // The rcp instruction doesn't support denormals, so scale the input
534 // out of the denormal range and convert at the end.
535 //
536 // Expand as 2^-n * (1.0 / (x * 2^n))
537
538 // TODO: Skip scaling if input is known never denormal and the input
539 // range won't underflow to denormal. The hard part is knowing the
540 // result. We need a range check, the result could be denormal for
541 // 0x1p+126 < den <= 0x1p+127.
542 auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
543 Value *ScaleFactor = Builder.CreateNeg(FrexpExp);
544 Value *Rcp = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMant);
545 return Builder.CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
546}
547
548/// Emit a 2ulp expansion for fdiv by using frexp for input scaling.
549Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
550 Value *RHS,
551 FastMathFlags FMF) const {
552 // If we have have to work around the fract/frexp bug, we're worse off than
553 // using the fdiv.fast expansion. The full safe expansion is faster if we have
554 // fast FMA.
555 if (HasFP32DenormalFlush && ST.hasFractBug() && !ST.hasFastFMAF32() &&
556 (!FMF.noNaNs() || !FMF.noInfs()))
557 return nullptr;
558
559 // We're scaling the LHS to avoid a denormal input, and scale the denominator
560 // to avoid large values underflowing the result.
561 auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS);
562
563 Value *Rcp =
564 Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMantRHS);
565
566 auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS);
567 Value *Mul = Builder.CreateFMul(FrexpMantLHS, Rcp);
568
569 // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the
570 // result.
571 Value *ExpDiff = Builder.CreateSub(FrexpExpLHS, FrexpExpRHS);
572 return Builder.CreateCall(getLdexpF32(), {Mul, ExpDiff});
573}
574
575/// Emit a sqrt that handles denormals and is accurate to 2ulp.
///
/// Inputs that compare below SmallestNormal are scaled up with ldexp by 32
/// before the sqrt, and the result is scaled back down by 16 afterwards; other
/// inputs use a scale of 0 on both sides. \p FMF is not referenced in the
/// visible body.
576Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,
577 Value *Src,
578 FastMathFlags FMF) const {
579 Type *Ty = Src->getType();
580 APFloat SmallestNormal =
 // NOTE(review): the initializer of SmallestNormal is missing from this
 // listing.
582 Value *NeedScale =
583 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
584
585 ConstantInt *Zero = Builder.getInt32(0);
586 Value *InputScaleFactor =
587 Builder.CreateSelect(NeedScale, Builder.getInt32(32), Zero);
588
589 Value *Scaled = Builder.CreateCall(getLdexpF32(), {Src, InputScaleFactor});
590
591 Value *Sqrt = Builder.CreateCall(getSqrtF32(), Scaled);
592
 // sqrt halves the exponent, so an input scale of 32 is undone by -16.
593 Value *OutputScaleFactor =
594 Builder.CreateSelect(NeedScale, Builder.getInt32(-16), Zero);
595 return Builder.CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});
596}
597
598/// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
599static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
600 bool IsNegative) {
601 // bool need_scale = x < 0x1p-126f;
602 // float input_scale = need_scale ? 0x1.0p+24f : 1.0f;
603 // float output_scale = need_scale ? 0x1.0p+12f : 1.0f;
604 // rsq(x * input_scale) * output_scale;
605
606 Type *Ty = Src->getType();
607 APFloat SmallestNormal =
608 APFloat::getSmallestNormalized(Ty->getFltSemantics());
609 Value *NeedScale =
610 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
611 Constant *One = ConstantFP::get(Ty, 1.0);
612 Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
613 Constant *OutputScale =
614 ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);
615
616 Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One);
617
618 Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor);
619 Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput);
620 Value *OutputScaleFactor = Builder.CreateSelect(
621 NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);
622
623 return Builder.CreateFMul(Rsq, OutputScaleFactor);
624}
625
626/// Emit inverse sqrt expansion for f64 with a correction sequence on top of
627/// v_rsq_f64. This should give a 1ulp result.
628Value *AMDGPUCodeGenPrepareImpl::emitRsqF64(IRBuilder<> &Builder, Value *X,
629 FastMathFlags SqrtFMF,
630 FastMathFlags DivFMF,
631 const Instruction *CtxI,
632 bool IsNegative) const {
633 // rsq(x):
634 // double y0 = BUILTIN_AMDGPU_RSQRT_F64(x);
635 // double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0);
636 // return MATH_MAD(y0*e, MATH_MAD(e, 0.375, 0.5), y0);
637 //
638 // -rsq(x):
639 // double y0 = BUILTIN_AMDGPU_RSQRT_F64(x);
640 // double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0);
641 // return MATH_MAD(-y0*e, MATH_MAD(e, 0.375, 0.5), -y0);
642 //
643 // The rsq instruction handles the special cases correctly. We need to check
644 // for the edge case conditions to ensure the special case propagates through
645 // the later instructions.
646
647 Value *Y0 = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, X);
648
649 // Try to elide the edge case check.
650 //
651 // Fast math flags imply:
652 // sqrt ninf => !isinf(x)
653 // fdiv ninf => x != 0, !isinf(x)
654 bool MaybePosInf = !SqrtFMF.noInfs() && !DivFMF.noInfs();
655 bool MaybeZero = !DivFMF.noInfs();
656
657 DenormalMode DenormMode;
658 FPClassTest Interested = fcNone;
659 if (MaybePosInf)
660 Interested = fcPosInf;
661 if (MaybeZero)
662 Interested |= fcZero;
663
664 if (Interested != fcNone) {
665 KnownFPClass KnownSrc = computeKnownFPClass(X, Interested, CtxI);
666 if (KnownSrc.isKnownNeverPosInfinity())
667 MaybePosInf = false;
668
669 DenormMode = F.getDenormalMode(X->getType()->getFltSemantics());
670 if (KnownSrc.isKnownNeverLogicalZero(DenormMode))
671 MaybeZero = false;
672 }
673
674 Value *SpecialOrRsq = X;
675 if (MaybeZero || MaybePosInf) {
676 Value *Cond;
677 if (MaybePosInf && MaybeZero) {
678 if (DenormMode.Input != DenormalMode::DenormalModeKind::Dynamic) {
679 FPClassTest TestMask = fcPosInf | fcZero;
680 if (DenormMode.inputsAreZero())
681 TestMask |= fcSubnormal;
682
683 Cond = Builder.createIsFPClass(X, TestMask);
684 } else {
685 // Avoid using llvm.is.fpclass for dynamic denormal mode, since it
686 // doesn't respect the floating-point environment.
687 Value *IsZero =
688 Builder.CreateFCmpOEQ(X, ConstantFP::getZero(X->getType()));
689 Value *IsInf =
690 Builder.CreateFCmpOEQ(X, ConstantFP::getInfinity(X->getType()));
691 Cond = Builder.CreateOr(IsZero, IsInf);
692 }
693 } else if (MaybeZero) {
694 Cond = Builder.CreateFCmpOEQ(X, ConstantFP::getZero(X->getType()));
695 } else {
696 Cond = Builder.CreateFCmpOEQ(X, ConstantFP::getInfinity(X->getType()));
697 }
698
699 SpecialOrRsq = Builder.CreateSelect(Cond, Y0, X);
700 }
701
702 Value *NegY0 = Builder.CreateFNeg(Y0);
703 Value *NegXY0 = Builder.CreateFMul(SpecialOrRsq, NegY0);
704
705 // Could be fmuladd, but isFMAFasterThanFMulAndFAdd is always true for f64.
706 Value *E = Builder.CreateFMA(NegXY0, Y0, ConstantFP::get(X->getType(), 1.0));
707
708 Value *Y0E = Builder.CreateFMul(E, IsNegative ? NegY0 : Y0);
709
710 Value *EFMA = Builder.CreateFMA(E, ConstantFP::get(X->getType(), 0.375),
711 ConstantFP::get(X->getType(), 0.5));
712
713 return Builder.CreateFMA(Y0E, EFMA, IsNegative ? NegY0 : Y0);
714}
715
716bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(FastMathFlags DivFMF,
717 FastMathFlags SqrtFMF) const {
718 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp for f32 and
719 // f64.
720 return DivFMF.allowContract() && SqrtFMF.allowContract();
721}
722
723Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
724 IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF,
725 const FastMathFlags SqrtFMF, const Instruction *CtxI) const {
726 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
727 assert(DivFMF.allowContract() && SqrtFMF.allowContract());
728
729 // rsq_f16 is accurate to 0.51 ulp.
730 // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
731 // rsq_f64 is never accurate.
732 const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num);
733 if (!CLHS)
734 return nullptr;
735
736 bool IsNegative = false;
737
738 // TODO: Handle other numerator values with arcp.
739 if (CLHS->isExactlyValue(1.0) || (IsNegative = CLHS->isExactlyValue(-1.0))) {
740 // Add in the sqrt flags.
741 IRBuilder<>::FastMathFlagGuard Guard(Builder);
742 Builder.setFastMathFlags(DivFMF | SqrtFMF);
743
744 if (Den->getType()->isFloatTy()) {
745 if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) ||
746 canIgnoreDenormalInput(Den, CtxI)) {
747 Value *Result =
748 Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
749 // -1.0 / sqrt(x) -> fneg(rsq(x))
750 return IsNegative ? Builder.CreateFNeg(Result) : Result;
751 }
752
753 return emitRsqIEEE1ULP(Builder, Den, IsNegative);
754 }
755
756 if (Den->getType()->isDoubleTy())
757 return emitRsqF64(Builder, Den, SqrtFMF, DivFMF, CtxI, IsNegative);
758 }
759
760 return nullptr;
761}
762
763// Optimize fdiv with rcp:
764//
765// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
766// allowed with afn.
767//
768// a/b -> a*rcp(b) when arcp is allowed, and we only need provide ULP 1.0
769Value *
770AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num,
771 Value *Den, FastMathFlags FMF,
772 const Instruction *CtxI) const {
773 // rcp_f16 is accurate to 0.51 ulp.
774 // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
775 // rcp_f64 is never accurate.
776 assert(Den->getType()->isFloatTy());
777
778 if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
779 bool IsNegative = false;
780 if (CLHS->isExactlyValue(1.0) ||
781 (IsNegative = CLHS->isExactlyValue(-1.0))) {
782 Value *Src = Den;
783
784 if (HasFP32DenormalFlush || FMF.approxFunc()) {
785 // -1.0 / x -> 1.0 / fneg(x)
786 if (IsNegative)
787 Src = Builder.CreateFNeg(Src);
788
789 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
790 // the CI documentation has a worst case error of 1 ulp.
791 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK
792 // to use it as long as we aren't trying to use denormals.
793 //
794 // v_rcp_f16 and v_rsq_f16 DO support denormals.
795
796 // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
797 // insert rsq intrinsic here.
798
799 // 1.0 / x -> rcp(x)
800 return Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Src);
801 }
802
803 // TODO: If the input isn't denormal, and we know the input exponent isn't
804 // big enough to introduce a denormal we can avoid the scaling.
805 return emitRcpIEEE1ULP(Builder, Src, IsNegative);
806 }
807 }
808
809 if (FMF.allowReciprocal()) {
810 // x / y -> x * (1.0 / y)
811
812 // TODO: Could avoid denormal scaling and use raw rcp if we knew the output
813 // will never underflow.
814 if (HasFP32DenormalFlush || FMF.approxFunc()) {
815 Value *Recip = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Den);
816 return Builder.CreateFMul(Num, Recip);
817 }
818
819 Value *Recip = emitRcpIEEE1ULP(Builder, Den, false);
820 return Builder.CreateFMul(Num, Recip);
821 }
822
823 return nullptr;
824}
825
826// optimize with fdiv.fast:
827//
828// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
829//
830// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
831//
832// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
833Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
834 IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const {
835 // fdiv.fast can achieve 2.5 ULP accuracy.
836 if (ReqdAccuracy < 2.5f)
837 return nullptr;
838
839 // Only have fdiv.fast for f32.
840 assert(Den->getType()->isFloatTy());
841
842 bool NumIsOne = false;
843 if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
844 if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
845 NumIsOne = true;
846 }
847
848 // fdiv does not support denormals. But 1.0/x is always fine to use it.
849 //
850 // TODO: This works for any value with a specific known exponent range, don't
851 // just limit to constant 1.
852 if (!HasFP32DenormalFlush && !NumIsOne)
853 return nullptr;
854
855 return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {Num, Den});
856}
857
858Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
859 IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
860 FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
861 float ReqdDivAccuracy) const {
862 if (RsqOp) {
863 Value *Rsq =
864 optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);
865 if (Rsq)
866 return Rsq;
867 }
868
869 if (!Num->getType()->isFloatTy())
870 return nullptr;
871
872 Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);
873 if (Rcp)
874 return Rcp;
875
876 // In the basic case fdiv_fast has the same instruction count as the frexp div
877 // expansion. Slightly prefer fdiv_fast since it ends in an fmul that can
878 // potentially be fused into a user. Also, materialization of the constants
879 // can be reused for multiple instances.
880 Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);
881 if (FDivFast)
882 return FDivFast;
883
884 return emitFrexpDiv(Builder, Num, Den, DivFMF);
885}
886
887// Optimizations are performed based on fpmath, fast math flags, and
888// denormals to optimize fdiv with either rcp or fdiv.fast.
889//
890// With rcp:
891// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
892// allowed with afn.
893//
894// a/b -> a*rcp(b) when inaccurate rcp is allowed with afn.
895//
896// With fdiv.fast:
897// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
898//
899// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
900//
901// NOTE: rcp is the preference in cases that both are legal.
902bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
903 if (DisableFDivExpand)
904 return false;
905
906 Type *Ty = FDiv.getType()->getScalarType();
907 const bool IsFloat = Ty->isFloatTy();
908 if (!IsFloat && !Ty->isDoubleTy())
909 return false;
910
911 // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
912 // expansion around them in codegen. f16 is good enough to always use.
913
914 const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
915 const FastMathFlags DivFMF = FPOp->getFastMathFlags();
916 const float ReqdAccuracy = FPOp->getFPAccuracy();
917
918 FastMathFlags SqrtFMF;
919
920 Value *Num = FDiv.getOperand(0);
921 Value *Den = FDiv.getOperand(1);
922
923 Value *RsqOp = nullptr;
924 auto *DenII = dyn_cast<IntrinsicInst>(Den);
925 if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
926 DenII->hasOneUse()) {
927 const auto *SqrtOp = cast<FPMathOperator>(DenII);
928 SqrtFMF = SqrtOp->getFastMathFlags();
929 if (canOptimizeWithRsq(DivFMF, SqrtFMF))
930 RsqOp = SqrtOp->getOperand(0);
931 }
932
933 // rcp path not yet implemented for f64.
934 if (!IsFloat && !RsqOp)
935 return false;
936
937 // Inaccurate rcp is allowed with afn.
938 //
939 // Defer to codegen to handle this.
940 //
941 // TODO: Decide on an interpretation for interactions between afn + arcp +
942 // !fpmath, and make it consistent between here and codegen. For now, defer
943 // expansion of afn to codegen. The current interpretation is so aggressive we
944 // don't need any pre-consideration here when we have better information. A
945 // more conservative interpretation could use handling here.
946 const bool AllowInaccurateRcp = DivFMF.approxFunc();
947 if (!RsqOp && AllowInaccurateRcp)
948 return false;
949
950 // Defer the correct implementations to codegen.
951 if (IsFloat && ReqdAccuracy < 1.0f)
952 return false;
953
954 IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
955 Builder.setFastMathFlags(DivFMF);
956 Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
957
958 SmallVector<Value *, 4> NumVals;
959 SmallVector<Value *, 4> DenVals;
960 SmallVector<Value *, 4> RsqDenVals;
961 extractValues(Builder, NumVals, Num);
962 extractValues(Builder, DenVals, Den);
963
964 if (RsqOp)
965 extractValues(Builder, RsqDenVals, RsqOp);
966
967 SmallVector<Value *, 4> ResultVals(NumVals.size());
968 for (int I = 0, E = NumVals.size(); I != E; ++I) {
969 Value *NumElt = NumVals[I];
970 Value *DenElt = DenVals[I];
971 Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;
972
973 Value *NewElt =
974 visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
975 cast<Instruction>(FPOp), ReqdAccuracy);
976 if (!NewElt) {
977 // Keep the original, but scalarized.
978
979 // This has the unfortunate side effect of sometimes scalarizing when
980 // we're not going to do anything.
981 NewElt = Builder.CreateFDiv(NumElt, DenElt);
982 if (auto *NewEltInst = dyn_cast<Instruction>(NewElt))
983 NewEltInst->copyMetadata(FDiv);
984 }
985
986 ResultVals[I] = NewElt;
987 }
988
989 Value *NewVal = insertValues(Builder, FDiv.getType(), ResultVals);
990
991 if (NewVal) {
992 FDiv.replaceAllUsesWith(NewVal);
993 NewVal->takeName(&FDiv);
994 DeadVals.push_back(&FDiv);
995 }
996
997 return true;
998}
999
1000static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
1001 Value *LHS, Value *RHS) {
1002 Type *I32Ty = Builder.getInt32Ty();
1003 Type *I64Ty = Builder.getInt64Ty();
1004
1005 Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
1006 Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
1007 Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
1008 Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
1009 Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
1010 Hi = Builder.CreateTrunc(Hi, I32Ty);
1011 return std::pair(Lo, Hi);
1012}
1013
1014static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
1015 return getMul64(Builder, LHS, RHS).second;
1016}
1017
1018/// Figure out how many bits are really needed for this division.
1019/// \p MaxDivBits is an optimization hint to bypass the second
1020/// ComputeNumSignBits/computeKnownBits call if the first one is
1021/// insufficient.
1022unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
1023 Value *Den,
1024 unsigned MaxDivBits,
1025 bool IsSigned) const {
1027 Den->getType()->getScalarSizeInBits());
1028 unsigned SSBits = Num->getType()->getScalarSizeInBits();
1029 if (IsSigned) {
1030 unsigned RHSSignBits = ComputeNumSignBits(Den, SQ.DL, SQ.AC, &I, SQ.DT);
1031 // A sign bit needs to be reserved for shrinking.
1032 unsigned DivBits = SSBits - RHSSignBits + 1;
1033 if (DivBits > MaxDivBits)
1034 return SSBits;
1035
1036 unsigned LHSSignBits = ComputeNumSignBits(Num, SQ.DL, SQ.AC, &I);
1037
1038 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1039 DivBits = SSBits - SignBits + 1;
1040 return DivBits;
1041 }
1042
1043 // All bits are used for unsigned division for Num or Den in range
1044 // (SignedMax, UnsignedMax].
1045 KnownBits Known = computeKnownBits(Den, SQ.getWithInstruction(&I));
1046 if (Known.isNegative() || !Known.isNonNegative())
1047 return SSBits;
1048 unsigned RHSSignBits = Known.countMinLeadingZeros();
1049 unsigned DivBits = SSBits - RHSSignBits;
1050 if (DivBits > MaxDivBits)
1051 return SSBits;
1052
1053 Known = computeKnownBits(Num, SQ.getWithInstruction(&I));
1054 if (Known.isNegative() || !Known.isNonNegative())
1055 return SSBits;
1056 unsigned LHSSignBits = Known.countMinLeadingZeros();
1057
1058 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1059 DivBits = SSBits - SignBits;
1060 return DivBits;
1061}
1062
1063// The fractional part of a float is enough to accurately represent up to
1064// a 24-bit signed integer.
1065Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
1066 BinaryOperator &I, Value *Num,
1067 Value *Den, bool IsDiv,
1068 bool IsSigned) const {
1069 unsigned DivBits = getDivNumBits(I, Num, Den, 24, IsSigned);
1070 if (DivBits > 24)
1071 return nullptr;
1072 return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
1073}
1074
1075Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
1076 IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den,
1077 unsigned DivBits, bool IsDiv, bool IsSigned) const {
1078 Type *I32Ty = Builder.getInt32Ty();
1079 Num = Builder.CreateTrunc(Num, I32Ty);
1080 Den = Builder.CreateTrunc(Den, I32Ty);
1081
1082 Type *F32Ty = Builder.getFloatTy();
1083 ConstantInt *One = Builder.getInt32(1);
1084 Value *JQ = One;
1085
1086 if (IsSigned) {
1087 // char|short jq = ia ^ ib;
1088 JQ = Builder.CreateXor(Num, Den);
1089
1090 // jq = jq >> (bitsize - 2)
1091 JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));
1092
1093 // jq = jq | 0x1
1094 JQ = Builder.CreateOr(JQ, One);
1095 }
1096
1097 // int ia = (int)LHS;
1098 Value *IA = Num;
1099
1100 // int ib, (int)RHS;
1101 Value *IB = Den;
1102
1103 // float fa = (float)ia;
1104 Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
1105 : Builder.CreateUIToFP(IA, F32Ty);
1106
1107 // float fb = (float)ib;
1108 Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty)
1109 : Builder.CreateUIToFP(IB,F32Ty);
1110
1111 Value *RCP = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp,
1112 Builder.getFloatTy(), {FB});
1113 Value *FQM = Builder.CreateFMul(FA, RCP);
1114
1115 // fq = trunc(fqm);
1116 Value *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
1117 auto *FQI = dyn_cast<Instruction>(FQ);
1118 if (FQI)
1119 FQI->copyFastMathFlags(Builder.getFastMathFlags());
1120
1121 // float fqneg = -fq;
1122 Value *FQNeg = Builder.CreateFNeg(FQ);
1123
1124 // float fr = mad(fqneg, fb, fa);
1125 auto FMAD = !ST.hasMadMacF32Insts()
1126 ? Intrinsic::fma
1127 : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
1128 Value *FR =
1129 Builder.CreateIntrinsic(FMAD, {FQNeg->getType()}, {FQNeg, FB, FA}, FQI);
1130
1131 // int iq = (int)fq;
1132 Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
1133 : Builder.CreateFPToUI(FQ, I32Ty);
1134
1135 // fr = fabs(fr);
1136 FR = Builder.CreateFAbs(FR, FQI);
1137
1138 // fb = fabs(fb);
1139 FB = Builder.CreateFAbs(FB, FQI);
1140
1141 // int cv = fr >= fb;
1142 Value *CV = Builder.CreateFCmpOGE(FR, FB);
1143
1144 // jq = (cv ? jq : 0);
1145 JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));
1146
1147 // dst = iq + jq;
1148 Value *Div = Builder.CreateAdd(IQ, JQ);
1149
1150 Value *Res = Div;
1151 if (!IsDiv) {
1152 // Rem needs compensation, it's easier to recompute it
1153 Value *Rem = Builder.CreateMul(Div, Den);
1154 Res = Builder.CreateSub(Num, Rem);
1155 }
1156
1157 if (DivBits != 0 && DivBits < 32) {
1158 // Extend in register from the number of bits this divide really is.
1159 if (IsSigned) {
1160 int InRegBits = 32 - DivBits;
1161
1162 Res = Builder.CreateShl(Res, InRegBits);
1163 Res = Builder.CreateAShr(Res, InRegBits);
1164 } else {
1165 ConstantInt *TruncMask
1166 = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
1167 Res = Builder.CreateAnd(Res, TruncMask);
1168 }
1169 }
1170
1171 return Res;
1172}
1173
1174// Try to recognize special cases the DAG will emit special, better expansions
1175// than the general expansion we do here.
1176
1177// TODO: It would be better to just directly handle those optimizations here.
1178bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
1179 Value *Num,
1180 Value *Den) const {
1181 if (Constant *C = dyn_cast<Constant>(Den)) {
1182 // Arbitrary constants get a better expansion as long as a wider mulhi is
1183 // legal.
1184 if (C->getType()->getScalarSizeInBits() <= 32)
1185 return true;
1186
1187 // TODO: Sdiv check for not exact for some reason.
1188
1189 // If there's no wider mulhi, there's only a better expansion for powers of
1190 // two.
1191 // TODO: Should really know for each vector element.
1193 return true;
1194
1195 return false;
1196 }
1197
1198 if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
1199 // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
1200 if (BinOpDen->getOpcode() == Instruction::Shl &&
1201 isa<Constant>(BinOpDen->getOperand(0)) &&
1202 isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), true,
1203 SQ.getWithInstruction(&I))) {
1204 return true;
1205 }
1206 }
1207
1208 return false;
1209}
1210
1211static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL) {
1212 // Check whether the sign can be determined statically.
1213 KnownBits Known = computeKnownBits(V, DL);
1214 if (Known.isNegative())
1215 return Constant::getAllOnesValue(V->getType());
1216 if (Known.isNonNegative())
1217 return Constant::getNullValue(V->getType());
1218 return Builder.CreateAShr(V, Builder.getInt32(31));
1219}
1220
1221Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
1222 BinaryOperator &I, Value *X,
1223 Value *Y) const {
1224 Instruction::BinaryOps Opc = I.getOpcode();
1225 assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
1226 Opc == Instruction::SRem || Opc == Instruction::SDiv);
1227
1228 FastMathFlags FMF;
1229 FMF.setFast();
1230 Builder.setFastMathFlags(FMF);
1231
1232 if (divHasSpecialOptimization(I, X, Y))
1233 return nullptr; // Keep it for later optimization.
1234
1235 bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
1236 bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
1237
1238 Type *Ty = X->getType();
1239 Type *I32Ty = Builder.getInt32Ty();
1240 Type *F32Ty = Builder.getFloatTy();
1241
1242 if (Ty->getScalarSizeInBits() != 32) {
1243 if (IsSigned) {
1244 X = Builder.CreateSExtOrTrunc(X, I32Ty);
1245 Y = Builder.CreateSExtOrTrunc(Y, I32Ty);
1246 } else {
1247 X = Builder.CreateZExtOrTrunc(X, I32Ty);
1248 Y = Builder.CreateZExtOrTrunc(Y, I32Ty);
1249 }
1250 }
1251
1252 if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
1253 return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) :
1254 Builder.CreateZExtOrTrunc(Res, Ty);
1255 }
1256
1257 ConstantInt *Zero = Builder.getInt32(0);
1258 ConstantInt *One = Builder.getInt32(1);
1259
1260 Value *Sign = nullptr;
1261 if (IsSigned) {
1262 Value *SignX = getSign32(X, Builder, DL);
1263 Value *SignY = getSign32(Y, Builder, DL);
1264 // Remainder sign is the same as LHS
1265 Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;
1266
1267 X = Builder.CreateAdd(X, SignX);
1268 Y = Builder.CreateAdd(Y, SignY);
1269
1270 X = Builder.CreateXor(X, SignX);
1271 Y = Builder.CreateXor(Y, SignY);
1272 }
1273
1274 // The algorithm here is based on ideas from "Software Integer Division", Tom
1275 // Rodeheffer, August 2008.
1276 //
1277 // unsigned udiv(unsigned x, unsigned y) {
1278 // // Initial estimate of inv(y). The constant is less than 2^32 to ensure
1279 // // that this is a lower bound on inv(y), even if some of the calculations
1280 // // round up.
1281 // unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
1282 //
1283 // // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
1284 // // Empirically this is guaranteed to give a "two-y" lower bound on
1285 // // inv(y).
1286 // z += umulh(z, -y * z);
1287 //
1288 // // Quotient/remainder estimate.
1289 // unsigned q = umulh(x, z);
1290 // unsigned r = x - q * y;
1291 //
1292 // // Two rounds of quotient/remainder refinement.
1293 // if (r >= y) {
1294 // ++q;
1295 // r -= y;
1296 // }
1297 // if (r >= y) {
1298 // ++q;
1299 // r -= y;
1300 // }
1301 //
1302 // return q;
1303 // }
1304
1305 // Initial estimate of inv(y).
1306 Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
1307 Value *RcpY = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp, F32Ty, {FloatY});
1308 Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast<float>(0x4F7FFFFE));
1309 Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
1310 Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);
1311
1312 // One round of UNR.
1313 Value *NegY = Builder.CreateSub(Zero, Y);
1314 Value *NegYZ = Builder.CreateMul(NegY, Z);
1315 Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ));
1316
1317 // Quotient/remainder estimate.
1318 Value *Q = getMulHu(Builder, X, Z);
1319 Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y));
1320
1321 // First quotient/remainder refinement.
1322 Value *Cond = Builder.CreateICmpUGE(R, Y);
1323 if (IsDiv)
1324 Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1325 R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1326
1327 // Second quotient/remainder refinement.
1328 Cond = Builder.CreateICmpUGE(R, Y);
1329 Value *Res;
1330 if (IsDiv)
1331 Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1332 else
1333 Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1334
1335 if (IsSigned) {
1336 Res = Builder.CreateXor(Res, Sign);
1337 Res = Builder.CreateSub(Res, Sign);
1338 Res = Builder.CreateSExtOrTrunc(Res, Ty);
1339 } else {
1340 Res = Builder.CreateZExtOrTrunc(Res, Ty);
1341 }
1342 return Res;
1343}
1344
1345Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
1346 BinaryOperator &I, Value *Num,
1347 Value *Den) const {
1348 if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
1349 return nullptr; // Keep it for later optimization.
1350
1351 Instruction::BinaryOps Opc = I.getOpcode();
1352
1353 bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
1354 bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
1355
1356 unsigned NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
1357 if (NumDivBits > 32)
1358 return nullptr;
1359
1360 Value *Narrowed = nullptr;
1361 if (NumDivBits <= 24) {
1362 Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
1363 IsDiv, IsSigned);
1364 } else if (NumDivBits <= 32) {
1365 Narrowed = expandDivRem32(Builder, I, Num, Den);
1366 }
1367
1368 if (Narrowed) {
1369 return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
1370 Builder.CreateZExt(Narrowed, Num->getType());
1371 }
1372
1373 return nullptr;
1374}
1375
1376void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
1377 Instruction::BinaryOps Opc = I.getOpcode();
1378 // Do the general expansion.
1379 if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
1381 return;
1382 }
1383
1384 if (Opc == Instruction::URem || Opc == Instruction::SRem) {
1386 return;
1387 }
1388
1389 llvm_unreachable("not a division");
1390}
1391
1392/*
1393This will cause non-byte load in consistency, for example:
1394```
1395 %load = load i1, ptr addrspace(4) %arg, align 4
1396 %zext = zext i1 %load to
1397 i64 %add = add i64 %zext
1398```
1399Instead of creating `s_and_b32 s0, s0, 1`,
1400it will create `s_and_b32 s0, s0, 0xff`.
1401We accept this change since the non-byte load assumes the upper bits
1402within the byte are all 0.
1403*/
1404bool AMDGPUCodeGenPrepareImpl::tryNarrowMathIfNoOverflow(Instruction *I) {
1405 unsigned Opc = I->getOpcode();
1406 Type *OldType = I->getType();
1407
1408 if (Opc != Instruction::Add && Opc != Instruction::Mul)
1409 return false;
1410
1411 unsigned OrigBit = OldType->getScalarSizeInBits();
1412
1413 if (Opc != Instruction::Add && Opc != Instruction::Mul)
1414 llvm_unreachable("Unexpected opcode, only valid for Instruction::Add and "
1415 "Instruction::Mul.");
1416
1417 unsigned MaxBitsNeeded = computeKnownBits(I, DL).countMaxActiveBits();
1418
1419 MaxBitsNeeded = std::max<unsigned>(bit_ceil(MaxBitsNeeded), 8);
1420 Type *NewType = DL.getSmallestLegalIntType(I->getContext(), MaxBitsNeeded);
1421 if (!NewType)
1422 return false;
1423 unsigned NewBit = NewType->getIntegerBitWidth();
1424 if (NewBit >= OrigBit)
1425 return false;
1426 NewType = I->getType()->getWithNewBitWidth(NewBit);
1427
1428 // Old cost
1429 const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
1430 InstructionCost OldCost =
1432 // New cost of new op
1433 InstructionCost NewCost =
1435 // New cost of narrowing 2 operands (use trunc)
1436 int NumOfNonConstOps = 2;
1437 if (isa<Constant>(I->getOperand(0)) || isa<Constant>(I->getOperand(1))) {
1438 // Cannot be both constant, should be propagated
1439 NumOfNonConstOps = 1;
1440 }
1441 NewCost += NumOfNonConstOps * TTI.getCastInstrCost(Instruction::Trunc,
1442 NewType, OldType,
1445 // New cost of zext narrowed result to original type
1446 NewCost +=
1447 TTI.getCastInstrCost(Instruction::ZExt, OldType, NewType,
1449 if (NewCost >= OldCost)
1450 return false;
1451
1452 IRBuilder<> Builder(I);
1453 Value *Trunc0 = Builder.CreateTrunc(I->getOperand(0), NewType);
1454 Value *Trunc1 = Builder.CreateTrunc(I->getOperand(1), NewType);
1455 Value *Arith =
1456 Builder.CreateBinOp((Instruction::BinaryOps)Opc, Trunc0, Trunc1);
1457
1458 Value *Zext = Builder.CreateZExt(Arith, OldType);
1459 I->replaceAllUsesWith(Zext);
1460 DeadVals.push_back(I);
1461 return true;
1462}
1463
1464bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
1465 if (foldBinOpIntoSelect(I))
1466 return true;
1467
1468 if (UseMul24Intrin && replaceMulWithMul24(I))
1469 return true;
1470 if (tryNarrowMathIfNoOverflow(&I))
1471 return true;
1472
1473 bool Changed = false;
1474 Instruction::BinaryOps Opc = I.getOpcode();
1475 Type *Ty = I.getType();
1476 Value *NewDiv = nullptr;
1477 unsigned ScalarSize = Ty->getScalarSizeInBits();
1478
1480
1481 if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
1482 Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
1483 ScalarSize <= 64 &&
1484 !DisableIDivExpand) {
1485 Value *Num = I.getOperand(0);
1486 Value *Den = I.getOperand(1);
1487 IRBuilder<> Builder(&I);
1488 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1489
1490 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1491 NewDiv = PoisonValue::get(VT);
1492
1493 for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
1494 Value *NumEltN = Builder.CreateExtractElement(Num, N);
1495 Value *DenEltN = Builder.CreateExtractElement(Den, N);
1496
1497 Value *NewElt;
1498 if (ScalarSize <= 32) {
1499 NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
1500 if (!NewElt)
1501 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1502 } else {
1503 // See if this 64-bit division can be shrunk to 32/24-bits before
1504 // producing the general expansion.
1505 NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
1506 if (!NewElt) {
1507 // The general 64-bit expansion introduces control flow and doesn't
1508 // return the new value. Just insert a scalar copy and defer
1509 // expanding it.
1510 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1511 // CreateBinOp does constant folding. If the operands are constant,
1512 // it will return a Constant instead of a BinaryOperator.
1513 if (auto *NewEltBO = dyn_cast<BinaryOperator>(NewElt))
1514 Div64ToExpand.push_back(NewEltBO);
1515 }
1516 }
1517
1518 if (auto *NewEltI = dyn_cast<Instruction>(NewElt))
1519 NewEltI->copyIRFlags(&I);
1520
1521 NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
1522 }
1523 } else {
1524 if (ScalarSize <= 32)
1525 NewDiv = expandDivRem32(Builder, I, Num, Den);
1526 else {
1527 NewDiv = shrinkDivRem64(Builder, I, Num, Den);
1528 if (!NewDiv)
1529 Div64ToExpand.push_back(&I);
1530 }
1531 }
1532
1533 if (NewDiv) {
1534 I.replaceAllUsesWith(NewDiv);
1535 DeadVals.push_back(&I);
1536 Changed = true;
1537 }
1538 }
1539
1540 if (ExpandDiv64InIR) {
1541 // TODO: We get much worse code in specially handled constant cases.
1542 for (BinaryOperator *Div : Div64ToExpand) {
1543 expandDivRem64(*Div);
1544 FlowChanged = true;
1545 Changed = true;
1546 }
1547 }
1548
1549 return Changed;
1550}
1551
1552bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
1553 if (!WidenLoads)
1554 return false;
1555
1556 if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
1557 I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1558 canWidenScalarExtLoad(I)) {
1559 IRBuilder<> Builder(&I);
1560 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1561
1562 Type *I32Ty = Builder.getInt32Ty();
1563 LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, I.getPointerOperand());
1564 WidenLoad->copyMetadata(I);
1565
1566 // If we have range metadata, we need to convert the type, and not make
1567 // assumptions about the high bits.
1568 if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
1569 ConstantInt *Lower =
1570 mdconst::extract<ConstantInt>(Range->getOperand(0));
1571
1572 if (Lower->isNullValue()) {
1573 WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
1574 } else {
1575 Metadata *LowAndHigh[] = {
1576 ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
1577 // Don't make assumptions about the high bits.
1578 ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
1579 };
1580
1581 WidenLoad->setMetadata(LLVMContext::MD_range,
1582 MDNode::get(F.getContext(), LowAndHigh));
1583 }
1584 }
1585
1586 int TySize = DL.getTypeSizeInBits(I.getType());
1587 Type *IntNTy = Builder.getIntNTy(TySize);
1588 Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
1589 Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
1590 I.replaceAllUsesWith(ValOrig);
1591 DeadVals.push_back(&I);
1592 return true;
1593 }
1594
1595 return false;
1596}
1597
// Try to recognize select-based implementations of fract() rooted at I and
// replace them with the value built by applyFractPat.
//
// Returns false without changing anything when I is not an FPMathOperator or
// when none of the patterns below match. On success, all uses of I are
// replaced with the new value (which takes I's name), I is queued in DeadVals
// and true is returned.
1598bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
1599 FPMathOperator *FPOp = dyn_cast<FPMathOperator>(&I);
1600 if (!FPOp)
1601 return false;
1602
1603 Value *X;
1604 Value *Fract = nullptr;
1605
1606 // Match:
1607 // (x - floor(x)) >= MIN_CONSTANT ? MIN_CONSTANT : (x - floor(x))
1608 //
1609 // This is the preferred way to implement fract.
1610 // TODO: Could also match with compare against 1.0
1611 const APFloat *C;
// NOTE(review): the `if (...) {` that opens this branch (listing line 1612)
// is missing from this listing. It presumably matches I against the pattern
// described above and binds X and C - confirm against the upstream source.
1613 Value *FractSrc = matchFractPatImpl(*X, *C);
1614 if (!FractSrc)
1615 return false;
1616 IRBuilder<> Builder(&I);
1617 Builder.setFastMathFlags(FPOp->getFastMathFlags());
1618 Fract = applyFractPat(Builder, FractSrc);
1619 } else {
1620 // Match patterns which may appear in legacy implementations of the fract()
1621 // function, built around the nan-avoidant minnum intrinsic. These are the
1622 // core pattern plus additional clamping of inf and nan values on the
1623 // result.
1624 Value *Cond = I.getCondition();
1625 Value *TrueVal = I.getTrueValue();
1626 Value *FalseVal = I.getFalseValue();
1627 Value *CmpVal;
1628 CmpPredicate IsNanPred;
1629
1630 // Match fract pattern with nan check.
1631 if (!match(Cond, m_FCmp(IsNanPred, m_Value(CmpVal), m_NonNaN())))
1632 return false;
1633
1634 IRBuilder<> Builder(&I);
1635 Builder.setFastMathFlags(FPOp->getFastMathFlags());
1636
1637 if (IsNanPred == FCmpInst::FCMP_UNO && TrueVal == CmpVal &&
1638 CmpVal == matchFractPatNanAvoidant(*FalseVal)) {
1639 // isnan(x) ? x : fract(x)
1640 Fract = applyFractPat(Builder, CmpVal);
1641 } else if (IsNanPred == FCmpInst::FCMP_ORD && FalseVal == CmpVal) {
1642 if (CmpVal == matchFractPatNanAvoidant(*TrueVal)) {
1643 // !isnan(x) ? fract(x) : x
1644 Fract = applyFractPat(Builder, CmpVal);
1645 } else {
1646 // Match an intermediate clamp infinity to 0 pattern. i.e.
1647 // !isnan(x) ? (!isinf(x) ? fract(x) : 0.0) : x
1648 CmpPredicate PredInf;
1649 Value *IfNotInf;
1650
1651 if (!match(TrueVal, m_Select(m_FCmp(PredInf, m_FAbs(m_Specific(CmpVal)),
1652 m_PosInf()),
1653 m_Value(IfNotInf), m_PosZeroFP())) ||
1654 PredInf != FCmpInst::FCMP_UNE ||
1655 CmpVal != matchFractPatNanAvoidant(*IfNotInf))
1656 return false;
1657
1658 SelectInst *ClampInfSelect = cast<SelectInst>(TrueVal);
1659
1660 // Insert before the fabs
1661 Value *InsertPt =
1662 cast<Instruction>(ClampInfSelect->getCondition())->getOperand(0);
1663
1664 Builder.SetInsertPoint(cast<Instruction>(InsertPt));
1665 Value *NewFract = applyFractPat(Builder, CmpVal);
1666 NewFract->takeName(TrueVal);
1667
1668 // Thread the new fract into the inf clamping sequence.
// The operand being replaced is queued for deletion before it is overwritten.
1669 DeadVals.push_back(ClampInfSelect->getOperand(1));
1670 ClampInfSelect->setOperand(1, NewFract);
1671
1672 // The outer select nan handling is also absorbed into the fract.
1673 Fract = ClampInfSelect;
1674 }
1675 } else
1676 return false;
1677 }
1678
// Every path that reaches this point has set Fract; swap it in for I.
1679 Fract->takeName(&I);
1680 I.replaceAllUsesWith(Fract);
1681 DeadVals.push_back(&I);
1682 return true;
1683}
1684
1685static bool areInSameBB(const Value *A, const Value *B) {
1686 const auto *IA = dyn_cast<Instruction>(A);
1687 const auto *IB = dyn_cast<Instruction>(B);
1688 return IA && IB && IA->getParent() == IB->getParent();
1689}
1690
1691// Helper for breaking large PHIs that returns true when an extractelement on V
1692// is likely to be folded away by the DAG combiner.
1694 const auto *FVT = dyn_cast<FixedVectorType>(V->getType());
1695 if (!FVT)
1696 return false;
1697
1698 const Value *CurVal = V;
1699
1700 // Check for insertelements, keeping track of the elements covered.
1701 BitVector EltsCovered(FVT->getNumElements());
1702 while (const auto *IE = dyn_cast<InsertElementInst>(CurVal)) {
1703 const auto *Idx = dyn_cast<ConstantInt>(IE->getOperand(2));
1704
1705 // Non constant index/out of bounds index -> folding is unlikely.
1706 // The latter is more of a sanity check because canonical IR should just
1707 // have replaced those with poison.
1708 if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
1709 return false;
1710
1711 const auto *VecSrc = IE->getOperand(0);
1712
1713 // If the vector source is another instruction, it must be in the same basic
1714 // block. Otherwise, the DAGCombiner won't see the whole thing and is
1715 // unlikely to be able to do anything interesting here.
1716 if (isa<Instruction>(VecSrc) && !areInSameBB(VecSrc, IE))
1717 return false;
1718
1719 CurVal = VecSrc;
1720 EltsCovered.set(Idx->getZExtValue());
1721
1722 // All elements covered.
1723 if (EltsCovered.all())
1724 return true;
1725 }
1726
1727 // We either didn't find a single insertelement, or the insertelement chain
1728 // ended before all elements were covered. Check for other interesting values.
1729
1730 // Constants are always interesting because we can just constant fold the
1731 // extractelements.
1732 if (isa<Constant>(CurVal))
1733 return true;
1734
1735 // shufflevector is likely to be profitable if either operand is a constant,
1736 // or if either source is in the same block.
1737 // This is because shufflevector is most often lowered as a series of
1738 // insert/extract elements anyway.
1739 if (const auto *SV = dyn_cast<ShuffleVectorInst>(CurVal)) {
1740 return isa<Constant>(SV->getOperand(1)) ||
1741 areInSameBB(SV, SV->getOperand(0)) ||
1742 areInSameBB(SV, SV->getOperand(1));
1743 }
1744
1745 return false;
1746}
1747
1748static void collectPHINodes(const PHINode &I,
1750 const auto [It, Inserted] = SeenPHIs.insert(&I);
1751 if (!Inserted)
1752 return;
1753
1754 for (const Value *Inc : I.incoming_values()) {
1755 if (const auto *PhiInc = dyn_cast<PHINode>(Inc))
1756 collectPHINodes(*PhiInc, SeenPHIs);
1757 }
1758
1759 for (const User *U : I.users()) {
1760 if (const auto *PhiU = dyn_cast<PHINode>(U))
1761 collectPHINodes(*PhiU, SeenPHIs);
1762 }
1763}
1764
1765bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
1766 // Check in the cache first.
1767 if (const auto It = BreakPhiNodesCache.find(&I);
1768 It != BreakPhiNodesCache.end())
1769 return It->second;
1770
1771 // We consider PHI nodes as part of "chains", so given a PHI node I, we
1772 // recursively consider all its users and incoming values that are also PHI
1773 // nodes. We then make a decision about all of those PHIs at once. Either they
1774 // all get broken up, or none of them do. That way, we avoid cases where a
1775 // single PHI is/is not broken and we end up reforming/exploding a vector
1776 // multiple times, or even worse, doing it in a loop.
1777 SmallPtrSet<const PHINode *, 8> WorkList;
1778 collectPHINodes(I, WorkList);
1779
1780#ifndef NDEBUG
1781 // Check that none of the PHI nodes in the worklist are in the map. If some of
1782 // them are, it means we're not good enough at collecting related PHIs.
1783 for (const PHINode *WLP : WorkList) {
1784 assert(BreakPhiNodesCache.count(WLP) == 0);
1785 }
1786#endif
1787
1788 // To consider a PHI profitable to break, we need to see some interesting
1789 // incoming values. At least 2/3rd (rounded up) of all PHIs in the worklist
1790 // must have one to consider all PHIs breakable.
1791 //
1792 // This threshold has been determined through performance testing.
1793 //
1794 // Note that the computation below is equivalent to
1795 //
1796 // (unsigned)ceil((K / 3.0) * 2)
1797 //
1798 // It's simply written this way to avoid mixing integral/FP arithmetic.
1799 const auto Threshold = (alignTo(WorkList.size() * 2, 3) / 3);
1800 unsigned NumBreakablePHIs = 0;
1801 bool CanBreak = false;
1802 for (const PHINode *Cur : WorkList) {
1803 // Don't break PHIs that have no interesting incoming values. That is, where
1804 // there is no clear opportunity to fold the "extractelement" instructions
1805 // we would add.
1806 //
1807 // Note: IC does not run after this pass, so we're only interested in the
1808 // foldings that the DAG combiner can do.
1809 if (any_of(Cur->incoming_values(), isInterestingPHIIncomingValue)) {
1810 if (++NumBreakablePHIs >= Threshold) {
1811 CanBreak = true;
1812 break;
1813 }
1814 }
1815 }
1816
1817 for (const PHINode *Cur : WorkList)
1818 BreakPhiNodesCache[Cur] = CanBreak;
1819
1820 return CanBreak;
1821}
1822
1823/// Helper class for "break large PHIs" (visitPHINode).
1824///
1825/// This represents a slice of a PHI's incoming value, which is made up of:
1826/// - The type of the slice (Ty)
1827/// - The index in the incoming value's vector where the slice starts (Idx)
1828/// - The number of elements in the slice (NumElts).
1829/// It also keeps track of the NewPHI node inserted for this particular slice.
1830///
1831/// Slice examples:
1832/// <4 x i64> -> Split into four i64 slices.
1833/// -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1]
1834/// <5 x i16> -> Split into 2 <2 x i16> slices + a i16 tail.
1835/// -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]
1837public:
1838 VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
1839 : Ty(Ty), Idx(Idx), NumElts(NumElts) {}
1840
1841 Type *Ty = nullptr;
1842 unsigned Idx = 0;
1843 unsigned NumElts = 0;
1844 PHINode *NewPHI = nullptr;
1845
1846 /// Slice \p Inc according to the information contained within this slice.
1847 /// This is cached, so if called multiple times for the same \p BB & \p Inc
1848 /// pair, it returns the same Sliced value as well.
1849 ///
1850 /// Note this *intentionally* does not return the same value for, say,
1851 /// [%bb.0, %0] & [%bb.1, %0] as:
1852 /// - It could cause issues with dominance (e.g. if bb.1 is seen first, then
1853 /// the value in bb.1 may not be reachable from bb.0 if it's its
1854 /// predecessor.)
1855 /// - We also want to make our extract instructions as local as possible so
1856 /// the DAG has better chances of folding them out. Duplicating them like
1857 /// that is beneficial in that regard.
1858 ///
1859 /// This is both a minor optimization to avoid creating duplicate
1860 /// instructions, but also a requirement for correctness. It is not forbidden
1861 /// for a PHI node to have the same [BB, Val] pair multiple times. If we
1862 /// returned a new value each time, those previously identical pairs would all
1863 /// have different incoming values (from the same block) and it'd cause a "PHI
1864 /// node has multiple entries for the same basic block with different incoming
1865 /// values!" verifier error.
1866 Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) {
1867 Value *&Res = SlicedVals[{BB, Inc}];
1868 if (Res)
1869 return Res;
1870
1872 if (Instruction *IncInst = dyn_cast<Instruction>(Inc))
1873 B.SetCurrentDebugLocation(IncInst->getDebugLoc());
1874
1875 if (NumElts > 1) {
1877 for (unsigned K = Idx; K < (Idx + NumElts); ++K)
1878 Mask.push_back(K);
1879 Res = B.CreateShuffleVector(Inc, Mask, NewValName);
1880 } else
1881 Res = B.CreateExtractElement(Inc, Idx, NewValName);
1882
1883 return Res;
1884 }
1885
1886private:
1888};
1889
1890bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
1891 // Break-up fixed-vector PHIs into smaller pieces.
1892 // Default threshold is 32, so it breaks up any vector that's >32 bits into
1893 // its elements, or into 32-bit pieces (for 8/16 bit elts).
1894 //
1895 // This is only helpful for DAGISel because it doesn't handle large PHIs as
1896 // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
1897 // With large, odd-sized PHIs we may end up needing many `build_vector`
1898 // operations with most elements being "undef". This inhibits a lot of
1899 // optimization opportunities and can result in unreasonably high register
1900 // pressure and the inevitable stack spilling.
1901 if (!BreakLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
1902 return false;
1903
1904 FixedVectorType *FVT = dyn_cast<FixedVectorType>(I.getType());
1905 if (!FVT || FVT->getNumElements() == 1 ||
1906 DL.getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)
1907 return false;
1908
1909 if (!ForceBreakLargePHIs && !canBreakPHINode(I))
1910 return false;
1911
1912 std::vector<VectorSlice> Slices;
1913
1914 Type *EltTy = FVT->getElementType();
1915 {
1916 unsigned Idx = 0;
1917 // For 8/16 bits type, don't scalarize fully but break it up into as many
1918 // 32-bit slices as we can, and scalarize the tail.
1919 const unsigned EltSize = DL.getTypeSizeInBits(EltTy);
1920 const unsigned NumElts = FVT->getNumElements();
1921 if (EltSize == 8 || EltSize == 16) {
1922 const unsigned SubVecSize = (32 / EltSize);
1923 Type *SubVecTy = FixedVectorType::get(EltTy, SubVecSize);
1924 for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End;
1925 Idx += SubVecSize)
1926 Slices.emplace_back(SubVecTy, Idx, SubVecSize);
1927 }
1928
1929 // Scalarize all remaining elements.
1930 for (; Idx < NumElts; ++Idx)
1931 Slices.emplace_back(EltTy, Idx, 1);
1932 }
1933
1934 assert(Slices.size() > 1);
1935
1936 // Create one PHI per vector piece. The "VectorSlice" class takes care of
1937 // creating the necessary instruction to extract the relevant slices of each
1938 // incoming value.
1939 IRBuilder<> B(I.getParent());
1940 B.SetCurrentDebugLocation(I.getDebugLoc());
1941
1942 unsigned IncNameSuffix = 0;
1943 for (VectorSlice &S : Slices) {
1944 // We need to reset the build on each iteration, because getSlicedVal may
1945 // have inserted something into I's BB.
1946 B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
1947 S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());
1948
1949 for (const auto &[Idx, BB] : enumerate(I.blocks())) {
1950 S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx),
1951 "largephi.extractslice" +
1952 std::to_string(IncNameSuffix++)),
1953 BB);
1954 }
1955 }
1956
1957 // And replace this PHI with a vector of all the previous PHI values.
1958 Value *Vec = PoisonValue::get(FVT);
1959 unsigned NameSuffix = 0;
1960 for (VectorSlice &S : Slices) {
1961 const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
1962 if (S.NumElts > 1)
1963 Vec = B.CreateInsertVector(FVT, Vec, S.NewPHI, S.Idx, ValName);
1964 else
1965 Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
1966 }
1967
1968 I.replaceAllUsesWith(Vec);
1969 DeadVals.push_back(&I);
1970 return true;
1971}
1972
1973/// \param V Value to check
1974/// \param DL DataLayout
1975/// \param TM TargetMachine (TODO: remove once DL contains nullptr values)
1976/// \param AS Target Address Space
1977/// \return true if \p V cannot be the null value of \p AS, false otherwise.
1978static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
1979 const AMDGPUTargetMachine &TM, unsigned AS) {
1980 // Pointer cannot be null if it's a block address, GV or alloca.
1981 // NOTE: We don't support extern_weak, but if we did, we'd need to check for
1982 // it as the symbol could be null in such cases.
1984 return true;
1985
1986 // Check nonnull arguments.
1987 if (const auto *Arg = dyn_cast<Argument>(V); Arg && Arg->hasNonNullAttr())
1988 return true;
1989
1990 // Check nonnull loads.
1991 if (const auto *Load = dyn_cast<LoadInst>(V);
1992 Load && Load->hasMetadata(LLVMContext::MD_nonnull))
1993 return true;
1994
1995 // getUnderlyingObject may have looked through another addrspacecast, although
1996 // the optimizable situations most likely folded out by now.
1997 if (AS != cast<PointerType>(V->getType())->getAddressSpace())
1998 return false;
1999
2000 // TODO: Calls that return nonnull?
2001
2002 // For all other things, use KnownBits.
2003 // We either use 0 or all bits set to indicate null, so check whether the
2004 // value can be zero or all ones.
2005 //
2006 // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some
2007 // address spaces have non-zero null values.
2008 auto SrcPtrKB = computeKnownBits(V, DL);
2009 const auto NullVal = AMDGPU::getNullPointerValue(AS);
2010
2011 assert(SrcPtrKB.getBitWidth() == DL.getPointerSizeInBits(AS));
2012 assert((NullVal == 0 || NullVal == -1) &&
2013 "don't know how to check for this null value!");
2014 return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
2015}
2016
2017bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
2018 // Intrinsic doesn't support vectors, also it seems that it's often difficult
2019 // to prove that a vector cannot have any nulls in it so it's unclear if it's
2020 // worth supporting.
2021 if (I.getType()->isVectorTy())
2022 return false;
2023
2024 // Check if this can be lowered to a amdgcn.addrspacecast.nonnull.
2025 // This is only worthwhile for casts from/to priv/local to flat.
2026 const unsigned SrcAS = I.getSrcAddressSpace();
2027 const unsigned DstAS = I.getDestAddressSpace();
2028
2029 bool CanLower = false;
2030 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
2031 CanLower = (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
2032 DstAS == AMDGPUAS::PRIVATE_ADDRESS);
2033 else if (DstAS == AMDGPUAS::FLAT_ADDRESS)
2034 CanLower = (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2035 SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
2036 if (!CanLower)
2037 return false;
2038
2040 getUnderlyingObjects(I.getOperand(0), WorkList);
2041 if (!all_of(WorkList, [&](const Value *V) {
2042 return isPtrKnownNeverNull(V, DL, TM, SrcAS);
2043 }))
2044 return false;
2045
2046 IRBuilder<> B(&I);
2047 auto *Intrin = B.CreateIntrinsic(
2048 I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
2049 I.replaceAllUsesWith(Intrin);
2050 DeadVals.push_back(&I);
2051 return true;
2052}
2053
2054bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
2055 Intrinsic::ID IID = I.getIntrinsicID();
2056 switch (IID) {
2057 case Intrinsic::minnum:
2058 case Intrinsic::minimumnum:
2059 case Intrinsic::minimum:
2060 return visitFMinLike(I);
2061 case Intrinsic::sqrt:
2062 return visitSqrt(I);
2063 case Intrinsic::log:
2064 case Intrinsic::log10:
2065 return visitLog(cast<FPMathOperator>(I), IID);
2066 case Intrinsic::log2:
2067 // No reason to handle log2.
2068 return false;
2069 case Intrinsic::amdgcn_mbcnt_lo:
2070 return visitMbcntLo(I);
2071 case Intrinsic::amdgcn_mbcnt_hi:
2072 return visitMbcntHi(I);
2073 case Intrinsic::vector_reduce_add:
2074 return visitVectorReduceAdd(I);
2075 case Intrinsic::uadd_sat:
2076 case Intrinsic::sadd_sat:
2077 return visitSaturatingAdd(I);
2078 default:
2079 return false;
2080 }
2081}
2082
2083/// Match the core sequence in the fract pattern (x - floor(x), which doesn't
2084/// need to consider edge case handling.
2085Value *AMDGPUCodeGenPrepareImpl::matchFractPatImpl(Value &FractSrc,
2086 const APFloat &C) const {
2087 if (ST.hasFractBug())
2088 return nullptr;
2089
2090 Type *Ty = FractSrc.getType();
2091 if (!isLegalFloatingTy(Ty->getScalarType()))
2092 return nullptr;
2093
2094 APFloat OneNextDown = APFloat::getOne(C.getSemantics());
2095 OneNextDown.next(true);
2096
2097 // Match nextafter(1.0, -1)
2098 if (OneNextDown != C)
2099 return nullptr;
2100
2101 Value *FloorSrc;
2102 if (match(&FractSrc, m_FSub(m_Value(FloorSrc), m_Intrinsic<Intrinsic::floor>(
2103 m_Deferred(FloorSrc)))))
2104 return FloorSrc;
2105 return nullptr;
2106}
2107
/// Match non-nan fract pattern.
///   MIN_CONSTANT = nextafter(1.0, -1.0)
///   minnum(fsub(x, floor(x)), MIN_CONSTANT)
///   minimumnum(fsub(x, floor(x)), MIN_CONSTANT)
///   minimum(fsub(x, floor(x)), MIN_CONSTANT)
///
///   x_sub_floor >= MIN_CONSTANT ? MIN_CONSTANT : x_sub_floor;
///
/// If fract is a useful instruction for the subtarget. Does not account for the
/// nan handling; the instruction has a nan check on the input value.
2118Value *AMDGPUCodeGenPrepareImpl::matchFractPatNanAvoidant(Value &V) {
2119 Value *Arg0;
2120 const APFloat *C;
2121
2122 // The value is only used in contexts where we know the input isn't a nan, so
2123 // any of the fmin variants are fine.
2124 if (!match(&V,
2128 return nullptr;
2129
2130 return matchFractPatImpl(*Arg0, *C);
2131}
2132
2133Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
2134 Value *FractArg) {
2135 SmallVector<Value *, 4> FractVals;
2136 extractValues(Builder, FractVals, FractArg);
2137
2138 SmallVector<Value *, 4> ResultVals(FractVals.size());
2139
2140 Type *Ty = FractArg->getType()->getScalarType();
2141 for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {
2142 ResultVals[I] =
2143 Builder.CreateIntrinsic(Intrinsic::amdgcn_fract, {Ty}, {FractVals[I]});
2144 }
2145
2146 return insertValues(Builder, FractArg->getType(), ResultVals);
2147}
2148
2149bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
2150 const APFloat *C;
2151 Value *FractArg;
2152
2153 // minimum(x - floor(x), MIN_CONSTANT)
2154 Value *X;
2155 if (!ST.hasFractBug() &&
2157 FractArg = matchFractPatImpl(*X, *C);
2158 if (!FractArg)
2159 return false;
2160 } else {
2161 // minnum(x - floor(x), MIN_CONSTANT)
2162 FractArg = matchFractPatNanAvoidant(I);
2163 if (!FractArg)
2164 return false;
2165
2166 // Match pattern for fract intrinsic in contexts where the nan check has
2167 // been optimized out (and hope the knowledge the source can't be nan wasn't
2168 // lost).
2169 if (!I.hasNoNaNs() && !isKnownNeverNaN(FractArg, SQ.getWithInstruction(&I)))
2170 return false;
2171 }
2172
2173 IRBuilder<> Builder(&I);
2174 FastMathFlags FMF = I.getFastMathFlags();
2175 FMF.setNoNaNs();
2176 Builder.setFastMathFlags(FMF);
2177
2178 Value *Fract = applyFractPat(Builder, FractArg);
2179 Fract->takeName(&I);
2180 I.replaceAllUsesWith(Fract);
2181 DeadVals.push_back(&I);
2182 return true;
2183}
2184
2185// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
2186bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
2187 Type *Ty = Sqrt.getType()->getScalarType();
2188 if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST.has16BitInsts()))
2189 return false;
2190
2191 const FPMathOperator *FPOp = cast<const FPMathOperator>(&Sqrt);
2192 FastMathFlags SqrtFMF = FPOp->getFastMathFlags();
2193
2194 // We're trying to handle the fast-but-not-that-fast case only. The lowering
2195 // of fast llvm.sqrt will give the raw instruction anyway.
2196 if (SqrtFMF.approxFunc())
2197 return false;
2198
2199 const float ReqdAccuracy = FPOp->getFPAccuracy();
2200
2201 // Defer correctly rounded expansion to codegen.
2202 if (ReqdAccuracy < 1.0f)
2203 return false;
2204
2205 Value *SrcVal = Sqrt.getOperand(0);
2206 bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);
2207
2208 // The raw instruction is 1 ulp, but the correction for denormal handling
2209 // brings it to 2.
2210 if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
2211 return false;
2212
2213 IRBuilder<> Builder(&Sqrt);
2214 SmallVector<Value *, 4> SrcVals;
2215 extractValues(Builder, SrcVals, SrcVal);
2216
2217 SmallVector<Value *, 4> ResultVals(SrcVals.size());
2218 for (int I = 0, E = SrcVals.size(); I != E; ++I) {
2219 if (CanTreatAsDAZ)
2220 ResultVals[I] = Builder.CreateCall(getSqrtF32(), SrcVals[I]);
2221 else
2222 ResultVals[I] = emitSqrtIEEE2ULP(Builder, SrcVals[I], SqrtFMF);
2223 }
2224
2225 Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals);
2226 NewSqrt->takeName(&Sqrt);
2227 Sqrt.replaceAllUsesWith(NewSqrt);
2228 DeadVals.push_back(&Sqrt);
2229 return true;
2230}
2231
2232/// Replace log and log10 intrinsic calls based on fpmath metadata.
2233bool AMDGPUCodeGenPrepareImpl::visitLog(FPMathOperator &Log,
2234 Intrinsic::ID IID) {
2235 Type *Ty = Log.getType();
2236 if (!Ty->getScalarType()->isHalfTy() || !ST.has16BitInsts())
2237 return false;
2238
2239 FastMathFlags FMF = Log.getFastMathFlags();
2240
2241 // Defer fast math cases to codegen.
2242 if (FMF.approxFunc())
2243 return false;
2244
2245 // Limit experimentally determined from OpenCL conformance test (1.79)
2246 if (Log.getFPAccuracy() < 1.80f)
2247 return false;
2248
2249 IRBuilder<> Builder(&cast<CallInst>(Log));
2250
2251 // Use the generic intrinsic for convenience in the vector case. Codegen will
2252 // recognize the denormal handling is not necessary from the fpext.
2253 // TODO: Move to generic code
2254 Value *Log2 =
2255 Builder.CreateUnaryIntrinsic(Intrinsic::log2, Log.getOperand(0), FMF);
2256
2257 double Log2BaseInverted =
2258 IID == Intrinsic::log10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
2259 Value *Mul =
2260 Builder.CreateFMulFMF(Log2, ConstantFP::get(Ty, Log2BaseInverted), FMF);
2261
2262 Mul->takeName(&Log);
2263
2264 Log.replaceAllUsesWith(Mul);
2265 DeadVals.push_back(&Log);
2266 return true;
2267}
2268
2269bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
2270 if (skipFunction(F))
2271 return false;
2272
2273 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
2274 if (!TPC)
2275 return false;
2276
2277 const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
2278 const TargetLibraryInfo *TLI =
2279 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
2280 AssumptionCache *AC =
2281 &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2282 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
2283 const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
2284 const UniformityInfo &UA =
2285 getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
2286 return AMDGPUCodeGenPrepareImpl(F, TM, TLI, AC, DT, UA).run();
2287}
2288
2291 const AMDGPUTargetMachine &ATM = static_cast<const AMDGPUTargetMachine &>(TM);
2292 const TargetLibraryInfo *TLI = &FAM.getResult<TargetLibraryAnalysis>(F);
2293 AssumptionCache *AC = &FAM.getResult<AssumptionAnalysis>(F);
2294 const DominatorTree *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
2295 const UniformityInfo &UA = FAM.getResult<UniformityInfoAnalysis>(F);
2296 AMDGPUCodeGenPrepareImpl Impl(F, ATM, TLI, AC, DT, UA);
2297 if (!Impl.run())
2298 return PreservedAnalyses::all();
2300 if (!Impl.FlowChanged)
2302 return PA;
2303}
2304
2305INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
2306 "AMDGPU IR optimizations", false, false)
2310INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
2312
2313/// Create a workitem.id.x intrinsic call with range metadata.
2314CallInst *AMDGPUCodeGenPrepareImpl::createWorkitemIdX(IRBuilder<> &B) const {
2315 CallInst *Tid = B.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {});
2316 ST.makeLIDRangeMetadata(Tid);
2317 return Tid;
2318}
2319
2320/// Replace the instruction with a direct workitem.id.x call.
2321void AMDGPUCodeGenPrepareImpl::replaceWithWorkitemIdX(Instruction &I) const {
2322 IRBuilder<> B(&I);
2323 CallInst *Tid = createWorkitemIdX(B);
2325 ReplaceInstWithValue(BI, Tid);
2326}
2327
2328/// Replace the instruction with (workitem.id.x & mask).
2329void AMDGPUCodeGenPrepareImpl::replaceWithMaskedWorkitemIdX(
2330 Instruction &I, unsigned WaveSize) const {
2331 IRBuilder<> B(&I);
2332 CallInst *Tid = createWorkitemIdX(B);
2333 Constant *Mask = ConstantInt::get(Tid->getType(), WaveSize - 1);
2334 Value *AndInst = B.CreateAnd(Tid, Mask);
2336 ReplaceInstWithValue(BI, AndInst);
2337}
2338
2339/// Try to optimize mbcnt instruction by replacing with workitem.id.x when
2340/// work group size allows direct computation of lane ID.
2341/// Returns true if optimization was applied, false otherwise.
2342bool AMDGPUCodeGenPrepareImpl::tryReplaceWithWorkitemId(Instruction &I,
2343 unsigned Wave) const {
2344 std::optional<unsigned> MaybeX = ST.getReqdWorkGroupSize(F, 0);
2345 if (!MaybeX)
2346 return false;
2347
2348 // When work group size == wave_size, each work group contains exactly one
2349 // wave, so the instruction can be replaced with workitem.id.x directly.
2350 if (*MaybeX == Wave) {
2351 replaceWithWorkitemIdX(I);
2352 return true;
2353 }
2354
2355 // When work group evenly splits into waves, compute lane ID within wave
2356 // using bit masking: lane_id = workitem.id.x & (wave_size - 1).
2357 if (ST.hasWavefrontsEvenlySplittingXDim(F, /*RequiresUniformYZ=*/true)) {
2358 replaceWithMaskedWorkitemIdX(I, Wave);
2359 return true;
2360 }
2361
2362 return false;
2363}
2364
2365/// Optimize mbcnt.lo calls on wave32 architectures for lane ID computation.
2366bool AMDGPUCodeGenPrepareImpl::visitMbcntLo(IntrinsicInst &I) const {
2367 // This optimization only applies to wave32 targets where mbcnt.lo operates on
2368 // the full execution mask.
2369 if (!ST.isWave32())
2370 return false;
2371
2372 // Only optimize the pattern mbcnt.lo(~0, 0) which counts active lanes with
2373 // lower IDs.
2374 if (!match(&I,
2376 return false;
2377
2378 return tryReplaceWithWorkitemId(I, ST.getWavefrontSize());
2379}
2380
2381/// Optimize mbcnt.hi calls for lane ID computation.
2382bool AMDGPUCodeGenPrepareImpl::visitMbcntHi(IntrinsicInst &I) const {
2383 // Abort if wave size is not known at compile time.
2384 if (!ST.isWaveSizeKnown())
2385 return false;
2386
2387 unsigned Wave = ST.getWavefrontSize();
2388
2389 // On wave32, the upper 32 bits of execution mask are always 0, so
2390 // mbcnt.hi(mask, val) always returns val unchanged.
2391 if (ST.isWave32()) {
2392 if (auto MaybeX = ST.getReqdWorkGroupSize(F, 0)) {
2393 // Replace mbcnt.hi(mask, val) with val only when work group size matches
2394 // wave size (single wave per work group).
2395 if (*MaybeX == Wave) {
2397 ReplaceInstWithValue(BI, I.getArgOperand(1));
2398 return true;
2399 }
2400 }
2401 }
2402
2403 // Optimize the complete lane ID computation pattern:
2404 // mbcnt.hi(~0, mbcnt.lo(~0, 0)) which counts all active lanes with lower IDs
2405 // across the full execution mask.
2406 using namespace PatternMatch;
2407
2408 // Check for pattern: mbcnt.hi(~0, mbcnt.lo(~0, 0))
2411 m_AllOnes(), m_Zero()))))
2412 return false;
2413
2414 return tryReplaceWithWorkitemId(I, Wave);
2415}
2416
2417/// Check if type is <4 x i8>.
2418static bool isV4I8(Type *Ty) {
2420 return VTy && VTy->getNumElements() == 4 &&
2421 VTy->getElementType()->isIntegerTy(8);
2422}
2423
2424/// Helper to match the dot4 pattern: mul(zext/sext <4 x i8>, zext/sext <4 x
2425/// i8>) Returns true if pattern matches and signedness matches IsSigned.
2426/// Sets A, B to the <4 x i8> sources.
2427static bool matchDot4Pattern(Value *MulOp, Value *&A, Value *&B,
2428 bool IsSigned) {
2429 Value *Src0, *Src1;
2430 if (!match(MulOp, m_Mul(m_Value(Src0), m_Value(Src1))))
2431 return false;
2432
2433 // Check that result type is <4 x i32>
2435 if (!MulTy || MulTy->getNumElements() != 4 ||
2436 !MulTy->getElementType()->isIntegerTy(32))
2437 return false;
2438
2439 // Match zext or sext based on IsSigned
2440 Value *ExtSrc0, *ExtSrc1;
2441 if (IsSigned) {
2442 if (!match(Src0, m_SExt(m_Value(ExtSrc0))) || !isV4I8(ExtSrc0->getType()))
2443 return false;
2444 if (!match(Src1, m_SExt(m_Value(ExtSrc1))) || !isV4I8(ExtSrc1->getType()))
2445 return false;
2446 } else {
2447 if (!match(Src0, m_ZExt(m_Value(ExtSrc0))) || !isV4I8(ExtSrc0->getType()))
2448 return false;
2449 if (!match(Src1, m_ZExt(m_Value(ExtSrc1))) || !isV4I8(ExtSrc1->getType()))
2450 return false;
2451 }
2452
2453 A = ExtSrc0;
2454 B = ExtSrc1;
2455 return true;
2456}
2457
2458/// Try to convert vector.reduce.add(mul(zext/sext <4 x i8>, zext/sext <4 x
2459/// i8>)) to a dot4 intrinsic call (non-saturating case only).
2460bool AMDGPUCodeGenPrepareImpl::visitVectorReduceAdd(IntrinsicInst &I) {
2461 // Check if we have dot4 instructions available
2462 if (!ST.hasDot7Insts() || (!ST.hasDot1Insts() && !ST.hasDot8Insts()))
2463 return false;
2464
2465 Value *A = nullptr, *B = nullptr;
2466
2467 // Try unsigned first, then signed
2468 bool IsSigned = false;
2469 if (!matchDot4Pattern(I.getArgOperand(0), A, B, /*IsSigned=*/false)) {
2470 if (!matchDot4Pattern(I.getArgOperand(0), A, B, /*IsSigned=*/true))
2471 return false;
2472 IsSigned = true;
2473 }
2474
2475 LLVMContext &Ctx = I.getContext();
2476 Type *I32Ty = Type::getInt32Ty(Ctx);
2477 IRBuilder<> Builder(&I);
2478
2479 // Bitcast <4 x i8> to i32
2480 Value *ASrc = Builder.CreateBitCast(A, I32Ty);
2481 Value *BSrc = Builder.CreateBitCast(B, I32Ty);
2482
2483 // Non-saturating case: accumulator is 0, clamp is false
2484 Value *Acc = ConstantInt::get(I32Ty, 0);
2485 Value *Clamp = ConstantInt::getFalse(Ctx);
2486
2487 Intrinsic::ID DotIID =
2488 IsSigned ? Intrinsic::amdgcn_sdot4 : Intrinsic::amdgcn_udot4;
2489
2490 Value *Dot = Builder.CreateIntrinsic(DotIID, {}, {ASrc, BSrc, Acc, Clamp});
2491 Dot->takeName(&I);
2492
2493 I.replaceAllUsesWith(Dot);
2494 DeadVals.push_back(&I);
2495
2496 return true;
2497}
2498
2499/// Try to convert uadd.sat/sadd.sat(vector.reduce.add(mul(...)), c) to a
2500/// saturating dot4 intrinsic. This combine starts at the root (saturating add)
2501/// and looks at its operands.
2502bool AMDGPUCodeGenPrepareImpl::visitSaturatingAdd(IntrinsicInst &I) {
2503 // Check if we have dot4 instructions available
2504 if (!ST.hasDot7Insts() || (!ST.hasDot1Insts() && !ST.hasDot8Insts()))
2505 return false;
2506
2507 Intrinsic::ID IID = I.getIntrinsicID();
2508 bool IsSigned = (IID == Intrinsic::sadd_sat);
2509
2510 // Look for vector.reduce.add as one of the operands (commutative match)
2511 Value *Op0 = I.getArgOperand(0);
2512 Value *Op1 = I.getArgOperand(1);
2513 Value *MulOp = nullptr;
2514 Value *Accum = nullptr;
2515 IntrinsicInst *ReduceInst = nullptr;
2516
2518 ReduceInst = cast<IntrinsicInst>(Op0);
2519 Accum = Op1;
2520 } else if (match(Op1,
2522 ReduceInst = cast<IntrinsicInst>(Op1);
2523 Accum = Op0;
2524 } else {
2525 return false;
2526 }
2527
2528 Value *A = nullptr, *B = nullptr;
2529
2530 if (!matchDot4Pattern(MulOp, A, B, IsSigned))
2531 return false;
2532
2533 LLVMContext &Ctx = I.getContext();
2534 Type *I32Ty = Type::getInt32Ty(Ctx);
2535 IRBuilder<> Builder(&I);
2536
2537 // Bitcast <4 x i8> to i32
2538 Value *ASrc = Builder.CreateBitCast(A, I32Ty);
2539 Value *BSrc = Builder.CreateBitCast(B, I32Ty);
2540
2541 // Saturating case: use the accumulator and set clamp to true
2542 Value *Clamp = ConstantInt::getTrue(Ctx);
2543
2544 Intrinsic::ID DotIID =
2545 IsSigned ? Intrinsic::amdgcn_sdot4 : Intrinsic::amdgcn_udot4;
2546
2547 Value *Dot = Builder.CreateIntrinsic(DotIID, {}, {ASrc, BSrc, Accum, Clamp});
2548 Dot->takeName(&I);
2549
2550 I.replaceAllUsesWith(Dot);
2551 DeadVals.push_back(&I);
2552 // The reduce.add will be dead after this and cleaned up later
2553 if (ReduceInst->use_empty())
2554 DeadVals.push_back(ReduceInst);
2555
2556 return true;
2557}
2558
2559char AMDGPUCodeGenPrepare::ID = 0;
2560
2562 return new AMDGPUCodeGenPrepare();
2563}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static Value * insertValues(IRBuilder<> &Builder, Type *Ty, SmallVectorImpl< Value * > &Values)
static void extractValues(IRBuilder<> &Builder, SmallVectorImpl< Value * > &Values, Value *V)
static Value * getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS)
static bool isInterestingPHIIncomingValue(const Value *V)
static SelectInst * findSelectThroughCast(Value *V, CastInst *&Cast)
static bool matchDot4Pattern(Value *MulOp, Value *&A, Value *&B, bool IsSigned)
Helper to match the dot4 pattern: mul(zext/sext <4 x i8>, zext/sext <4 x i8>) Returns true if pattern...
static bool isV4I8(Type *Ty)
Check if type is <4 x i8>.
static std::pair< Value *, Value * > getMul64(IRBuilder<> &Builder, Value *LHS, Value *RHS)
static Value * emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src, bool IsNegative)
Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
static Value * getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL)
static void collectPHINodes(const PHINode &I, SmallPtrSet< const PHINode *, 8 > &SeenPHIs)
static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL, const AMDGPUTargetMachine &TM, unsigned AS)
static bool areInSameBB(const Value *A, const Value *B)
static cl::opt< bool > WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads", cl::desc("Widen sub-dword constant address space loads in " "AMDGPULateCodeGenPrepare"), cl::ReallyHidden, cl::init(true))
The AMDGPU TargetMachine interface definition for hw codegen targets.
@ Scaled
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
dxil translate DXIL Translate Metadata
static bool runOnFunction(Function &F, bool PostInlining)
#define DEBUG_TYPE
static Value * getOpcode(Value &V, Type &Ty, InstrumentationConfig &IConf, InstrumentorIRBuilderTy &IIRB)
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define T
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
const SmallVectorImpl< MachineOperand > & Cond
static void visit(BasicBlock &Start, std::function< bool(BasicBlock *)> op)
This file implements a set that has insertion order iteration characteristics.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static cl::opt< cl::boolOrDefault > EnableGlobalISelOption("global-isel", cl::Hidden, cl::desc("Enable the \"global\" instruction selector"))
Target-Independent Code Generator Pass Configuration Options pass.
This pass exposes codegen information to IR-level passes.
LLVM IR instance of the generic uniformity analysis.
Value * RHS
Value * LHS
BinaryOperator * Mul
VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
Value * getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName)
Slice Inc according to the information contained within this slice.
PreservedAnalyses run(Function &, FunctionAnalysisManager &)
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
unsigned getWavefrontSize() const
static APFloat getOne(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative One.
Definition APFloat.h:1147
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1217
opStatus next(bool nextDown)
Definition APFloat.h:1313
This class represents a conversion between pointers from one address space to another.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesAll()
Set by analyses that do not transform their input at all.
A function analysis which provides an AssumptionCache.
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
BinaryOps getOpcode() const
Definition InstrTypes.h:409
BitVector & set()
Set all bits in the bitvector.
Definition BitVector.h:366
bool all() const
Returns true if all bits are set.
Definition BitVector.h:194
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition InstrTypes.h:512
Instruction::CastOps getOpcode() const
Return the opcode of this CastInst.
Definition InstrTypes.h:674
TargetTransformInfo getTargetTransformInfo(const Function &F) const override
Get a TargetTransformInfo implementation for the target.
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:537
static LLVM_ABI ConstantFP * getZero(Type *Ty, bool Negative=false)
LLVM_ABI bool isExactlyValue(const APFloat &V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
static LLVM_ABI ConstantFP * getInfinity(Type *Ty, bool Negative=false)
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
Analysis pass which computes a DominatorTree.
Definition Dominators.h:278
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:202
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition Operator.h:291
LLVM_ABI float getFPAccuracy() const
Get the maximum error permitted by this operation in ULPs.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
void setFast(bool B=true)
Definition FMF.h:96
bool noInfs() const
Definition FMF.h:66
bool allowReciprocal() const
Definition FMF.h:68
bool approxFunc() const
Definition FMF.h:70
void setNoNaNs(bool B=true)
Definition FMF.h:78
bool noNaNs() const
Definition FMF.h:65
bool allowContract() const
Definition FMF.h:69
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:869
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
bool isWave32() const
bool isWaveSizeKnown() const
Returns whether the wavesize of this subtarget is reliably known.
bool hasFractBug() const
bool isUniformAtDef(ConstValueRefT V) const
Whether V is uniform/non-divergent at its definition.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2627
Value * CreateFDiv(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1715
Value * CreateSIToFP(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2193
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2615
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:599
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition IRBuilder.h:2138
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2674
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={})
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFPToUI(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2166
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2132
void SetCurrentDebugLocation(const DebugLoc &L)
Set location information used by debugging information.
Definition IRBuilder.h:247
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:586
Value * CreateUIToFP(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2180
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:352
Value * CreateFCmpOLT(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2429
Value * CreateFAbs(Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create call to the fabs intrinsic.
Definition IRBuilder.h:1048
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
Definition IRBuilder.h:1842
LLVM_ABI Value * createIsFPClass(Value *FPNum, unsigned Test)
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:529
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1461
Value * CreateFMA(Value *Factor1, Value *Factor2, Value *Summand, FMFSource FMFSource={}, const Twine &Name="")
Create call to the fma intrinsic.
Definition IRBuilder.h:1115
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2242
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1918
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1533
FastMathFlags getFastMathFlags() const
Get the flags to be applied to created floating point ops.
Definition IRBuilder.h:341
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2120
Value * CreateFCmpOEQ(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2414
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:1592
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1444
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition IRBuilder.h:614
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2553
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition IRBuilder.h:2106
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1753
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2386
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1573
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:1644
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1696
Value * CreateFNeg(Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1851
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition IRBuilder.h:1614
Value * CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a SExt or Trunc from the integer value V to DestTy.
Definition IRBuilder.h:2153
Value * CreateFMulFMF(Value *L, Value *R, FMFSource FMFSource, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1701
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1478
LLVM_ABI Value * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *Op, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateFCmpOGE(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2424
Value * CreateFPToSI(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2173
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2858
Base class for instruction visitors.
Definition InstVisitor.h:78
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition Analysis.h:115
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
This class represents the LLVM 'select' instruction.
const Value * getFalseValue() const
const Value * getCondition() const
const Value * getTrueValue() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
const STC & getSubtarget(const Function &F) const
This method returns a reference to the specified type of TargetSubtargetInfo.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:310
LLVM_ABI unsigned getIntegerBitWidth() const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:155
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:158
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:106
Analysis pass which computes UniformityInfo.
Legacy analysis pass which computes a CycleInfo.
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:552
bool use_empty() const
Definition Value.h:346
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:399
Type * getElementType() const
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ PRIVATE_ADDRESS
Address space for private memory.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:522
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
MaxMin_match< FCmpInst, LHS, RHS, ufmin_pred_ty > m_UnordFMin(const LHS &L, const RHS &R)
Match an 'unordered' floating point minimum function.
CmpClass_match< LHS, RHS, FCmpInst > m_FCmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< typename m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty, typename m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty > m_FMinNum_or_FMinimumNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FSub > m_FSub(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
match_deferred< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
ap_match< APFloat > m_APFloatAllowPoison(const APFloat *&Res)
Match APFloat while allowing poison in splat vector constants.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
auto m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
cstfp_pred_ty< is_nonnan > m_NonNaN()
Match a non-NaN FP constant.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
cstfp_pred_ty< is_signed_inf< false > > m_PosInf()
Match a positive infinity FP constant.
cstfp_pred_ty< is_pos_zero_fp > m_PosZeroFP()
Match a floating-point positive zero.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract(Y &&MD)
Extract a Value from Metadata.
Definition Metadata.h:668
constexpr double ln2
constexpr double ln10
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
LLVM_ABI KnownFPClass computeKnownFPClass(const Value *V, const APInt &DemandedElts, FPClassTest InterestedClasses, const SimplifyQuery &SQ, unsigned Depth=0)
Determine which floating-point classes are valid for V, and return them in KnownFPClass bit sets.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:535
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI bool expandRemainderUpTo64Bits(BinaryOperator *Rem)
Generate code to calculate the remainder of two integers, replacing Rem with the generated code.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew mod Align.
Definition MathExtras.h:546
LLVM_ABI void ReplaceInstWithValue(BasicBlock::iterator &BI, Value *V)
Replace all uses of an instruction (specified by BI) with a value, then remove and delete the origina...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:362
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:403
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
LLVM_ABI bool expandDivisionUpTo64Bits(BinaryOperator *Div)
Generate code to divide two integers, replacing Div with the generated code.
FPClassTest
Floating-point class tests, supported by 'is_fpclass' intrinsic.
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
LLVM_ABI Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI Constant * ConstantFoldBinaryOpOperands(unsigned Opcode, Constant *LHS, Constant *RHS, const DataLayout &DL)
Attempt to constant fold a binary operation with the specified operands.
TargetTransformInfo TTI
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
FunctionPass * createAMDGPUCodeGenPreparePass()
To bit_cast(const From &from) noexcept
Definition bit.h:90
DWARFExpression::Operation Op
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI bool isKnownNeverNaN(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if the floating-point scalar value is not a NaN or if the floating-point vector value has...
LLVM_ABI unsigned ComputeMaxSignificantBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Get the upper bound on bit size for this Value Op as a signed integer.
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
LLVM_ABI bool isKnownToBeAPowerOfTwo(const Value *V, const DataLayout &DL, bool OrZero=false, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return true if the given value is known to have exactly one bit set when defined.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, const LoopInfo *LI=nullptr, unsigned MaxLookup=MaxLookupSearchDepth)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
LLVM_ABI CGPassBuilderOption getCGPassBuilderOption()
#define N
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
constexpr bool inputsAreZero() const
Return true if input denormals must be implicitly treated as 0.
static constexpr DenormalMode getPreserveSign()
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition KnownBits.h:106
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:262
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:103
bool isKnownNeverSubnormal() const
Return true if it's known this can never be a subnormal.
LLVM_ABI bool isKnownNeverLogicalZero(DenormalMode Mode) const
Return true if it's known this can never be interpreted as a zero.
bool isKnownNeverPosInfinity() const
Return true if it's known this can never be +infinity.
const DataLayout & DL
const DominatorTree * DT
SimplifyQuery getWithInstruction(const Instruction *I) const
AssumptionCache * AC