1//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This pass does misc. AMDGPU optimizations on IR before instruction
11/// selection.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPU.h"
16#include "AMDGPUTargetMachine.h"
17#include "SIModeRegisterDefaults.h"
18#include "llvm/Analysis/AssumptionCache.h"
19#include "llvm/Analysis/ConstantFolding.h"
20#include "llvm/Analysis/UniformityAnalysis.h"
21#include "llvm/Analysis/ValueTracking.h"
22#include "llvm/CodeGen/TargetPassConfig.h"
23#include "llvm/IR/Dominators.h"
24#include "llvm/IR/IRBuilder.h"
25#include "llvm/IR/InstVisitor.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
27#include "llvm/InitializePasses.h"
28#include "llvm/Pass.h"
29#include "llvm/Support/KnownBits.h"
30#include "llvm/Transforms/Utils/IntegerDivision.h"
31
32#define DEBUG_TYPE "amdgpu-codegenprepare"
33
34using namespace llvm;
35
36namespace {
37
38static cl::opt<bool> WidenLoads(
 39 "amdgpu-codegenprepare-widen-constant-loads",
 40 cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
 41 cl::ReallyHidden,
 42 cl::init(false));
43
44static cl::opt<bool> Widen16BitOps(
45 "amdgpu-codegenprepare-widen-16-bit-ops",
 46 cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
 47 cl::ReallyHidden,
 48 cl::init(true));
49
50static cl::opt<bool> UseMul24Intrin(
51 "amdgpu-codegenprepare-mul24",
 52 cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
 53 cl::ReallyHidden,
 54 cl::init(true));
55
56// Legalize 64-bit division by using the generic IR expansion.
57static cl::opt<bool> ExpandDiv64InIR(
58 "amdgpu-codegenprepare-expand-div64",
 59 cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
 60 cl::ReallyHidden,
 61 cl::init(false));
62
63// Leave all division operations as they are. This supersedes ExpandDiv64InIR
64// and is used for testing the legalizer.
65static cl::opt<bool> DisableIDivExpand(
66 "amdgpu-codegenprepare-disable-idiv-expansion",
 67 cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
 68 cl::ReallyHidden,
 69 cl::init(false));
70
71class AMDGPUCodeGenPrepare : public FunctionPass,
72 public InstVisitor<AMDGPUCodeGenPrepare, bool> {
73 const GCNSubtarget *ST = nullptr;
74 AssumptionCache *AC = nullptr;
75 DominatorTree *DT = nullptr;
76 UniformityInfo *UA = nullptr;
77 Module *Mod = nullptr;
78 const DataLayout *DL = nullptr;
79 bool HasUnsafeFPMath = false;
80 bool HasFP32Denormals = false;
81
 86 /// \returns \p T's base element bit width.
87 unsigned getBaseElementBitWidth(const Type *T) const;
88
89 /// \returns Equivalent 32 bit integer type for given type \p T. For example,
90 /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
91 /// is returned.
92 Type *getI32Ty(IRBuilder<> &B, const Type *T) const;
93
94 /// \returns True if binary operation \p I is a signed binary operation, false
95 /// otherwise.
96 bool isSigned(const BinaryOperator &I) const;
97
98 /// \returns True if the condition of 'select' operation \p I comes from a
99 /// signed 'icmp' operation, false otherwise.
100 bool isSigned(const SelectInst &I) const;
101
102 /// \returns True if type \p T needs to be promoted to 32 bit integer type,
103 /// false otherwise.
104 bool needsPromotionToI32(const Type *T) const;
105
106 /// Promotes uniform binary operation \p I to equivalent 32 bit binary
107 /// operation.
108 ///
109 /// \details \p I's base element bit width must be greater than 1 and less
 110 /// than or equal to 16. Promotion is done by sign or zero extending operands to
111 /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
112 /// truncating the result of 32 bit binary operation back to \p I's original
113 /// type. Division operation is not promoted.
114 ///
115 /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
116 /// false otherwise.
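 /// For example (illustrative), a uniform 'add i16 %a, %b' becomes
 /// 'trunc (add nuw nsw i32 (zext %a to i32), (zext %b to i32)) to i16'.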
117 bool promoteUniformOpToI32(BinaryOperator &I) const;
118
119 /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
120 ///
121 /// \details \p I's base element bit width must be greater than 1 and less
 122 /// than or equal to 16. Promotion is done by sign or zero extending operands to
123 /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
124 ///
125 /// \returns True.
126 bool promoteUniformOpToI32(ICmpInst &I) const;
127
128 /// Promotes uniform 'select' operation \p I to 32 bit 'select'
129 /// operation.
130 ///
131 /// \details \p I's base element bit width must be greater than 1 and less
 132 /// than or equal to 16. Promotion is done by sign or zero extending operands to
133 /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
134 /// result of 32 bit 'select' operation back to \p I's original type.
135 ///
136 /// \returns True.
137 bool promoteUniformOpToI32(SelectInst &I) const;
138
139 /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
140 /// intrinsic.
141 ///
142 /// \details \p I's base element bit width must be greater than 1 and less
 143 /// than or equal to 16. Promotion is done by zero extending the operand to 32
144 /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
145 /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
146 /// shift amount is 32 minus \p I's base element bit width), and truncating
147 /// the result of the shift operation back to \p I's original type.
148 ///
149 /// \returns True.
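 /// For example (illustrative), 'bitreverse i16 %x' becomes
 /// 'trunc (lshr (bitreverse (zext %x to i32)), 16) to i16'.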
150 bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
151
 152 /// \returns The minimum number of bits needed to store the value of \p Op as an
 153 /// unsigned integer. Truncating to this size and then zero-extending to
 154 /// the original size will not change the value.
155 unsigned numBitsUnsigned(Value *Op) const;
156
 157 /// \returns The minimum number of bits needed to store the value of \p Op as a
158 /// signed integer. Truncating to this size and then sign-extending to
159 /// the original size will not change the value.
160 unsigned numBitsSigned(Value *Op) const;
161
162 /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
 163 /// SelectionDAG has an issue where an 'and' asserting that the high bits are
 164 /// known zero can obscure this pattern, so the intrinsic is introduced here.
164 bool replaceMulWithMul24(BinaryOperator &I) const;
165
 166 /// Perform the same fold as the equivalently named function in the DAGCombiner.
 167 /// Since we expand some divisions here, we need to perform this before the
 168 /// expansion obscures the pattern.
168 bool foldBinOpIntoSelect(BinaryOperator &I) const;
169
170 bool divHasSpecialOptimization(BinaryOperator &I,
171 Value *Num, Value *Den) const;
172 int getDivNumBits(BinaryOperator &I,
173 Value *Num, Value *Den,
174 unsigned AtLeast, bool Signed) const;
175
176 /// Expands 24 bit div or rem.
177 Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
178 Value *Num, Value *Den,
179 bool IsDiv, bool IsSigned) const;
180
181 Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
182 Value *Num, Value *Den, unsigned NumBits,
183 bool IsDiv, bool IsSigned) const;
184
185 /// Expands 32 bit div or rem.
186 Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
187 Value *Num, Value *Den) const;
188
189 Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
190 Value *Num, Value *Den) const;
191 void expandDivRem64(BinaryOperator &I) const;
192
193 /// Widen a scalar load.
194 ///
 195 /// \details Widen a uniform, sub-dword load from constant address space to a
 196 /// full 32 bits and truncate the result back to the original type, so that a
 197 /// scalar load can be selected instead of a vector load.
 198 ///
 199 /// \returns True if the load \p I can be widened this way.
200
201 bool canWidenScalarExtLoad(LoadInst &I) const;
202
203public:
204 static char ID;
205
206 AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
207
208 bool visitFDiv(BinaryOperator &I);
209 bool visitXor(BinaryOperator &I);
210
 211 bool visitInstruction(Instruction &I) { return false; }
 212 bool visitBinaryOperator(BinaryOperator &I);
 213 bool visitLoadInst(LoadInst &I);
 214 bool visitICmpInst(ICmpInst &I);
 215 bool visitSelectInst(SelectInst &I);
 216
 217 bool visitIntrinsicInst(IntrinsicInst &I);
 218 bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
219
220 bool doInitialization(Module &M) override;
221 bool runOnFunction(Function &F) override;
222
223 StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
224
 225 void getAnalysisUsage(AnalysisUsage &AU) const override {
 226 AU.addRequired<AssumptionCacheTracker>();
 227 AU.addRequired<UniformityInfoWrapperPass>();
 228
229 // FIXME: Division expansion needs to preserve the dominator tree.
230 if (!ExpandDiv64InIR)
231 AU.setPreservesAll();
232 }
233};
234
235} // end anonymous namespace
236
237unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
238 assert(needsPromotionToI32(T) && "T does not need promotion to i32");
239
240 if (T->isIntegerTy())
241 return T->getIntegerBitWidth();
242 return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
243}
244
245Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
246 assert(needsPromotionToI32(T) && "T does not need promotion to i32");
247
248 if (T->isIntegerTy())
249 return B.getInt32Ty();
250 return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T));
251}
252
253bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
254 return I.getOpcode() == Instruction::AShr ||
255 I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
256}
257
258bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
259 return isa<ICmpInst>(I.getOperand(0)) ?
260 cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
261}
262
263bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
264 if (!Widen16BitOps)
265 return false;
266
267 const IntegerType *IntTy = dyn_cast<IntegerType>(T);
268 if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
269 return true;
270
271 if (const VectorType *VT = dyn_cast<VectorType>(T)) {
272 // TODO: The set of packed operations is more limited, so may want to
273 // promote some anyway.
274 if (ST->hasVOP3PInsts())
275 return false;
276
277 return needsPromotionToI32(VT->getElementType());
278 }
279
280 return false;
281}
282
283// Return true if the op promoted to i32 should have nsw set.
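// The original operands are no wider than 16 bits, so after extension to 32
// bits an add, sub, or shl of the promoted values cannot overflow the signed
// 32-bit range.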
284static bool promotedOpIsNSW(const Instruction &I) {
285 switch (I.getOpcode()) {
286 case Instruction::Shl:
287 case Instruction::Add:
288 case Instruction::Sub:
289 return true;
290 case Instruction::Mul:
291 return I.hasNoUnsignedWrap();
292 default:
293 return false;
294 }
295}
296
297// Return true if the op promoted to i32 should have nuw set.
298static bool promotedOpIsNUW(const Instruction &I) {
299 switch (I.getOpcode()) {
300 case Instruction::Shl:
301 case Instruction::Add:
302 case Instruction::Mul:
303 return true;
304 case Instruction::Sub:
305 return I.hasNoUnsignedWrap();
306 default:
307 return false;
308 }
309}
310
311bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
312 Type *Ty = I.getType();
313 const DataLayout &DL = Mod->getDataLayout();
314 int TySize = DL.getTypeSizeInBits(Ty);
315 Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);
316
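 // Widening is only safe for a simple (non-volatile, non-atomic) sub-dword
 // load that is at least dword-aligned and uniform, so it can be selected as
 // a 32-bit scalar load.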
317 return I.isSimple() && TySize < 32 && Alignment >= 4 && UA->isUniform(&I);
318}
319
320bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
321 assert(needsPromotionToI32(I.getType()) &&
322 "I does not need promotion to i32");
323
324 if (I.getOpcode() == Instruction::SDiv ||
325 I.getOpcode() == Instruction::UDiv ||
326 I.getOpcode() == Instruction::SRem ||
327 I.getOpcode() == Instruction::URem)
328 return false;
329
 329
 330 IRBuilder<> Builder(&I);
 331 Builder.SetCurrentDebugLocation(I.getDebugLoc());
332
333 Type *I32Ty = getI32Ty(Builder, I.getType());
334 Value *ExtOp0 = nullptr;
335 Value *ExtOp1 = nullptr;
336 Value *ExtRes = nullptr;
337 Value *TruncRes = nullptr;
338
339 if (isSigned(I)) {
340 ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
341 ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
342 } else {
343 ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
344 ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
345 }
346
347 ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
348 if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
349 if (promotedOpIsNSW(cast<Instruction>(I)))
350 Inst->setHasNoSignedWrap();
351
352 if (promotedOpIsNUW(cast<Instruction>(I)))
353 Inst->setHasNoUnsignedWrap();
354
355 if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
356 Inst->setIsExact(ExactOp->isExact());
357 }
358
359 TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
360
361 I.replaceAllUsesWith(TruncRes);
362 I.eraseFromParent();
363
364 return true;
365}
366
367bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
368 assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
369 "I does not need promotion to i32");
370
 371 IRBuilder<> Builder(&I);
 372 Builder.SetCurrentDebugLocation(I.getDebugLoc());
373
374 Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
375 Value *ExtOp0 = nullptr;
376 Value *ExtOp1 = nullptr;
377 Value *NewICmp = nullptr;
378
379 if (I.isSigned()) {
380 ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
381 ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
382 } else {
383 ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
384 ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
385 }
386 NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);
387
388 I.replaceAllUsesWith(NewICmp);
389 I.eraseFromParent();
390
391 return true;
392}
393
394bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
395 assert(needsPromotionToI32(I.getType()) &&
396 "I does not need promotion to i32");
397
 398 IRBuilder<> Builder(&I);
 399 Builder.SetCurrentDebugLocation(I.getDebugLoc());
400
401 Type *I32Ty = getI32Ty(Builder, I.getType());
402 Value *ExtOp1 = nullptr;
403 Value *ExtOp2 = nullptr;
404 Value *ExtRes = nullptr;
405 Value *TruncRes = nullptr;
406
407 if (isSigned(I)) {
408 ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
409 ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
410 } else {
411 ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
412 ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
413 }
414 ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
415 TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
416
417 I.replaceAllUsesWith(TruncRes);
418 I.eraseFromParent();
419
420 return true;
421}
422
423bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
424 IntrinsicInst &I) const {
425 assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
426 "I must be bitreverse intrinsic");
427 assert(needsPromotionToI32(I.getType()) &&
428 "I does not need promotion to i32");
429
 430 IRBuilder<> Builder(&I);
 431 Builder.SetCurrentDebugLocation(I.getDebugLoc());
432
433 Type *I32Ty = getI32Ty(Builder, I.getType());
434 Function *I32 =
435 Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
436 Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
437 Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
438 Value *LShrOp =
439 Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
440 Value *TruncRes =
441 Builder.CreateTrunc(LShrOp, I.getType());
442
443 I.replaceAllUsesWith(TruncRes);
444 I.eraseFromParent();
445
446 return true;
447}
448
449unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op) const {
450 return computeKnownBits(Op, *DL, 0, AC).countMaxActiveBits();
451}
452
453unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op) const {
454 return ComputeMaxSignificantBits(Op, *DL, 0, AC);
455}
456
457static void extractValues(IRBuilder<> &Builder,
458 SmallVectorImpl<Value *> &Values, Value *V) {
459 auto *VT = dyn_cast<FixedVectorType>(V->getType());
460 if (!VT) {
461 Values.push_back(V);
462 return;
463 }
464
465 for (int I = 0, E = VT->getNumElements(); I != E; ++I)
466 Values.push_back(Builder.CreateExtractElement(V, I));
467}
468
469static Value *insertValues(IRBuilder<> &Builder,
 470 Type *Ty,
471 SmallVectorImpl<Value *> &Values) {
472 if (!Ty->isVectorTy()) {
473 assert(Values.size() == 1);
474 return Values[0];
475 }
476
477 Value *NewVal = PoisonValue::get(Ty);
478 for (int I = 0, E = Values.size(); I != E; ++I)
479 NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);
480
481 return NewVal;
482}
483
484// Returns 24-bit or 48-bit (as per `NumBits` and `Size`) mul of `LHS` and
485// `RHS`. `NumBits` is the number of KnownBits of the result and `Size` is the
486// width of the original destination.
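// When the destination is wider than 32 bits, the (at most 48-bit) product is
// reassembled below from the mul24 low half and the mulhi24 high half as
// (hi << 32) | lo.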
487static Value *getMul24(IRBuilder<> &Builder, Value *LHS, Value *RHS,
488 unsigned Size, unsigned NumBits, bool IsSigned) {
489 if (Size <= 32 || NumBits <= 32) {
 490 Intrinsic::ID ID =
 491 IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
492 return Builder.CreateIntrinsic(ID, {}, {LHS, RHS});
493 }
494
495 assert(NumBits <= 48);
496
497 Intrinsic::ID LoID =
498 IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
499 Intrinsic::ID HiID =
500 IsSigned ? Intrinsic::amdgcn_mulhi_i24 : Intrinsic::amdgcn_mulhi_u24;
501
502 Value *Lo = Builder.CreateIntrinsic(LoID, {}, {LHS, RHS});
503 Value *Hi = Builder.CreateIntrinsic(HiID, {}, {LHS, RHS});
504
505 IntegerType *I64Ty = Builder.getInt64Ty();
506 Lo = Builder.CreateZExtOrTrunc(Lo, I64Ty);
507 Hi = Builder.CreateZExtOrTrunc(Hi, I64Ty);
508
509 return Builder.CreateOr(Lo, Builder.CreateShl(Hi, 32));
510}
511
512bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
513 if (I.getOpcode() != Instruction::Mul)
514 return false;
515
516 Type *Ty = I.getType();
517 unsigned Size = Ty->getScalarSizeInBits();
518 if (Size <= 16 && ST->has16BitInsts())
519 return false;
520
521 // Prefer scalar if this could be s_mul_i32
522 if (UA->isUniform(&I))
523 return false;
524
525 Value *LHS = I.getOperand(0);
526 Value *RHS = I.getOperand(1);
 527 IRBuilder<> Builder(&I);
 528 Builder.SetCurrentDebugLocation(I.getDebugLoc());
529
530 unsigned LHSBits = 0, RHSBits = 0;
531 bool IsSigned = false;
532
533 if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&
534 (RHSBits = numBitsUnsigned(RHS)) <= 24) {
535 IsSigned = false;
536
537 } else if (ST->hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&
538 (RHSBits = numBitsSigned(RHS)) <= 24) {
539 IsSigned = true;
540
541 } else
542 return false;
543
 544 SmallVector<Value *, 4> LHSVals;
 545 SmallVector<Value *, 4> RHSVals;
 546 SmallVector<Value *, 4> ResultVals;
547 extractValues(Builder, LHSVals, LHS);
548 extractValues(Builder, RHSVals, RHS);
549
550 IntegerType *I32Ty = Builder.getInt32Ty();
551 for (int I = 0, E = LHSVals.size(); I != E; ++I) {
552 Value *LHS, *RHS;
553 if (IsSigned) {
554 LHS = Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty);
555 RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty);
556 } else {
557 LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
558 RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
559 }
560
561 Value *Result =
562 getMul24(Builder, LHS, RHS, Size, LHSBits + RHSBits, IsSigned);
563
564 if (IsSigned) {
565 ResultVals.push_back(
566 Builder.CreateSExtOrTrunc(Result, LHSVals[I]->getType()));
567 } else {
568 ResultVals.push_back(
569 Builder.CreateZExtOrTrunc(Result, LHSVals[I]->getType()));
570 }
571 }
572
573 Value *NewVal = insertValues(Builder, Ty, ResultVals);
574 NewVal->takeName(&I);
575 I.replaceAllUsesWith(NewVal);
576 I.eraseFromParent();
577
578 return true;
579}
580
581// Find a select instruction, which may have been cast. This is mostly to deal
582// with cases where i16 selects were promoted here to i32.
583static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
584 Cast = nullptr;
585 if (SelectInst *Sel = dyn_cast<SelectInst>(V))
586 return Sel;
587
588 if ((Cast = dyn_cast<CastInst>(V))) {
589 if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
590 return Sel;
591 }
592
593 return nullptr;
594}
595
596bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
597 // Don't do this unless the old select is going away. We want to eliminate the
598 // binary operator, not replace a binop with a select.
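 // For example, 'udiv i32 1000, (select i1 %c, i32 10, i32 100)' folds to
 // 'select i1 %c, i32 100, i32 10' once both arms constant-fold.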
599 int SelOpNo = 0;
600
601 CastInst *CastOp;
602
603 // TODO: Should probably try to handle some cases with multiple
604 // users. Duplicating the select may be profitable for division.
605 SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
606 if (!Sel || !Sel->hasOneUse()) {
607 SelOpNo = 1;
608 Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
609 }
610
611 if (!Sel || !Sel->hasOneUse())
612 return false;
613
614 Constant *CT = dyn_cast<Constant>(Sel->getTrueValue());
615 Constant *CF = dyn_cast<Constant>(Sel->getFalseValue());
616 Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
617 if (!CBO || !CT || !CF)
618 return false;
619
620 if (CastOp) {
621 if (!CastOp->hasOneUse())
622 return false;
623 CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), *DL);
624 CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), *DL);
625 }
626
627 // TODO: Handle special 0/-1 cases DAG combine does, although we only really
628 // need to handle divisions here.
629 Constant *FoldedT = SelOpNo ?
630 ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, *DL) :
 631 ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, *DL);
 632 if (!FoldedT || isa<ConstantExpr>(FoldedT))
633 return false;
634
635 Constant *FoldedF = SelOpNo ?
636 ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, *DL) :
 637 ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, *DL);
 638 if (!FoldedF || isa<ConstantExpr>(FoldedF))
639 return false;
640
641 IRBuilder<> Builder(&BO);
642 Builder.SetCurrentDebugLocation(BO.getDebugLoc());
643 if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
644 Builder.setFastMathFlags(FPOp->getFastMathFlags());
645
646 Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
647 FoldedT, FoldedF);
648 NewSelect->takeName(&BO);
649 BO.replaceAllUsesWith(NewSelect);
650 BO.eraseFromParent();
651 if (CastOp)
652 CastOp->eraseFromParent();
653 Sel->eraseFromParent();
654 return true;
655}
656
657// Optimize fdiv with rcp:
658//
659// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
660// allowed with unsafe-fp-math or afn.
661//
662// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
663static Value *optimizeWithRcp(Value *Num, Value *Den, bool AllowInaccurateRcp,
664 bool RcpIsAccurate, IRBuilder<> &Builder,
665 Module *Mod) {
666
667 if (!AllowInaccurateRcp && !RcpIsAccurate)
668 return nullptr;
669
670 Type *Ty = Den->getType();
671 if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
672 if (AllowInaccurateRcp || RcpIsAccurate) {
673 if (CLHS->isExactlyValue(1.0)) {
 674 Function *Decl = Intrinsic::getDeclaration(
 675 Mod, Intrinsic::amdgcn_rcp, Ty);
676
 677 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
 678 // the CI documentation have a worst-case error of 1 ulp.
 679 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
 680 // use them as long as we aren't trying to use denormals.
681 //
682 // v_rcp_f16 and v_rsq_f16 DO support denormals.
683
684 // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
685 // insert rsq intrinsic here.
686
687 // 1.0 / x -> rcp(x)
688 return Builder.CreateCall(Decl, { Den });
689 }
690
691 // Same as for 1.0, but expand the sign out of the constant.
692 if (CLHS->isExactlyValue(-1.0)) {
 693 Function *Decl = Intrinsic::getDeclaration(
 694 Mod, Intrinsic::amdgcn_rcp, Ty);
695
696 // -1.0 / x -> rcp (fneg x)
697 Value *FNeg = Builder.CreateFNeg(Den);
698 return Builder.CreateCall(Decl, { FNeg });
699 }
700 }
701 }
702
703 if (AllowInaccurateRcp) {
 704 Function *Decl = Intrinsic::getDeclaration(
 705 Mod, Intrinsic::amdgcn_rcp, Ty);
706
707 // Turn into multiply by the reciprocal.
708 // x / y -> x * (1.0 / y)
709 Value *Recip = Builder.CreateCall(Decl, { Den });
710 return Builder.CreateFMul(Num, Recip);
711 }
712 return nullptr;
713}
714
715// optimize with fdiv.fast:
716//
717// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
718//
719// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
720//
721// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
722static Value *optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy,
723 bool HasDenormals, IRBuilder<> &Builder,
724 Module *Mod) {
725 // fdiv.fast can achieve 2.5 ULP accuracy.
726 if (ReqdAccuracy < 2.5f)
727 return nullptr;
728
729 // Only have fdiv.fast for f32.
730 Type *Ty = Den->getType();
731 if (!Ty->isFloatTy())
732 return nullptr;
733
734 bool NumIsOne = false;
735 if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
736 if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
737 NumIsOne = true;
738 }
739
 740 // fdiv.fast does not support denormals, but it is always fine to use for 1.0/x.
741 if (HasDenormals && !NumIsOne)
742 return nullptr;
743
744 Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
745 return Builder.CreateCall(Decl, { Num, Den });
746}
747
748// The optimization is performed based on fpmath, fast-math flags, and the
749// denormal mode to lower fdiv with either rcp or fdiv.fast.
750//
751// With rcp:
752// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
753// allowed with unsafe-fp-math or afn.
754//
755// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
756//
757// With fdiv.fast:
758// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
759//
760// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
761//
762// NOTE: rcp is the preference in cases that both are legal.
763bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
764
765 Type *Ty = FDiv.getType()->getScalarType();
766
767 // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
768 // expansion around them in codegen.
769 if (Ty->isDoubleTy())
770 return false;
771
772 // No intrinsic for fdiv16 if target does not support f16.
773 if (Ty->isHalfTy() && !ST->has16BitInsts())
774 return false;
775
776 const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
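 // ReqdAccuracy is the accuracy demanded by !fpmath metadata, in ULPs; it is
 // 0.0 when no metadata is attached, which disables the fdiv.fast path below.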
777 const float ReqdAccuracy = FPOp->getFPAccuracy();
778
779 // Inaccurate rcp is allowed with unsafe-fp-math or afn.
780 FastMathFlags FMF = FPOp->getFastMathFlags();
781 const bool AllowInaccurateRcp = HasUnsafeFPMath || FMF.approxFunc();
782
783 // rcp_f16 is accurate for !fpmath >= 1.0ulp.
784 // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
785 // rcp_f64 is never accurate.
786 const bool RcpIsAccurate = (Ty->isHalfTy() && ReqdAccuracy >= 1.0f) ||
787 (Ty->isFloatTy() && !HasFP32Denormals && ReqdAccuracy >= 1.0f);
788
789 IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
790 Builder.setFastMathFlags(FMF);
791 Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
792
793 Value *Num = FDiv.getOperand(0);
794 Value *Den = FDiv.getOperand(1);
795
796 Value *NewFDiv = nullptr;
797 if (auto *VT = dyn_cast<FixedVectorType>(FDiv.getType())) {
798 NewFDiv = PoisonValue::get(VT);
799
800 // FIXME: Doesn't do the right thing for cases where the vector is partially
801 // constant. This works when the scalarizer pass is run first.
802 for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
803 Value *NumEltI = Builder.CreateExtractElement(Num, I);
804 Value *DenEltI = Builder.CreateExtractElement(Den, I);
805 // Try rcp first.
806 Value *NewElt = optimizeWithRcp(NumEltI, DenEltI, AllowInaccurateRcp,
807 RcpIsAccurate, Builder, Mod);
808 if (!NewElt) // Try fdiv.fast.
809 NewElt = optimizeWithFDivFast(NumEltI, DenEltI, ReqdAccuracy,
810 HasFP32Denormals, Builder, Mod);
811 if (!NewElt) // Keep the original.
812 NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
813
814 NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
815 }
816 } else { // Scalar FDiv.
817 // Try rcp first.
818 NewFDiv = optimizeWithRcp(Num, Den, AllowInaccurateRcp, RcpIsAccurate,
819 Builder, Mod);
820 if (!NewFDiv) { // Try fdiv.fast.
821 NewFDiv = optimizeWithFDivFast(Num, Den, ReqdAccuracy, HasFP32Denormals,
822 Builder, Mod);
823 }
824 }
825
826 if (NewFDiv) {
827 FDiv.replaceAllUsesWith(NewFDiv);
828 NewFDiv->takeName(&FDiv);
829 FDiv.eraseFromParent();
830 }
831
832 return !!NewFDiv;
833}
834
835bool AMDGPUCodeGenPrepare::visitXor(BinaryOperator &I) {
836 // Match the Xor instruction, its type and its operands
837 IntrinsicInst *IntrinsicCall = dyn_cast<IntrinsicInst>(I.getOperand(0));
838 ConstantInt *RHS = dyn_cast<ConstantInt>(I.getOperand(1));
839 if (!RHS || !IntrinsicCall || RHS->getSExtValue() != -1)
840 return visitBinaryOperator(I);
841
 842 // Check that the call is to the amdgcn.class intrinsic and that it has only
 843 // one use.
844 if (IntrinsicCall->getIntrinsicID() != Intrinsic::amdgcn_class ||
845 !IntrinsicCall->hasOneUse())
846 return visitBinaryOperator(I);
847
848 // "Not" the second argument of the intrinsic call
849 ConstantInt *Arg = dyn_cast<ConstantInt>(IntrinsicCall->getOperand(1));
850 if (!Arg)
851 return visitBinaryOperator(I);
852
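 // llvm.amdgcn.class tests a 10-bit mask of floating-point classes, so
 // 'xor (class x, mask), -1' is equivalent to testing the complementary mask:
 // class(x, mask ^ 0x3ff).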
853 IntrinsicCall->setOperand(
854 1, ConstantInt::get(Arg->getType(), Arg->getZExtValue() ^ 0x3ff));
855 I.replaceAllUsesWith(IntrinsicCall);
856 I.eraseFromParent();
857 return true;
858}
859
860static bool hasUnsafeFPMath(const Function &F) {
861 Attribute Attr = F.getFnAttribute("unsafe-fp-math");
862 return Attr.getValueAsBool();
863}
864
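// Returns the {low, high} 32-bit halves of the full 64-bit product of two i32
// values, both treated as unsigned.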
865static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
866 Value *LHS, Value *RHS) {
867 Type *I32Ty = Builder.getInt32Ty();
868 Type *I64Ty = Builder.getInt64Ty();
869
870 Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
871 Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
872 Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
873 Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
874 Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
875 Hi = Builder.CreateTrunc(Hi, I32Ty);
876 return std::pair(Lo, Hi);
877}
878
879static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
880 return getMul64(Builder, LHS, RHS).second;
881}
882
883/// Figure out how many bits are really needed for this division. \p AtLeast is
884/// an optimization hint to bypass the second ComputeNumSignBits call if the
885/// first one is insufficient. Returns -1 on failure.
886int AMDGPUCodeGenPrepare::getDivNumBits(BinaryOperator &I,
887 Value *Num, Value *Den,
888 unsigned AtLeast, bool IsSigned) const {
889 const DataLayout &DL = Mod->getDataLayout();
890 unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
891 if (LHSSignBits < AtLeast)
892 return -1;
893
894 unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
895 if (RHSSignBits < AtLeast)
896 return -1;
897
898 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
899 unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits;
900 if (IsSigned)
901 ++DivBits;
902 return DivBits;
903}
904
905// The 24-bit significand of a float is enough to exactly represent integers of
906// up to 24 bits (signed).
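// Strategy: convert both operands to float, form the quotient with v_rcp,
// truncate it, and then correct the truncated quotient by at most one using
// the recomputed remainder.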
907Value *AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
 908 BinaryOperator &I,
 909 Value *Num, Value *Den,
910 bool IsDiv, bool IsSigned) const {
911 int DivBits = getDivNumBits(I, Num, Den, 9, IsSigned);
912 if (DivBits == -1)
913 return nullptr;
914 return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
915}
916
917Value *AMDGPUCodeGenPrepare::expandDivRem24Impl(IRBuilder<> &Builder,
 918 BinaryOperator &I,
 919 Value *Num, Value *Den,
920 unsigned DivBits,
921 bool IsDiv, bool IsSigned) const {
922 Type *I32Ty = Builder.getInt32Ty();
923 Num = Builder.CreateTrunc(Num, I32Ty);
924 Den = Builder.CreateTrunc(Den, I32Ty);
925
926 Type *F32Ty = Builder.getFloatTy();
927 ConstantInt *One = Builder.getInt32(1);
928 Value *JQ = One;
929
930 if (IsSigned) {
931 // char|short jq = ia ^ ib;
932 JQ = Builder.CreateXor(Num, Den);
933
934 // jq = jq >> (bitsize - 2)
935 JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));
936
937 // jq = jq | 0x1
938 JQ = Builder.CreateOr(JQ, One);
939 }
940
941 // int ia = (int)LHS;
942 Value *IA = Num;
943
 944 // int ib = (int)RHS;
945 Value *IB = Den;
946
947 // float fa = (float)ia;
948 Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
949 : Builder.CreateUIToFP(IA, F32Ty);
950
951 // float fb = (float)ib;
952 Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty)
953 : Builder.CreateUIToFP(IB,F32Ty);
954
955 Function *RcpDecl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp,
956 Builder.getFloatTy());
957 Value *RCP = Builder.CreateCall(RcpDecl, { FB });
958 Value *FQM = Builder.CreateFMul(FA, RCP);
959
960 // fq = trunc(fqm);
961 CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
962 FQ->copyFastMathFlags(Builder.getFastMathFlags());
963
964 // float fqneg = -fq;
965 Value *FQNeg = Builder.CreateFNeg(FQ);
966
967 // float fr = mad(fqneg, fb, fa);
968 auto FMAD = !ST->hasMadMacF32Insts()
969 ? Intrinsic::fma
970 : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
971 Value *FR = Builder.CreateIntrinsic(FMAD,
972 {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);
973
974 // int iq = (int)fq;
975 Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
976 : Builder.CreateFPToUI(FQ, I32Ty);
977
978 // fr = fabs(fr);
979 FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);
980
981 // fb = fabs(fb);
982 FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);
983
984 // int cv = fr >= fb;
985 Value *CV = Builder.CreateFCmpOGE(FR, FB);
986
987 // jq = (cv ? jq : 0);
988 JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));
989
990 // dst = iq + jq;
991 Value *Div = Builder.CreateAdd(IQ, JQ);
992
993 Value *Res = Div;
994 if (!IsDiv) {
 995 // Rem needs compensation; it's easier to recompute it.
996 Value *Rem = Builder.CreateMul(Div, Den);
997 Res = Builder.CreateSub(Num, Rem);
998 }
999
1000 if (DivBits != 0 && DivBits < 32) {
1001 // Extend in register from the number of bits this divide really is.
1002 if (IsSigned) {
1003 int InRegBits = 32 - DivBits;
1004
1005 Res = Builder.CreateShl(Res, InRegBits);
1006 Res = Builder.CreateAShr(Res, InRegBits);
1007 } else {
1008 ConstantInt *TruncMask
1009 = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
1010 Res = Builder.CreateAnd(Res, TruncMask);
1011 }
1012 }
1013
1014 return Res;
1015}
1016
1017// Try to recognize special cases for which the DAG will emit better expansions
1018// than the general expansion we do here.
1019
1020// TODO: It would be better to just directly handle those optimizations here.
1021bool AMDGPUCodeGenPrepare::divHasSpecialOptimization(
1022 BinaryOperator &I, Value *Num, Value *Den) const {
1023 if (Constant *C = dyn_cast<Constant>(Den)) {
1024 // Arbitrary constants get a better expansion as long as a wider mulhi is
1025 // legal.
1026 if (C->getType()->getScalarSizeInBits() <= 32)
1027 return true;
1028
1029 // TODO: Sdiv check for not exact for some reason.
1030
1031 // If there's no wider mulhi, there's only a better expansion for powers of
1032 // two.
1033 // TODO: Should really know for each vector element.
1034 if (isKnownToBeAPowerOfTwo(C, *DL, true, 0, AC, &I, DT))
1035 return true;
1036
1037 return false;
1038 }
1039
1040 if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
1041 // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
1042 if (BinOpDen->getOpcode() == Instruction::Shl &&
1043 isa<Constant>(BinOpDen->getOperand(0)) &&
1044 isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), *DL, true,
1045 0, AC, &I, DT)) {
1046 return true;
1047 }
1048 }
1049
1050 return false;
1051}
1052
1053static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout *DL) {
1054 // Check whether the sign can be determined statically.
1055 KnownBits Known = computeKnownBits(V, *DL);
1056 if (Known.isNegative())
1057 return Constant::getAllOnesValue(V->getType());
1058 if (Known.isNonNegative())
1059 return Constant::getNullValue(V->getType());
1060 return Builder.CreateAShr(V, Builder.getInt32(31));
1061}
1062
1063Value *AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
 1064 BinaryOperator &I, Value *X,
 1065 Value *Y) const {
1066 Instruction::BinaryOps Opc = I.getOpcode();
1067 assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
1068 Opc == Instruction::SRem || Opc == Instruction::SDiv);
1069
1070 FastMathFlags FMF;
1071 FMF.setFast();
1072 Builder.setFastMathFlags(FMF);
1073
1074 if (divHasSpecialOptimization(I, X, Y))
1075 return nullptr; // Keep it for later optimization.
1076
1077 bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
1078 bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
1079
1080 Type *Ty = X->getType();
1081 Type *I32Ty = Builder.getInt32Ty();
1082 Type *F32Ty = Builder.getFloatTy();
1083
1084 if (Ty->getScalarSizeInBits() < 32) {
1085 if (IsSigned) {
1086 X = Builder.CreateSExt(X, I32Ty);
1087 Y = Builder.CreateSExt(Y, I32Ty);
1088 } else {
1089 X = Builder.CreateZExt(X, I32Ty);
1090 Y = Builder.CreateZExt(Y, I32Ty);
1091 }
1092 }
1093
1094 if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
1095 return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) :
1096 Builder.CreateZExtOrTrunc(Res, Ty);
1097 }
1098
1099 ConstantInt *Zero = Builder.getInt32(0);
1100 ConstantInt *One = Builder.getInt32(1);
1101
1102 Value *Sign = nullptr;
1103 if (IsSigned) {
1104 Value *SignX = getSign32(X, Builder, DL);
1105 Value *SignY = getSign32(Y, Builder, DL);
1106 // Remainder sign is the same as LHS
1107 Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;
1108
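 // Take absolute values: (v + sign) ^ sign negates v when sign is -1 and
 // leaves it unchanged when sign is 0.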
1109 X = Builder.CreateAdd(X, SignX);
1110 Y = Builder.CreateAdd(Y, SignY);
1111
1112 X = Builder.CreateXor(X, SignX);
1113 Y = Builder.CreateXor(Y, SignY);
1114 }
1115
1116 // The algorithm here is based on ideas from "Software Integer Division", Tom
1117 // Rodeheffer, August 2008.
1118 //
1119 // unsigned udiv(unsigned x, unsigned y) {
1120 // // Initial estimate of inv(y). The constant is less than 2^32 to ensure
1121 // // that this is a lower bound on inv(y), even if some of the calculations
1122 // // round up.
1123 // unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
1124 //
1125 // // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
1126 // // Empirically this is guaranteed to give a "two-y" lower bound on
1127 // // inv(y).
1128 // z += umulh(z, -y * z);
1129 //
1130 // // Quotient/remainder estimate.
1131 // unsigned q = umulh(x, z);
1132 // unsigned r = x - q * y;
1133 //
1134 // // Two rounds of quotient/remainder refinement.
1135 // if (r >= y) {
1136 // ++q;
1137 // r -= y;
1138 // }
1139 // if (r >= y) {
1140 // ++q;
1141 // r -= y;
1142 // }
1143 //
1144 // return q;
1145 // }
1146
1147 // Initial estimate of inv(y).
1148 Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
1149 Function *Rcp = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, F32Ty);
1150 Value *RcpY = Builder.CreateCall(Rcp, {FloatY});
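 // 0x4F7FFFFE is the bit pattern of 4294966784.0f (2^32 - 512), the scale
 // factor from the algorithm sketch above.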
1151 Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast<float>(0x4F7FFFFE));
1152 Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
1153 Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);
1154
1155 // One round of UNR.
1156 Value *NegY = Builder.CreateSub(Zero, Y);
1157 Value *NegYZ = Builder.CreateMul(NegY, Z);
1158 Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ));
1159
1160 // Quotient/remainder estimate.
1161 Value *Q = getMulHu(Builder, X, Z);
1162 Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y));
1163
1164 // First quotient/remainder refinement.
1165 Value *Cond = Builder.CreateICmpUGE(R, Y);
1166 if (IsDiv)
1167 Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1168 R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1169
1170 // Second quotient/remainder refinement.
1171 Cond = Builder.CreateICmpUGE(R, Y);
1172 Value *Res;
1173 if (IsDiv)
1174 Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1175 else
1176 Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1177
1178 if (IsSigned) {
1179 Res = Builder.CreateXor(Res, Sign);
1180 Res = Builder.CreateSub(Res, Sign);
1181 }
1182
1183 Res = Builder.CreateTrunc(Res, Ty);
1184
1185 return Res;
1186}
1187
1188Value *AMDGPUCodeGenPrepare::shrinkDivRem64(IRBuilder<> &Builder,
 1189 BinaryOperator &I,
 1190 Value *Num, Value *Den) const {
1191 if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
1192 return nullptr; // Keep it for later optimization.
1193
1194 Instruction::BinaryOps Opc = I.getOpcode();
1195
1196 bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
1197 bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
1198
1199 int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
1200 if (NumDivBits == -1)
1201 return nullptr;
1202
1203 Value *Narrowed = nullptr;
1204 if (NumDivBits <= 24) {
1205 Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
1206 IsDiv, IsSigned);
1207 } else if (NumDivBits <= 32) {
1208 Narrowed = expandDivRem32(Builder, I, Num, Den);
1209 }
1210
1211 if (Narrowed) {
1212 return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
1213 Builder.CreateZExt(Narrowed, Num->getType());
1214 }
1215
1216 return nullptr;
1217}
1218
1219void AMDGPUCodeGenPrepare::expandDivRem64(BinaryOperator &I) const {
1220 Instruction::BinaryOps Opc = I.getOpcode();
1221 // Do the general expansion.
1222 if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
 1223 expandDivisionUpTo64Bits(&I);
 1224 return;
1225 }
1226
1227 if (Opc == Instruction::URem || Opc == Instruction::SRem) {
 1228 expandRemainderUpTo64Bits(&I);
 1229 return;
1230 }
1231
1232 llvm_unreachable("not a division");
1233}
1234
1235bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
1236 if (foldBinOpIntoSelect(I))
1237 return true;
1238
1239 if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
1240 UA->isUniform(&I) && promoteUniformOpToI32(I))
1241 return true;
1242
1243 if (UseMul24Intrin && replaceMulWithMul24(I))
1244 return true;
1245
1246 bool Changed = false;
1247 Instruction::BinaryOps Opc = I.getOpcode();
1248 Type *Ty = I.getType();
1249 Value *NewDiv = nullptr;
1250 unsigned ScalarSize = Ty->getScalarSizeInBits();
1251
 1252 SmallVector<BinaryOperator *, 8> Div64ToExpand;
 1253
1254 if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
1255 Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
1256 ScalarSize <= 64 &&
1257 !DisableIDivExpand) {
1258 Value *Num = I.getOperand(0);
1259 Value *Den = I.getOperand(1);
 1260 IRBuilder<> Builder(&I);
 1261 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1262
1263 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1264 NewDiv = PoisonValue::get(VT);
1265
1266 for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
1267 Value *NumEltN = Builder.CreateExtractElement(Num, N);
1268 Value *DenEltN = Builder.CreateExtractElement(Den, N);
1269
1270 Value *NewElt;
1271 if (ScalarSize <= 32) {
1272 NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
1273 if (!NewElt)
1274 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1275 } else {
1276 // See if this 64-bit division can be shrunk to 32/24-bits before
1277 // producing the general expansion.
1278 NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
1279 if (!NewElt) {
1280 // The general 64-bit expansion introduces control flow and doesn't
1281 // return the new value. Just insert a scalar copy and defer
1282 // expanding it.
1283 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1284 Div64ToExpand.push_back(cast<BinaryOperator>(NewElt));
1285 }
1286 }
1287
1288 NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
1289 }
1290 } else {
1291 if (ScalarSize <= 32)
1292 NewDiv = expandDivRem32(Builder, I, Num, Den);
1293 else {
1294 NewDiv = shrinkDivRem64(Builder, I, Num, Den);
1295 if (!NewDiv)
1296 Div64ToExpand.push_back(&I);
1297 }
1298 }
1299
1300 if (NewDiv) {
1301 I.replaceAllUsesWith(NewDiv);
1302 I.eraseFromParent();
1303 Changed = true;
1304 }
1305 }
1306
1307 if (ExpandDiv64InIR) {
1308 // TODO: We get much worse code in specially handled constant cases.
1309 for (BinaryOperator *Div : Div64ToExpand) {
1310 expandDivRem64(*Div);
1311 Changed = true;
1312 }
1313 }
1314
1315 return Changed;
1316}
1317
1318bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
1319 if (!WidenLoads)
1320 return false;
1321
1322 if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
1323 I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1324 canWidenScalarExtLoad(I)) {
 1325 IRBuilder<> Builder(&I);
 1326 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1327
1328 Type *I32Ty = Builder.getInt32Ty();
1329 Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
1330 Value *BitCast= Builder.CreateBitCast(I.getPointerOperand(), PT);
1331 LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, BitCast);
1332 WidenLoad->copyMetadata(I);
1333
1334 // If we have range metadata, we need to convert the type, and not make
1335 // assumptions about the high bits.
1336 if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
 1337 ConstantInt *Lower =
 1338 mdconst::extract<ConstantInt>(Range->getOperand(0));
1339
1340 if (Lower->isNullValue()) {
1341 WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
1342 } else {
1343 Metadata *LowAndHigh[] = {
1344 ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
1345 // Don't make assumptions about the high bits.
 1346 ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
 1347 };
1348
1349 WidenLoad->setMetadata(LLVMContext::MD_range,
 1350 MDNode::get(Mod->getContext(), LowAndHigh));
 1351 }
1352 }
1353
1354 int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
1355 Type *IntNTy = Builder.getIntNTy(TySize);
1356 Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
1357 Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
1358 I.replaceAllUsesWith(ValOrig);
1359 I.eraseFromParent();
1360 return true;
1361 }
1362
1363 return false;
1364}
1365
1366bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
1367 bool Changed = false;
1368
1369 if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
1370 UA->isUniform(&I))
1371 Changed |= promoteUniformOpToI32(I);
1372
1373 return Changed;
1374}
1375
1376bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
1377 bool Changed = false;
1378
1379 if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
1380 UA->isUniform(&I))
1381 Changed |= promoteUniformOpToI32(I);
1382
1383 return Changed;
1384}
1385
1386bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
1387 switch (I.getIntrinsicID()) {
1388 case Intrinsic::bitreverse:
1389 return visitBitreverseIntrinsicInst(I);
1390 default:
1391 return false;
1392 }
1393}
1394
1395bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
1396 bool Changed = false;
1397
1398 if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
1399 UA->isUniform(&I))
1400 Changed |= promoteUniformBitreverseToI32(I);
1401
1402 return Changed;
1403}
1404
1405bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
1406 Mod = &M;
1407 DL = &Mod->getDataLayout();
1408 return false;
1409}
1410
1411bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
1412 if (skipFunction(F))
1413 return false;
1414
1415 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
1416 if (!TPC)
1417 return false;
1418
1419 const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
1420 ST = &TM.getSubtarget<GCNSubtarget>(F);
1421 AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1422 UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
1423
1424 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
1425 DT = DTWP ? &DTWP->getDomTree() : nullptr;
1426
1427 HasUnsafeFPMath = hasUnsafeFPMath(F);
1428
 1428
 1429 SIModeRegisterDefaults Mode(F);
 1430 HasFP32Denormals = Mode.allFP32Denormals();
1431
1432 bool MadeChange = false;
1433
1434 Function::iterator NextBB;
1435 for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
1436 BasicBlock *BB = &*FI;
1437 NextBB = std::next(FI);
1438
 1439 BasicBlock::iterator Next;
 1440 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; I = Next) {
1441 Next = std::next(I);
1442
1443 MadeChange |= visit(*I);
1444
1445 if (Next != E) { // Control flow changed
1446 BasicBlock *NextInstBB = Next->getParent();
1447 if (NextInstBB != BB) {
1448 BB = NextInstBB;
1449 E = BB->end();
1450 FE = F.end();
1451 }
1452 }
1453 }
1454 }
1455
1456 return MadeChange;
1457}
1458
1459INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
 1460 "AMDGPU IR optimizations", false, false)
1461INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
1462INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
1463INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
1464 false, false)
1465
1466char AMDGPUCodeGenPrepare::ID = 0;
1467
1468FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
 1469 return new AMDGPUCodeGenPrepare();
1470}