LLVM 13.0.0git
AMDGPUCodeGenPrepare.cpp
1 //===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass does misc. AMDGPU optimizations on IR before instruction
11 /// selection.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPU.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "llvm/Analysis/AssumptionCache.h"
18 #include "llvm/Analysis/ConstantFolding.h"
19 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
20 #include "llvm/Analysis/ValueTracking.h"
21 #include "llvm/CodeGen/TargetPassConfig.h"
22 #include "llvm/IR/Dominators.h"
23 #include "llvm/IR/InstVisitor.h"
24 #include "llvm/IR/IntrinsicsAMDGPU.h"
25 #include "llvm/InitializePasses.h"
26 #include "llvm/Pass.h"
27 #include "llvm/Support/KnownBits.h"
28 #include "llvm/Transforms/Utils/IntegerDivision.h"
29 
30 #define DEBUG_TYPE "amdgpu-codegenprepare"
31 
32 using namespace llvm;
33 
34 namespace {
35 
36 static cl::opt<bool> WidenLoads(
37  "amdgpu-codegenprepare-widen-constant-loads",
38  cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
39  cl::ReallyHidden,
40  cl::init(false));
41 
42 static cl::opt<bool> Widen16BitOps(
43  "amdgpu-codegenprepare-widen-16-bit-ops",
44  cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
45  cl::ReallyHidden,
46  cl::init(true));
47 
48 static cl::opt<bool> UseMul24Intrin(
49  "amdgpu-codegenprepare-mul24",
50  cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
51  cl::ReallyHidden,
52  cl::init(true));
53 
54 // Legalize 64-bit division by using the generic IR expansion.
55 static cl::opt<bool> ExpandDiv64InIR(
56  "amdgpu-codegenprepare-expand-div64",
57  cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
58  cl::ReallyHidden,
59  cl::init(false));
60 
61 // Leave all division operations as they are. This supersedes ExpandDiv64InIR
62 // and is used for testing the legalizer.
63 static cl::opt<bool> DisableIDivExpand(
64  "amdgpu-codegenprepare-disable-idiv-expansion",
65  cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
66  cl::ReallyHidden,
67  cl::init(false));
68 
69 class AMDGPUCodeGenPrepare : public FunctionPass,
70  public InstVisitor<AMDGPUCodeGenPrepare, bool> {
71  const GCNSubtarget *ST = nullptr;
72  AssumptionCache *AC = nullptr;
73  DominatorTree *DT = nullptr;
74  LegacyDivergenceAnalysis *DA = nullptr;
75  Module *Mod = nullptr;
76  const DataLayout *DL = nullptr;
77  bool HasUnsafeFPMath = false;
78  bool HasFP32Denormals = false;
79 
84  /// \returns \p T's base element bit width.
85  unsigned getBaseElementBitWidth(const Type *T) const;
86 
87  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
88  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
89  /// is returned.
90  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;
91 
92  /// \returns True if binary operation \p I is a signed binary operation, false
93  /// otherwise.
94  bool isSigned(const BinaryOperator &I) const;
95 
96  /// \returns True if the condition of 'select' operation \p I comes from a
97  /// signed 'icmp' operation, false otherwise.
98  bool isSigned(const SelectInst &I) const;
99 
100  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
101  /// false otherwise.
102  bool needsPromotionToI32(const Type *T) const;
103 
104  /// Promotes uniform binary operation \p I to equivalent 32 bit binary
105  /// operation.
106  ///
107  /// \details \p I's base element bit width must be greater than 1 and less
108  /// than or equal to 16. Promotion is done by sign or zero extending operands to
109  /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
110  /// truncating the result of 32 bit binary operation back to \p I's original
111  /// type. Division operation is not promoted.
112  ///
113  /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
114  /// false otherwise.
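  /// For illustration, assuming a uniform i16 add with hypothetical operands
  /// %a and %b, the promotion is roughly:
  ///   %a32 = zext i16 %a to i32
  ///   %b32 = zext i16 %b to i32
  ///   %r32 = add nuw nsw i32 %a32, %b32
  ///   %r   = trunc i32 %r32 to i16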
115  bool promoteUniformOpToI32(BinaryOperator &I) const;
116 
117  /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
118  ///
119  /// \details \p I's base element bit width must be greater than 1 and less
120  /// than or equal to 16. Promotion is done by sign or zero extending operands to
121  /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
122  ///
123  /// \returns True.
124  bool promoteUniformOpToI32(ICmpInst &I) const;
125 
126  /// Promotes uniform 'select' operation \p I to 32 bit 'select'
127  /// operation.
128  ///
129  /// \details \p I's base element bit width must be greater than 1 and less
130  /// than or equal to 16. Promotion is done by sign or zero extending operands to
131  /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
132  /// result of 32 bit 'select' operation back to \p I's original type.
133  ///
134  /// \returns True.
135  bool promoteUniformOpToI32(SelectInst &I) const;
136 
137  /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
138  /// intrinsic.
139  ///
140  /// \details \p I's base element bit width must be greater than 1 and less
141  /// than or equal to 16. Promotion is done by zero extending the operand to 32
142  /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
143  /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
144  /// shift amount is 32 minus \p I's base element bit width), and truncating
145  /// the result of the shift operation back to \p I's original type.
146  ///
147  /// \returns True.
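  /// For illustration, assuming a uniform i16 input %x (hypothetical name),
  /// the promotion is roughly:
  ///   %ext  = zext i16 %x to i32
  ///   %brev = call i32 @llvm.bitreverse.i32(i32 %ext)
  ///   %shr  = lshr i32 %brev, 16
  ///   %res  = trunc i32 %shr to i16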
148  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
149 
150 
151  unsigned numBitsUnsigned(Value *Op, unsigned ScalarSize) const;
152  unsigned numBitsSigned(Value *Op, unsigned ScalarSize) const;
153  bool isI24(Value *V, unsigned ScalarSize) const;
154  bool isU24(Value *V, unsigned ScalarSize) const;
155 
156  /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.i24.
157  /// SelectionDAG has an issue where an and asserting the bits are known
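  /// For illustration, a divergent i32 mul whose operands are known to fit in
  /// 24 bits (hypothetical values %a and %b) becomes roughly:
  ///   %r = call i32 @llvm.amdgcn.mul.u24(i32 %a, i32 %b)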
158  bool replaceMulWithMul24(BinaryOperator &I) const;
159 
160  /// Perform the same fold as the equivalently named function in DAGCombiner.
161  /// Since we expand some divisions here, we need to perform this before the expansion obscures the select.
162  bool foldBinOpIntoSelect(BinaryOperator &I) const;
163 
164  bool divHasSpecialOptimization(BinaryOperator &I,
165  Value *Num, Value *Den) const;
166  int getDivNumBits(BinaryOperator &I,
167  Value *Num, Value *Den,
168  unsigned AtLeast, bool Signed) const;
169 
170  /// Expands 24 bit div or rem.
171  Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
172  Value *Num, Value *Den,
173  bool IsDiv, bool IsSigned) const;
174 
175  Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
176  Value *Num, Value *Den, unsigned NumBits,
177  bool IsDiv, bool IsSigned) const;
178 
179  /// Expands 32 bit div or rem.
180  Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
181  Value *Num, Value *Den) const;
182 
183  Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
184  Value *Num, Value *Den) const;
185  void expandDivRem64(BinaryOperator &I) const;
186 
187  /// Check whether a scalar load can be widened.
188  ///
189  /// \details Uniform, sub-dword loads from constant memory are widened to a
190  /// full 32 bits and the result truncated, so that a scalar load can be used
191  /// instead of a vector load.
192  ///
193  /// \returns True if the load in \p I can be widened this way.
194 
195  bool canWidenScalarExtLoad(LoadInst &I) const;
196 
197 public:
198  static char ID;
199 
200  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
201 
202  bool visitFDiv(BinaryOperator &I);
203 
204  bool visitInstruction(Instruction &I) { return false; }
205  bool visitBinaryOperator(BinaryOperator &I);
206  bool visitLoadInst(LoadInst &I);
207  bool visitICmpInst(ICmpInst &I);
208  bool visitSelectInst(SelectInst &I);
209 
210  bool visitIntrinsicInst(IntrinsicInst &I);
211  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
212 
213  bool doInitialization(Module &M) override;
214  bool runOnFunction(Function &F) override;
215 
216  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
217 
218  void getAnalysisUsage(AnalysisUsage &AU) const override {
219  AU.addRequired<AssumptionCacheTracker>();
220  AU.addRequired<LegacyDivergenceAnalysis>();
221 
222  // FIXME: Division expansion needs to preserve the dominator tree.
223  if (!ExpandDiv64InIR)
224  AU.setPreservesAll();
225  }
226 };
227 
228 } // end anonymous namespace
229 
230 unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
231  assert(needsPromotionToI32(T) && "T does not need promotion to i32");
232 
233  if (T->isIntegerTy())
234  return T->getIntegerBitWidth();
235  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
236 }
237 
238 Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
239  assert(needsPromotionToI32(T) && "T does not need promotion to i32");
240 
241  if (T->isIntegerTy())
242  return B.getInt32Ty();
243  return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T));
244 }
245 
246 bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
247  return I.getOpcode() == Instruction::AShr ||
248  I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
249 }
250 
251 bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
252  return isa<ICmpInst>(I.getOperand(0)) ?
253  cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
254 }
255 
256 bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
257  if (!Widen16BitOps)
258  return false;
259 
260  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
261  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
262  return true;
263 
264  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
265  // TODO: The set of packed operations is more limited, so may want to
266  // promote some anyway.
267  if (ST->hasVOP3PInsts())
268  return false;
269 
270  return needsPromotionToI32(VT->getElementType());
271  }
272 
273  return false;
274 }
275 
276 // Return true if the op promoted to i32 should have nsw set.
277 static bool promotedOpIsNSW(const Instruction &I) {
278  switch (I.getOpcode()) {
279  case Instruction::Shl:
280  case Instruction::Add:
281  case Instruction::Sub:
282  return true;
283  case Instruction::Mul:
284  return I.hasNoUnsignedWrap();
285  default:
286  return false;
287  }
288 }
289 
290 // Return true if the op promoted to i32 should have nuw set.
291 static bool promotedOpIsNUW(const Instruction &I) {
292  switch (I.getOpcode()) {
293  case Instruction::Shl:
294  case Instruction::Add:
295  case Instruction::Mul:
296  return true;
297  case Instruction::Sub:
298  return I.hasNoUnsignedWrap();
299  default:
300  return false;
301  }
302 }
303 
304 bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
305  Type *Ty = I.getType();
306  const DataLayout &DL = Mod->getDataLayout();
307  int TySize = DL.getTypeSizeInBits(Ty);
308  Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);
309 
310  return I.isSimple() && TySize < 32 && Alignment >= 4 && DA->isUniform(&I);
311 }
312 
313 bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
314  assert(needsPromotionToI32(I.getType()) &&
315  "I does not need promotion to i32");
316 
317  if (I.getOpcode() == Instruction::SDiv ||
318  I.getOpcode() == Instruction::UDiv ||
319  I.getOpcode() == Instruction::SRem ||
320  I.getOpcode() == Instruction::URem)
321  return false;
322 
323  IRBuilder<> Builder(&I);
324  Builder.SetCurrentDebugLocation(I.getDebugLoc());
325 
326  Type *I32Ty = getI32Ty(Builder, I.getType());
327  Value *ExtOp0 = nullptr;
328  Value *ExtOp1 = nullptr;
329  Value *ExtRes = nullptr;
330  Value *TruncRes = nullptr;
331 
332  if (isSigned(I)) {
333  ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
334  ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
335  } else {
336  ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
337  ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
338  }
339 
340  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
341  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
342  if (promotedOpIsNSW(cast<Instruction>(I)))
343  Inst->setHasNoSignedWrap();
344 
345  if (promotedOpIsNUW(cast<Instruction>(I)))
346  Inst->setHasNoUnsignedWrap();
347 
348  if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
349  Inst->setIsExact(ExactOp->isExact());
350  }
351 
352  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
353 
354  I.replaceAllUsesWith(TruncRes);
355  I.eraseFromParent();
356 
357  return true;
358 }
359 
360 bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
361  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
362  "I does not need promotion to i32");
363 
364  IRBuilder<> Builder(&I);
365  Builder.SetCurrentDebugLocation(I.getDebugLoc());
366 
367  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
368  Value *ExtOp0 = nullptr;
369  Value *ExtOp1 = nullptr;
370  Value *NewICmp = nullptr;
371 
372  if (I.isSigned()) {
373  ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
374  ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
375  } else {
376  ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
377  ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
378  }
379  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);
380 
381  I.replaceAllUsesWith(NewICmp);
382  I.eraseFromParent();
383 
384  return true;
385 }
386 
387 bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
388  assert(needsPromotionToI32(I.getType()) &&
389  "I does not need promotion to i32");
390 
391  IRBuilder<> Builder(&I);
392  Builder.SetCurrentDebugLocation(I.getDebugLoc());
393 
394  Type *I32Ty = getI32Ty(Builder, I.getType());
395  Value *ExtOp1 = nullptr;
396  Value *ExtOp2 = nullptr;
397  Value *ExtRes = nullptr;
398  Value *TruncRes = nullptr;
399 
400  if (isSigned(I)) {
401  ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
402  ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
403  } else {
404  ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
405  ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
406  }
407  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
408  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
409 
410  I.replaceAllUsesWith(TruncRes);
411  I.eraseFromParent();
412 
413  return true;
414 }
415 
416 bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
417  IntrinsicInst &I) const {
418  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
419  "I must be bitreverse intrinsic");
420  assert(needsPromotionToI32(I.getType()) &&
421  "I does not need promotion to i32");
422 
423  IRBuilder<> Builder(&I);
424  Builder.SetCurrentDebugLocation(I.getDebugLoc());
425 
426  Type *I32Ty = getI32Ty(Builder, I.getType());
427  Function *I32 =
428  Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
429  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
430  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
431  Value *LShrOp =
432  Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
433  Value *TruncRes =
434  Builder.CreateTrunc(LShrOp, I.getType());
435 
436  I.replaceAllUsesWith(TruncRes);
437  I.eraseFromParent();
438 
439  return true;
440 }
441 
442 unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op,
443  unsigned ScalarSize) const {
444  KnownBits Known = computeKnownBits(Op, *DL, 0, AC);
445  return ScalarSize - Known.countMinLeadingZeros();
446 }
447 
448 unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op,
449  unsigned ScalarSize) const {
450  // In order for this to be a signed 24-bit value, bit 23 must
451  // be a sign bit.
452  return ScalarSize - ComputeNumSignBits(Op, *DL, 0, AC);
453 }
454 
455 bool AMDGPUCodeGenPrepare::isI24(Value *V, unsigned ScalarSize) const {
456  return ScalarSize >= 24 && // Types less than 24-bit should be treated
457  // as unsigned 24-bit values.
458  numBitsSigned(V, ScalarSize) < 24;
459 }
460 
461 bool AMDGPUCodeGenPrepare::isU24(Value *V, unsigned ScalarSize) const {
462  return numBitsUnsigned(V, ScalarSize) <= 24;
463 }
464 
465 static void extractValues(IRBuilder<> &Builder,
466  SmallVectorImpl<Value *> &Values, Value *V) {
467  auto *VT = dyn_cast<FixedVectorType>(V->getType());
468  if (!VT) {
469  Values.push_back(V);
470  return;
471  }
472 
473  for (int I = 0, E = VT->getNumElements(); I != E; ++I)
474  Values.push_back(Builder.CreateExtractElement(V, I));
475 }
476 
477 static Value *insertValues(IRBuilder<> &Builder,
478  Type *Ty,
479  SmallVectorImpl<Value *> &Values) {
480  if (Values.size() == 1)
481  return Values[0];
482 
483  Value *NewVal = UndefValue::get(Ty);
484  for (int I = 0, E = Values.size(); I != E; ++I)
485  NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);
486 
487  return NewVal;
488 }
489 
490 bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
491  if (I.getOpcode() != Instruction::Mul)
492  return false;
493 
494  Type *Ty = I.getType();
495  unsigned Size = Ty->getScalarSizeInBits();
496  if (Size <= 16 && ST->has16BitInsts())
497  return false;
498 
499  // Prefer scalar if this could be s_mul_i32
500  if (DA->isUniform(&I))
501  return false;
502 
503  Value *LHS = I.getOperand(0);
504  Value *RHS = I.getOperand(1);
505  IRBuilder<> Builder(&I);
506  Builder.SetCurrentDebugLocation(I.getDebugLoc());
507 
508  Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
509 
510  // TODO: Should this try to match mulhi24?
511  if (ST->hasMulU24() && isU24(LHS, Size) && isU24(RHS, Size)) {
512  IntrID = Intrinsic::amdgcn_mul_u24;
513  } else if (ST->hasMulI24() && isI24(LHS, Size) && isI24(RHS, Size)) {
514  IntrID = Intrinsic::amdgcn_mul_i24;
515  } else
516  return false;
517 
518  SmallVector<Value *, 4> LHSVals;
519  SmallVector<Value *, 4> RHSVals;
520  SmallVector<Value *, 4> ResultVals;
521  extractValues(Builder, LHSVals, LHS);
522  extractValues(Builder, RHSVals, RHS);
523 
524 
525  IntegerType *I32Ty = Builder.getInt32Ty();
526  FunctionCallee Intrin = Intrinsic::getDeclaration(Mod, IntrID);
527  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
528  Value *LHS, *RHS;
529  if (IntrID == Intrinsic::amdgcn_mul_u24) {
530  LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
531  RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
532  } else {
533  LHS = Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty);
534  RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty);
535  }
536 
537  Value *Result = Builder.CreateCall(Intrin, {LHS, RHS});
538 
539  if (IntrID == Intrinsic::amdgcn_mul_u24) {
540  ResultVals.push_back(Builder.CreateZExtOrTrunc(Result,
541  LHSVals[I]->getType()));
542  } else {
543  ResultVals.push_back(Builder.CreateSExtOrTrunc(Result,
544  LHSVals[I]->getType()));
545  }
546  }
547 
548  Value *NewVal = insertValues(Builder, Ty, ResultVals);
549  NewVal->takeName(&I);
550  I.replaceAllUsesWith(NewVal);
551  I.eraseFromParent();
552 
553  return true;
554 }
555 
556 // Find a select instruction, which may have been cast. This is mostly to deal
557 // with cases where i16 selects were promoted here to i32.
558 static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
559  Cast = nullptr;
560  if (SelectInst *Sel = dyn_cast<SelectInst>(V))
561  return Sel;
562 
563  if ((Cast = dyn_cast<CastInst>(V))) {
564  if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
565  return Sel;
566  }
567 
568  return nullptr;
569 }
570 
571 bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
572  // Don't do this unless the old select is going away. We want to eliminate the
573  // binary operator, not replace a binop with a select.
574  int SelOpNo = 0;
575 
576  CastInst *CastOp;
577 
578  // TODO: Should probably try to handle some cases with multiple
579  // users. Duplicating the select may be profitable for division.
580  SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
581  if (!Sel || !Sel->hasOneUse()) {
582  SelOpNo = 1;
583  Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
584  }
585 
586  if (!Sel || !Sel->hasOneUse())
587  return false;
588 
589  Constant *CT = dyn_cast<Constant>(Sel->getTrueValue());
590  Constant *CF = dyn_cast<Constant>(Sel->getFalseValue());
591  Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
592  if (!CBO || !CT || !CF)
593  return false;
594 
595  if (CastOp) {
596  if (!CastOp->hasOneUse())
597  return false;
598  CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), *DL);
599  CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), *DL);
600  }
601 
602  // TODO: Handle special 0/-1 cases DAG combine does, although we only really
603  // need to handle divisions here.
604  Constant *FoldedT = SelOpNo ?
605  ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, *DL) :
606  ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, *DL);
607  if (isa<ConstantExpr>(FoldedT))
608  return false;
609 
610  Constant *FoldedF = SelOpNo ?
611  ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, *DL) :
612  ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, *DL);
613  if (isa<ConstantExpr>(FoldedF))
614  return false;
615 
616  IRBuilder<> Builder(&BO);
617  Builder.SetCurrentDebugLocation(BO.getDebugLoc());
618  if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
619  Builder.setFastMathFlags(FPOp->getFastMathFlags());
620 
621  Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
622  FoldedT, FoldedF);
623  NewSelect->takeName(&BO);
624  BO.replaceAllUsesWith(NewSelect);
625  BO.eraseFromParent();
626  if (CastOp)
627  CastOp->eraseFromParent();
628  Sel->eraseFromParent();
629  return true;
630 }
631 
632 // Optimize fdiv with rcp:
633 //
634 // 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
635 // allowed with unsafe-fp-math or afn.
636 //
637 // a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
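// For illustration, with afn (or unsafe-fp-math) on an f32 division %x / %y
// (hypothetical names), the a/b case becomes roughly:
//   %rcp = call float @llvm.amdgcn.rcp.f32(float %y)
//   %div = fmul float %x, %rcp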
638 static Value *optimizeWithRcp(Value *Num, Value *Den, bool AllowInaccurateRcp,
639  bool RcpIsAccurate, IRBuilder<> &Builder,
640  Module *Mod) {
641 
642  if (!AllowInaccurateRcp && !RcpIsAccurate)
643  return nullptr;
644 
645  Type *Ty = Den->getType();
646  if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
647  if (AllowInaccurateRcp || RcpIsAccurate) {
648  if (CLHS->isExactlyValue(1.0)) {
649  Function *Decl = Intrinsic::getDeclaration(
650  Mod, Intrinsic::amdgcn_rcp, Ty);
651 
652  // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
653  // the CI documentation have a worst case error of 1 ulp.
654  // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
655  // use it as long as we aren't trying to use denormals.
656  //
657  // v_rcp_f16 and v_rsq_f16 DO support denormals.
658 
659  // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
660  // insert rsq intrinsic here.
661 
662  // 1.0 / x -> rcp(x)
663  return Builder.CreateCall(Decl, { Den });
664  }
665 
666  // Same as for 1.0, but expand the sign out of the constant.
667  if (CLHS->isExactlyValue(-1.0)) {
669  Mod, Intrinsic::amdgcn_rcp, Ty);
670 
671  // -1.0 / x -> rcp (fneg x)
672  Value *FNeg = Builder.CreateFNeg(Den);
673  return Builder.CreateCall(Decl, { FNeg });
674  }
675  }
676  }
677 
678  if (AllowInaccurateRcp) {
679  Function *Decl = Intrinsic::getDeclaration(
680  Mod, Intrinsic::amdgcn_rcp, Ty);
681 
682  // Turn into multiply by the reciprocal.
683  // x / y -> x * (1.0 / y)
684  Value *Recip = Builder.CreateCall(Decl, { Den });
685  return Builder.CreateFMul(Num, Recip);
686  }
687  return nullptr;
688 }
689 
690 // optimize with fdiv.fast:
691 //
692 // a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
693 //
694 // 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
695 //
696 // NOTE: optimizeWithRcp should be tried first because rcp is the preference.
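// For illustration, a qualifying f32 division %a / %b (hypothetical names)
// becomes roughly:
//   %r = call float @llvm.amdgcn.fdiv.fast(float %a, float %b)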
697 static Value *optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy,
698  bool HasDenormals, IRBuilder<> &Builder,
699  Module *Mod) {
700  // fdiv.fast can achieve 2.5 ULP accuracy.
701  if (ReqdAccuracy < 2.5f)
702  return nullptr;
703 
704  // Only have fdiv.fast for f32.
705  Type *Ty = Den->getType();
706  if (!Ty->isFloatTy())
707  return nullptr;
708 
709  bool NumIsOne = false;
710  if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
711  if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
712  NumIsOne = true;
713  }
714 
715  // fdiv.fast does not support denormals, but it is always fine to use for 1.0/x.
716  if (HasDenormals && !NumIsOne)
717  return nullptr;
718 
719  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
720  return Builder.CreateCall(Decl, { Num, Den });
721 }
722 
723 // Optimization is performed based on fpmath, fast-math flags, and denormals to
724 // rewrite fdiv with either rcp or fdiv.fast.
725 //
726 // With rcp:
727 // 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
728 // allowed with unsafe-fp-math or afn.
729 //
730 // a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
731 //
732 // With fdiv.fast:
733 // a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
734 //
735 // 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
736 //
737 // NOTE: rcp is the preference in cases that both are legal.
738 bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
739 
740  Type *Ty = FDiv.getType()->getScalarType();
741 
742  // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
743  // expansion around them in codegen.
744  if (Ty->isDoubleTy())
745  return false;
746 
747  // No intrinsic for fdiv16 if target does not support f16.
748  if (Ty->isHalfTy() && !ST->has16BitInsts())
749  return false;
750 
751  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
752  const float ReqdAccuracy = FPOp->getFPAccuracy();
753 
754  // Inaccurate rcp is allowed with unsafe-fp-math or afn.
755  FastMathFlags FMF = FPOp->getFastMathFlags();
756  const bool AllowInaccurateRcp = HasUnsafeFPMath || FMF.approxFunc();
757 
758  // rcp_f16 is accurate for !fpmath >= 1.0ulp.
759  // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
760  // rcp_f64 is never accurate.
761  const bool RcpIsAccurate = (Ty->isHalfTy() && ReqdAccuracy >= 1.0f) ||
762  (Ty->isFloatTy() && !HasFP32Denormals && ReqdAccuracy >= 1.0f);
763 
764  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
765  Builder.setFastMathFlags(FMF);
766  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
767 
768  Value *Num = FDiv.getOperand(0);
769  Value *Den = FDiv.getOperand(1);
770 
771  Value *NewFDiv = nullptr;
772  if (auto *VT = dyn_cast<FixedVectorType>(FDiv.getType())) {
773  NewFDiv = UndefValue::get(VT);
774 
775  // FIXME: Doesn't do the right thing for cases where the vector is partially
776  // constant. This works when the scalarizer pass is run first.
777  for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
778  Value *NumEltI = Builder.CreateExtractElement(Num, I);
779  Value *DenEltI = Builder.CreateExtractElement(Den, I);
780  // Try rcp first.
781  Value *NewElt = optimizeWithRcp(NumEltI, DenEltI, AllowInaccurateRcp,
782  RcpIsAccurate, Builder, Mod);
783  if (!NewElt) // Try fdiv.fast.
784  NewElt = optimizeWithFDivFast(NumEltI, DenEltI, ReqdAccuracy,
785  HasFP32Denormals, Builder, Mod);
786  if (!NewElt) // Keep the original.
787  NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
788 
789  NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
790  }
791  } else { // Scalar FDiv.
792  // Try rcp first.
793  NewFDiv = optimizeWithRcp(Num, Den, AllowInaccurateRcp, RcpIsAccurate,
794  Builder, Mod);
795  if (!NewFDiv) { // Try fdiv.fast.
796  NewFDiv = optimizeWithFDivFast(Num, Den, ReqdAccuracy, HasFP32Denormals,
797  Builder, Mod);
798  }
799  }
800 
801  if (NewFDiv) {
802  FDiv.replaceAllUsesWith(NewFDiv);
803  NewFDiv->takeName(&FDiv);
804  FDiv.eraseFromParent();
805  }
806 
807  return !!NewFDiv;
808 }
809 
810 static bool hasUnsafeFPMath(const Function &F) {
811  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
812  return Attr.getValueAsBool();
813 }
814 
815 static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
816  Value *LHS, Value *RHS) {
817  Type *I32Ty = Builder.getInt32Ty();
818  Type *I64Ty = Builder.getInt64Ty();
819 
820  Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
821  Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
822  Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
823  Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
824  Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
825  Hi = Builder.CreateTrunc(Hi, I32Ty);
826  return std::make_pair(Lo, Hi);
827 }
828 
829 static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
830  return getMul64(Builder, LHS, RHS).second;
831 }
832 
833 /// Figure out how many bits are really needed for this division. \p AtLeast is
834 /// an optimization hint to bypass the second ComputeNumSignBits call if the
835 /// first one is insufficient. Returns -1 on failure.
836 int AMDGPUCodeGenPrepare::getDivNumBits(BinaryOperator &I,
837  Value *Num, Value *Den,
838  unsigned AtLeast, bool IsSigned) const {
839  const DataLayout &DL = Mod->getDataLayout();
840  unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
841  if (LHSSignBits < AtLeast)
842  return -1;
843 
844  unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
845  if (RHSSignBits < AtLeast)
846  return -1;
847 
848  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
849  unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits;
850  if (IsSigned)
851  ++DivBits;
852  return DivBits;
853 }
854 
855 // The fractional part of a float is enough to accurately represent up to
856 // a 24-bit signed integer.
857 Value *AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
858  BinaryOperator &I,
859  Value *Num, Value *Den,
860  bool IsDiv, bool IsSigned) const {
861  int DivBits = getDivNumBits(I, Num, Den, 9, IsSigned);
862  if (DivBits == -1)
863  return nullptr;
864  return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
865 }
866 
867 Value *AMDGPUCodeGenPrepare::expandDivRem24Impl(IRBuilder<> &Builder,
868  BinaryOperator &I,
869  Value *Num, Value *Den,
870  unsigned DivBits,
871  bool IsDiv, bool IsSigned) const {
872  Type *I32Ty = Builder.getInt32Ty();
873  Num = Builder.CreateTrunc(Num, I32Ty);
874  Den = Builder.CreateTrunc(Den, I32Ty);
875 
876  Type *F32Ty = Builder.getFloatTy();
877  ConstantInt *One = Builder.getInt32(1);
878  Value *JQ = One;
879 
880  if (IsSigned) {
881  // char|short jq = ia ^ ib;
882  JQ = Builder.CreateXor(Num, Den);
883 
884  // jq = jq >> (bitsize - 2)
885  JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));
886 
887  // jq = jq | 0x1
888  JQ = Builder.CreateOr(JQ, One);
889  }
890 
891  // int ia = (int)LHS;
892  Value *IA = Num;
893 
894  // int ib = (int)RHS;
895  Value *IB = Den;
896 
897  // float fa = (float)ia;
898  Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
899  : Builder.CreateUIToFP(IA, F32Ty);
900 
901  // float fb = (float)ib;
902  Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty)
903  : Builder.CreateUIToFP(IB,F32Ty);
904 
905  Function *RcpDecl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp,
906  Builder.getFloatTy());
907  Value *RCP = Builder.CreateCall(RcpDecl, { FB });
908  Value *FQM = Builder.CreateFMul(FA, RCP);
909 
910  // fq = trunc(fqm);
911  CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
912  FQ->copyFastMathFlags(Builder.getFastMathFlags());
913 
914  // float fqneg = -fq;
915  Value *FQNeg = Builder.CreateFNeg(FQ);
916 
917  // float fr = mad(fqneg, fb, fa);
918  auto FMAD = !ST->hasMadMacF32Insts()
919  ? Intrinsic::fma
920  : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
921  Value *FR = Builder.CreateIntrinsic(FMAD,
922  {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);
923 
924  // int iq = (int)fq;
925  Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
926  : Builder.CreateFPToUI(FQ, I32Ty);
927 
928  // fr = fabs(fr);
929  FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);
930 
931  // fb = fabs(fb);
932  FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);
933 
934  // int cv = fr >= fb;
935  Value *CV = Builder.CreateFCmpOGE(FR, FB);
936 
937  // jq = (cv ? jq : 0);
938  JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));
939 
940  // dst = iq + jq;
941  Value *Div = Builder.CreateAdd(IQ, JQ);
942 
943  Value *Res = Div;
944  if (!IsDiv) {
945  // Rem needs compensation, it's easier to recompute it
946  Value *Rem = Builder.CreateMul(Div, Den);
947  Res = Builder.CreateSub(Num, Rem);
948  }
949 
950  if (DivBits != 0 && DivBits < 32) {
951  // Extend in register from the number of bits this divide really is.
952  if (IsSigned) {
953  int InRegBits = 32 - DivBits;
954 
955  Res = Builder.CreateShl(Res, InRegBits);
956  Res = Builder.CreateAShr(Res, InRegBits);
957  } else {
958  ConstantInt *TruncMask
959  = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
960  Res = Builder.CreateAnd(Res, TruncMask);
961  }
962  }
963 
964  return Res;
965 }
966 
967 // Try to recognize special cases where the DAG will emit better expansions
968 // than the general expansion we do here.
969 
970 // TODO: It would be better to just directly handle those optimizations here.
971 bool AMDGPUCodeGenPrepare::divHasSpecialOptimization(
972  BinaryOperator &I, Value *Num, Value *Den) const {
973  if (Constant *C = dyn_cast<Constant>(Den)) {
974  // Arbitrary constants get a better expansion as long as a wider mulhi is
975  // legal.
976  if (C->getType()->getScalarSizeInBits() <= 32)
977  return true;
978 
979  // TODO: Sdiv check for not exact for some reason.
980 
981  // If there's no wider mulhi, there's only a better expansion for powers of
982  // two.
983  // TODO: Should really know for each vector element.
984  if (isKnownToBeAPowerOfTwo(C, *DL, true, 0, AC, &I, DT))
985  return true;
986 
987  return false;
988  }
989 
990  if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
991  // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
992  if (BinOpDen->getOpcode() == Instruction::Shl &&
993  isa<Constant>(BinOpDen->getOperand(0)) &&
994  isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), *DL, true,
995  0, AC, &I, DT)) {
996  return true;
997  }
998  }
999 
1000  return false;
1001 }
1002 
1003 static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout *DL) {
1004  // Check whether the sign can be determined statically.
1005  KnownBits Known = computeKnownBits(V, *DL);
1006  if (Known.isNegative())
1007  return Constant::getAllOnesValue(V->getType());
1008  if (Known.isNonNegative())
1009  return Constant::getNullValue(V->getType());
1010  return Builder.CreateAShr(V, Builder.getInt32(31));
1011 }
1012 
1013 Value *AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
1014  BinaryOperator &I, Value *X,
1015  Value *Y) const {
1016  Instruction::BinaryOps Opc = I.getOpcode();
1017  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
1018  Opc == Instruction::SRem || Opc == Instruction::SDiv);
1019 
1020  FastMathFlags FMF;
1021  FMF.setFast();
1022  Builder.setFastMathFlags(FMF);
1023 
1024  if (divHasSpecialOptimization(I, X, Y))
1025  return nullptr; // Keep it for later optimization.
1026 
1027  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
1028  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
1029 
1030  Type *Ty = X->getType();
1031  Type *I32Ty = Builder.getInt32Ty();
1032  Type *F32Ty = Builder.getFloatTy();
1033 
1034  if (Ty->getScalarSizeInBits() < 32) {
1035  if (IsSigned) {
1036  X = Builder.CreateSExt(X, I32Ty);
1037  Y = Builder.CreateSExt(Y, I32Ty);
1038  } else {
1039  X = Builder.CreateZExt(X, I32Ty);
1040  Y = Builder.CreateZExt(Y, I32Ty);
1041  }
1042  }
1043 
1044  if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
1045  return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) :
1046  Builder.CreateZExtOrTrunc(Res, Ty);
1047  }
1048 
1049  ConstantInt *Zero = Builder.getInt32(0);
1050  ConstantInt *One = Builder.getInt32(1);
1051 
1052  Value *Sign = nullptr;
1053  if (IsSigned) {
1054  Value *SignX = getSign32(X, Builder, DL);
1055  Value *SignY = getSign32(Y, Builder, DL);
1056  // Remainder sign is the same as LHS
1057  Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;
1058 
1059  X = Builder.CreateAdd(X, SignX);
1060  Y = Builder.CreateAdd(Y, SignY);
1061 
1062  X = Builder.CreateXor(X, SignX);
1063  Y = Builder.CreateXor(Y, SignY);
1064  }
1065 
1066  // The algorithm here is based on ideas from "Software Integer Division", Tom
1067  // Rodeheffer, August 2008.
1068  //
1069  // unsigned udiv(unsigned x, unsigned y) {
1070  // // Initial estimate of inv(y). The constant is less than 2^32 to ensure
1071  // // that this is a lower bound on inv(y), even if some of the calculations
1072  // // round up.
1073  // unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
1074  //
1075  // // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
1076  // // Empirically this is guaranteed to give a "two-y" lower bound on
1077  // // inv(y).
1078  // z += umulh(z, -y * z);
1079  //
1080  // // Quotient/remainder estimate.
1081  // unsigned q = umulh(x, z);
1082  // unsigned r = x - q * y;
1083  //
1084  // // Two rounds of quotient/remainder refinement.
1085  // if (r >= y) {
1086  // ++q;
1087  // r -= y;
1088  // }
1089  // if (r >= y) {
1090  // ++q;
1091  // r -= y;
1092  // }
1093  //
1094  // return q;
1095  // }
1096 
1097  // Initial estimate of inv(y).
1098  Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
1099  Function *Rcp = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, F32Ty);
1100  Value *RcpY = Builder.CreateCall(Rcp, {FloatY});
1101  Constant *Scale = ConstantFP::get(F32Ty, BitsToFloat(0x4F7FFFFE));
1102  Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
1103  Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);
1104 
1105  // One round of UNR.
1106  Value *NegY = Builder.CreateSub(Zero, Y);
1107  Value *NegYZ = Builder.CreateMul(NegY, Z);
1108  Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ));
1109 
1110  // Quotient/remainder estimate.
1111  Value *Q = getMulHu(Builder, X, Z);
1112  Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y));
1113 
1114  // First quotient/remainder refinement.
1115  Value *Cond = Builder.CreateICmpUGE(R, Y);
1116  if (IsDiv)
1117  Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1118  R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1119 
1120  // Second quotient/remainder refinement.
1121  Cond = Builder.CreateICmpUGE(R, Y);
1122  Value *Res;
1123  if (IsDiv)
1124  Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1125  else
1126  Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1127 
1128  if (IsSigned) {
1129  Res = Builder.CreateXor(Res, Sign);
1130  Res = Builder.CreateSub(Res, Sign);
1131  }
1132 
1133  Res = Builder.CreateTrunc(Res, Ty);
1134 
1135  return Res;
1136 }
1137 
1138 Value *AMDGPUCodeGenPrepare::shrinkDivRem64(IRBuilder<> &Builder,
1139  BinaryOperator &I,
1140  Value *Num, Value *Den) const {
1141  if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
1142  return nullptr; // Keep it for later optimization.
1143 
1144  Instruction::BinaryOps Opc = I.getOpcode();
1145 
1146  bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
1147  bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
1148 
1149  int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
1150  if (NumDivBits == -1)
1151  return nullptr;
1152 
1153  Value *Narrowed = nullptr;
1154  if (NumDivBits <= 24) {
1155  Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
1156  IsDiv, IsSigned);
1157  } else if (NumDivBits <= 32) {
1158  Narrowed = expandDivRem32(Builder, I, Num, Den);
1159  }
1160 
1161  if (Narrowed) {
1162  return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
1163  Builder.CreateZExt(Narrowed, Num->getType());
1164  }
1165 
1166  return nullptr;
1167 }
1168 
1169 void AMDGPUCodeGenPrepare::expandDivRem64(BinaryOperator &I) const {
1170  Instruction::BinaryOps Opc = I.getOpcode();
1171  // Do the general expansion.
1172  if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
1173  expandDivisionUpTo64Bits(&I);
1174  return;
1175  }
1176 
1177  if (Opc == Instruction::URem || Opc == Instruction::SRem) {
1178  expandRemainderUpTo64Bits(&I);
1179  return;
1180  }
1181 
1182  llvm_unreachable("not a division");
1183 }
1184 
1185 bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
1186  if (foldBinOpIntoSelect(I))
1187  return true;
1188 
1189  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
1190  DA->isUniform(&I) && promoteUniformOpToI32(I))
1191  return true;
1192 
1193  if (UseMul24Intrin && replaceMulWithMul24(I))
1194  return true;
1195 
1196  bool Changed = false;
1197  Instruction::BinaryOps Opc = I.getOpcode();
1198  Type *Ty = I.getType();
1199  Value *NewDiv = nullptr;
1200  unsigned ScalarSize = Ty->getScalarSizeInBits();
1201 
1202  SmallVector<BinaryOperator *, 8> Div64ToExpand;
1203 
1204  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
1205  Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
1206  ScalarSize <= 64 &&
1207  !DisableIDivExpand) {
1208  Value *Num = I.getOperand(0);
1209  Value *Den = I.getOperand(1);
1210  IRBuilder<> Builder(&I);
1211  Builder.SetCurrentDebugLocation(I.getDebugLoc());
1212 
1213  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1214  NewDiv = UndefValue::get(VT);
1215 
1216  for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
1217  Value *NumEltN = Builder.CreateExtractElement(Num, N);
1218  Value *DenEltN = Builder.CreateExtractElement(Den, N);
1219 
1220  Value *NewElt;
1221  if (ScalarSize <= 32) {
1222  NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
1223  if (!NewElt)
1224  NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1225  } else {
1226  // See if this 64-bit division can be shrunk to 32/24-bits before
1227  // producing the general expansion.
1228  NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
1229  if (!NewElt) {
1230  // The general 64-bit expansion introduces control flow and doesn't
1231  // return the new value. Just insert a scalar copy and defer
1232  // expanding it.
1233  NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1234  Div64ToExpand.push_back(cast<BinaryOperator>(NewElt));
1235  }
1236  }
1237 
1238  NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
1239  }
1240  } else {
1241  if (ScalarSize <= 32)
1242  NewDiv = expandDivRem32(Builder, I, Num, Den);
1243  else {
1244  NewDiv = shrinkDivRem64(Builder, I, Num, Den);
1245  if (!NewDiv)
1246  Div64ToExpand.push_back(&I);
1247  }
1248  }
1249 
1250  if (NewDiv) {
1251  I.replaceAllUsesWith(NewDiv);
1252  I.eraseFromParent();
1253  Changed = true;
1254  }
1255  }
1256 
1257  if (ExpandDiv64InIR) {
1258  // TODO: We get much worse code in specially handled constant cases.
1259  for (BinaryOperator *Div : Div64ToExpand) {
1260  expandDivRem64(*Div);
1261  Changed = true;
1262  }
1263  }
1264 
1265  return Changed;
1266 }
1267 
1268 bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
1269  if (!WidenLoads)
1270  return false;
1271 
1272  if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
1273  I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1274  canWidenScalarExtLoad(I)) {
1275  IRBuilder<> Builder(&I);
1276  Builder.SetCurrentDebugLocation(I.getDebugLoc());
1277 
1278  Type *I32Ty = Builder.getInt32Ty();
1279  Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
1280  Value *BitCast= Builder.CreateBitCast(I.getPointerOperand(), PT);
1281  LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, BitCast);
1282  WidenLoad->copyMetadata(I);
1283 
1284  // If we have range metadata, we need to convert the type, and not make
1285  // assumptions about the high bits.
1286  if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
1287  ConstantInt *Lower =
1288  mdconst::extract<ConstantInt>(Range->getOperand(0));
1289 
1290  if (Lower->getValue().isNullValue()) {
1291  WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
1292  } else {
1293  Metadata *LowAndHigh[] = {
1294  ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
1295  // Don't make assumptions about the high bits.
1296  ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
1297  };
1298 
1299  WidenLoad->setMetadata(LLVMContext::MD_range,
1300  MDNode::get(Mod->getContext(), LowAndHigh));
1301  }
1302  }
1303 
1304  int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
1305  Type *IntNTy = Builder.getIntNTy(TySize);
1306  Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
1307  Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
1308  I.replaceAllUsesWith(ValOrig);
1309  I.eraseFromParent();
1310  return true;
1311  }
1312 
1313  return false;
1314 }
1315 
1316 bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
1317  bool Changed = false;
1318 
1319  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
1320  DA->isUniform(&I))
1321  Changed |= promoteUniformOpToI32(I);
1322 
1323  return Changed;
1324 }
1325 
1326 bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
1327  bool Changed = false;
1328 
1329  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
1330  DA->isUniform(&I))
1331  Changed |= promoteUniformOpToI32(I);
1332 
1333  return Changed;
1334 }
1335 
1336 bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
1337  switch (I.getIntrinsicID()) {
1338  case Intrinsic::bitreverse:
1339  return visitBitreverseIntrinsicInst(I);
1340  default:
1341  return false;
1342  }
1343 }
1344 
1345 bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
1346  bool Changed = false;
1347 
1348  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
1349  DA->isUniform(&I))
1350  Changed |= promoteUniformBitreverseToI32(I);
1351 
1352  return Changed;
1353 }
1354 
1355 bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
1356  Mod = &M;
1357  DL = &Mod->getDataLayout();
1358  return false;
1359 }
1360 
1361 bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
1362  if (skipFunction(F))
1363  return false;
1364 
1365  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
1366  if (!TPC)
1367  return false;
1368 
1369  const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
1370  ST = &TM.getSubtarget<GCNSubtarget>(F);
1371  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1372  DA = &getAnalysis<LegacyDivergenceAnalysis>();
1373 
1374  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
1375  DT = DTWP ? &DTWP->getDomTree() : nullptr;
1376 
1377  HasUnsafeFPMath = hasUnsafeFPMath(F);
1378 
1379  AMDGPU::SIModeRegisterDefaults Mode(F);
1380  HasFP32Denormals = Mode.allFP32Denormals();
1381 
1382  bool MadeChange = false;
1383 
1384  Function::iterator NextBB;
1385  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
1386  BasicBlock *BB = &*FI;
1387  NextBB = std::next(FI);
1388 
1389  BasicBlock::iterator Next;
1390  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; I = Next) {
1391  Next = std::next(I);
1392 
1393  MadeChange |= visit(*I);
1394 
1395  if (Next != E) { // Control flow changed
1396  BasicBlock *NextInstBB = Next->getParent();
1397  if (NextInstBB != BB) {
1398  BB = NextInstBB;
1399  E = BB->end();
1400  FE = F.end();
1401  }
1402  }
1403  }
1404  }
1405 
1406  return MadeChange;
1407 }
1408 
1409 INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
1410  "AMDGPU IR optimizations", false, false)
1411 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
1412 INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
1413 INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
1414  false, false)
1415 
1416 char AMDGPUCodeGenPrepare::ID = 0;
1417 
1418 FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
1419  return new AMDGPUCodeGenPrepare();
1420 }
llvm::Check::Size
@ Size
Definition: FileCheck.h:73
AssumptionCache.h
Signed
@ Signed
Definition: NVPTXISelLowering.cpp:4543
llvm
Definition: AllocatorList.h:23
llvm::wasm::ValType::I32
@ I32
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::SystemZISD::TM
@ TM
Definition: SystemZISelLowering.h:65
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:112
llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:447
llvm::M68kBeads::DA
@ DA
Definition: M68kBaseInfo.h:59
llvm::Intrinsic::getDeclaration
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1295
llvm::BasicBlock::iterator
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:90
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:107
llvm::Function
Definition: Function.h:61
llvm::Attribute
Definition: Attributes.h:52
isI24
static bool isI24(SDValue Op, SelectionDAG &DAG)
Definition: AMDGPUISelLowering.cpp:2783
Pass.h
getMul64
static std::pair< Value *, Value * > getMul64(IRBuilder<> &Builder, Value *LHS, Value *RHS)
Definition: AMDGPUCodeGenPrepare.cpp:815
llvm::PointerType::get
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Definition: Type.cpp:693
llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:317
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1168
llvm::MipsISD::Lo
@ Lo
Definition: MipsISelLowering.h:79
llvm::IRBuilder<>
ValueTracking.h
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:151
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:46
extractValues
static void extractValues(IRBuilder<> &Builder, SmallVectorImpl< Value * > &Values, Value *V)
Definition: AMDGPUCodeGenPrepare.cpp:465
llvm::AMDGPU::SIModeRegisterDefaults
Definition: AMDGPUBaseInfo.h:885
T
#define T
Definition: Mips16ISelLowering.cpp:341
llvm::ConstantAsMetadata::get
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:419
llvm::ComputeNumSignBits
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
Definition: ValueTracking.cpp:383
llvm::SelectInst::getFalseValue
const Value * getFalseValue() const
Definition: Instructions.h:1764
llvm::GCNSubtarget
Definition: GCNSubtarget.h:38
llvm::Instruction::copyMetadata
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Definition: Instruction.cpp:800
llvm::Intrinsic::not_intrinsic
@ not_intrinsic
Definition: Intrinsics.h:45
llvm::FastMathFlags
Convenience struct for specifying and reasoning about fast-math flags.
Definition: Operator.h:160
ConstantFolding.h
llvm::cl::ReallyHidden
@ ReallyHidden
Definition: CommandLine.h:141
llvm::Attribute::getValueAsBool
bool getValueAsBool() const
Return the attribute's value as a boolean.
Definition: Attributes.cpp:290
optimizeWithFDivFast
static Value * optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy, bool HasDenormals, IRBuilder<> &Builder, Module *Mod)
Definition: AMDGPUCodeGenPrepare.cpp:697
INITIALIZE_PASS_END
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
Definition: RegBankSelect.cpp:69
llvm::MDNode::get
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1198
F
#define F(x, y, z)
Definition: MD5.cpp:56
llvm::RISCVFenceField::R
@ R
Definition: RISCVBaseInfo.h:129
llvm::Instruction::setMetadata
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1330
KnownBits.h
insertValues
static Value * insertValues(IRBuilder<> &Builder, Type *Ty, SmallVectorImpl< Value * > &Values)
Definition: AMDGPUCodeGenPrepare.cpp:477
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
llvm::MipsISD::Hi
@ Hi
Definition: MipsISelLowering.h:75
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:77
llvm::KnownBits::isNonNegative
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:99
llvm::AMDGPUAS::CONSTANT_ADDRESS_32BIT
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
Definition: AMDGPU.h:380
DEBUG_TYPE
#define DEBUG_TYPE
Definition: AMDGPUCodeGenPrepare.cpp:30
llvm::AMDGPUTargetMachine
Definition: AMDGPUTargetMachine.h:27
llvm::SelectInst::getCondition
const Value * getCondition() const
Definition: Instructions.h:1762
llvm::FastMathFlags::approxFunc
bool approxFunc() const
Definition: Operator.h:211
f
Itanium Name Demangler i e convert the string _Z1fv into f()". You can also use the CRTP base ManglingParser to perform some simple analysis on the mangled name
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
llvm::ConstantFoldBinaryOpOperands
Constant * ConstantFoldBinaryOpOperands(unsigned Opcode, Constant *LHS, Constant *RHS, const DataLayout &DL)
Attempt to constant fold a binary operation with the specified operands.
Definition: ConstantFolding.cpp:1340
llvm::ConstantFoldCastOperand
Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
Definition: ConstantFolding.cpp:1351
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
Y
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
getOpcode
static Optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:199
llvm::ms_demangle::QualifierMangleMode::Result
@ Result
false
Definition: StackSlotColoring.cpp:142
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
llvm::IntegerType
Class to represent integer types.
Definition: DerivedTypes.h:40
optimizeWithRcp
static Value * optimizeWithRcp(Value *Num, Value *Den, bool AllowInaccurateRcp, bool RcpIsAccurate, IRBuilder<> &Builder, Module *Mod)
Definition: AMDGPUCodeGenPrepare.cpp:638
llvm::Constant::getAllOnesValue
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:405
llvm::BinaryOperator::getOpcode
BinaryOps getOpcode() const
Definition: InstrTypes.h:395
llvm::Instruction
Definition: Instruction.h:45
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:154
llvm::ConstantFP
ConstantFP - Floating Point Values [float, double].
Definition: Constants.h:255
llvm::UndefValue::get
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1770
llvm::ConstantInt::get
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:885
llvm::LegacyDivergenceAnalysis
Definition: LegacyDivergenceAnalysis.h:31
IR
Statically lint checks LLVM IR
Definition: Lint.cpp:744
optimizations
AMDGPU IR optimizations
Definition: AMDGPUCodeGenPrepare.cpp:1413
llvm::KnownBits::isNegative
bool isNegative() const
Returns true if this value is known to be negative.
Definition: KnownBits.h:96
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:650
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::Metadata
Root of the metadata hierarchy.
Definition: Metadata.h:62
hasUnsafeFPMath
static bool hasUnsafeFPMath(const Function &F)
Definition: AMDGPUCodeGenPrepare.cpp:810
llvm::Instruction::getMetadata
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:277
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:391
llvm::cl::opt< bool >
llvm::Constant
This is an important base class in LLVM.
Definition: Constant.h:41
llvm::Instruction::eraseFromParent
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:78
llvm::ISD::FMAD
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:464
llvm::ICmpInst
This instruction compares its operands according to the predicate given to the constructor.
Definition: Instructions.h:1178
llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
promotedOpIsNUW
static bool promotedOpIsNUW(const Instruction &I)
Definition: AMDGPUCodeGenPrepare.cpp:291
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:440
llvm::SelectInst::getTrueValue
const Value * getTrueValue() const
Definition: Instructions.h:1763
TargetPassConfig.h
llvm::Type::isHalfTy
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
llvm::computeKnownBits
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, OptimizationRemarkEmitter *ORE=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne bit sets.
Definition: ValueTracking.cpp:211
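computeKnownBits together with KnownBits::countMinLeadingZeros is the usual way to prove that the high bits of a value are zero, which is the kind of question a 24-bit-multiply rewrite has to answer. A hedged sketch, not the pass's own helper; fitsInU24 and its exact bounds are illustrative.

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/KnownBits.h"

using namespace llvm;

// True when the known bits of V prove that everything above bit 23 is zero,
// i.e. V fits in an unsigned 24-bit quantity.
static bool fitsInU24(const Value *V, const DataLayout &DL) {
  unsigned Bits = V->getType()->getScalarSizeInBits();
  if (Bits == 0 || Bits > 32)
    return false;
  KnownBits Known(Bits);
  computeKnownBits(V, Known, DL);
  return Bits <= 24 || Known.countMinLeadingZeros() >= Bits - 24;
}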
llvm::FPMathOperator
Utility class for floating point operations which can have information about relaxed accuracy requirements.
Definition: Operator.h:249
llvm::KnownBits::countMinLeadingZeros
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:229
llvm::SelectInst
This class represents the LLVM 'select' instruction.
Definition: Instructions.h:1715
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
llvm::AssumptionCacheTracker
An immutable pass that tracks lazily created AssumptionCache objects.
Definition: AssumptionCache.h:200
llvm::BinaryOperator
Definition: InstrTypes.h:190
llvm::expandDivisionUpTo64Bits
bool expandDivisionUpTo64Bits(BinaryOperator *Div)
Generate code to divide two integers, replacing Div with the generated code.
Definition: IntegerDivision.cpp:631
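expandDivisionUpTo64Bits rewrites a udiv/sdiv instruction in place into an inline bit-by-bit expansion. A minimal call-site sketch, with the wrapper name expandDivSketch invented for illustration; expandRemainderUpTo64Bits (listed below) is the matching helper for urem/srem.

#include "llvm/IR/InstrTypes.h"
#include "llvm/Transforms/Utils/IntegerDivision.h"
#include <cassert>

using namespace llvm;

// Rewrite an integer division of at most 64 bits into an inline
// shift-and-subtract expansion; Div itself is replaced by the generated code.
static bool expandDivSketch(BinaryOperator *Div) {
  assert(Div->getOpcode() == Instruction::SDiv ||
         Div->getOpcode() == Instruction::UDiv);
  return expandDivisionUpTo64Bits(Div);
}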
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:57
AMDGPU.h
InstVisitor.h
llvm::AssumptionCache
A cache of @llvm.assume calls within a function.
Definition: AssumptionCache.h:41
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:136
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
llvm::Value::replaceAllUsesWith
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:526
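replaceAllUsesWith is almost always paired with Value::takeName and Instruction::eraseFromParent when one instruction is rewritten into another. This is a generic IR idiom rather than code quoted from this file; replaceAndErase is an illustrative name.

#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"

using namespace llvm;

// Forward every use of the old instruction to the replacement value, keep the
// old name for readable IR, then delete the original instruction.
static void replaceAndErase(Instruction *Old, Value *New) {
  New->takeName(Old);
  Old->replaceAllUsesWith(New);
  Old->eraseFromParent();
}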
llvm::ModRefInfo::Mod
@ Mod
The access may modify the value stored in memory.
llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition: ilist_node.h:81
llvm::InstVisitor
Base class for instruction visitors.
Definition: InstVisitor.h:79
llvm::FastMathFlags::setFast
void setFast(bool B=true)
Definition: Operator.h:237
llvm::Instruction::copyFastMathFlags
void copyFastMathFlags(FastMathFlags FMF)
Convenience function for transferring all fast-math flag values to this instruction, which must be an operator which supports these flags.
Definition: Instruction.cpp:214
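A common pattern is to read the flags off an existing FPMathOperator, tweak them, and copy them onto a newly built instruction. A hedged sketch; propagateFMF and the ForceFast knob are invented for this example.

#include "llvm/IR/Instruction.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/Casting.h"

using namespace llvm;

// Copy the fast-math flags from an existing FP instruction onto a new one,
// optionally forcing the umbrella 'fast' flag.
static void propagateFMF(Instruction *From, Instruction *To, bool ForceFast) {
  FastMathFlags FMF = cast<FPMathOperator>(From)->getFastMathFlags();
  if (ForceFast)
    FMF.setFast();
  To->copyFastMathFlags(FMF);
}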
llvm::CastInst
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:432
llvm::BitsToFloat
float BitsToFloat(uint32_t Bits)
This function takes a 32-bit integer and returns the bit equivalent float.
Definition: MathExtras.h:643
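A one-line sanity check of BitsToFloat, assuming nothing beyond MathExtras.h; 0x3F800000 is simply the IEEE-754 single-precision encoding of 1.0.

#include "llvm/Support/MathExtras.h"
#include <cassert>

using namespace llvm;

// 0x3F800000 is the IEEE-754 single-precision encoding of 1.0f.
static void bitsToFloatExample() {
  float One = BitsToFloat(0x3F800000u);
  assert(One == 1.0f);
  (void)One;
}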
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:174
getMulHu
static Value * getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS)
Definition: AMDGPUCodeGenPrepare.cpp:829
runOnFunction
static bool runOnFunction(Function &F, bool PostInlining)
Definition: EntryExitInstrumenter.cpp:69
llvm::expandRemainderUpTo64Bits
bool expandRemainderUpTo64Bits(BinaryOperator *Rem)
Generate code to calculate the remainder of two integers, replacing Rem with the generated code.
Definition: IntegerDivision.cpp:534
llvm::AMDGPUISD::RCP
@ RCP
Definition: AMDGPUISelLowering.h:406
llvm::Constant::getNullValue
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:347
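getNullValue and getAllOnesValue (listed earlier) are the usual way to spell 0 and -1 for an arbitrary type. A tiny illustrative sketch; zeroAndAllOnes is a made-up helper name.

#include "llvm/IR/Constants.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"

using namespace llvm;

// i32 0 and i32 -1 (all bits set) without writing the literals by hand.
static void zeroAndAllOnes(LLVMContext &Ctx) {
  Type *I32 = Type::getInt32Ty(Ctx);
  Constant *Zero = Constant::getNullValue(I32);
  Constant *AllOnes = Constant::getAllOnesValue(I32);
  (void)Zero;
  (void)AllOnes;
}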
llvm::KnownBits
Definition: KnownBits.h:23
llvm::Type::isFloatTy
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:148
llvm::AnalysisUsage::setPreservesAll
void setPreservesAll()
Set by analyses that do not transform their input at all.
Definition: PassAnalysisSupport.h:130
llvm::FPMathOperator::getFastMathFlags
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition: Operator.h:367
findSelectThroughCast
static SelectInst * findSelectThroughCast(Value *V, CastInst *&Cast)
Definition: AMDGPUCodeGenPrepare.cpp:558
llvm::AMDGPU::SendMsg::Op
Op
Definition: SIDefines.h:314
llvm::ConstantFP::get
static Constant * get(Type *Ty, double V)
This returns a ConstantFP, or a vector containing a splat of a ConstantFP, for the specified value in the specified type.
Definition: Constants.cpp:932
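Like ConstantInt::get, ConstantFP::get splats when handed a vector type. A short sketch assuming only a live LLVMContext; constantFPExample is an illustrative name.

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"

using namespace llvm;

// A scalar float 1.0 and a <2 x float> splat of 0.5, both via ConstantFP::get.
static void constantFPExample(LLVMContext &Ctx) {
  Type *FloatTy = Type::getFloatTy(Ctx);
  Constant *One = ConstantFP::get(FloatTy, 1.0);
  Constant *HalfSplat = ConstantFP::get(FixedVectorType::get(FloatTy, 2), 0.5);
  (void)One;
  (void)HalfSplat;
}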
LowAndHigh
Metadata * LowAndHigh[]
Definition: NVVMIntrRange.cpp:68
llvm::Type::isDoubleTy
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:151
llvm::MCID::Add
@ Add
Definition: MCInstrDesc.h:184
IntegerDivision.h
llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:45
promotedOpIsNSW
static bool promotedOpIsNSW(const Instruction &I)
Definition: AMDGPUCodeGenPrepare.cpp:277
llvm::createAMDGPUCodeGenPreparePass
FunctionPass * createAMDGPUCodeGenPreparePass()
Definition: AMDGPUCodeGenPrepare.cpp:1418
llvm::Instruction::BinaryOps
BinaryOps
Definition: Instruction.h:768
llvm::FPMathOperator::getFPAccuracy
float getFPAccuracy() const
Get the maximum error permitted by this operation in ULPs.
Definition: Instructions.cpp:2634
LegacyDivergenceAnalysis.h
WidenLoads
static cl::opt< bool > WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads", cl::desc("Widen sub-dword constant address space loads in " "AMDGPULateCodeGenPrepare"), cl::ReallyHidden, cl::init(true))
llvm::Instruction::getDebugLoc
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:365
isU24
static bool isU24(SDValue Op, SelectionDAG &DAG)
Definition: AMDGPUISelLowering.cpp:2779
Dominators.h
getSign32
static Value * getSign32(Value *V, IRBuilder<> &Builder, const DataLayout *DL)
Definition: AMDGPUCodeGenPrepare.cpp:1003
llvm::CastInst::getOpcode
Instruction::CastOps getOpcode() const
Return the opcode of this CastInst.
Definition: InstrTypes.h:679
llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:94
llvm::IntegerType::getBitWidth
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
Definition: DerivedTypes.h:71
llvm::FunctionCallee
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single entity.
Definition: DerivedTypes.h:164
llvm::SmallVectorImpl< Value * >
llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:298
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1450
llvm::isKnownToBeAPowerOfTwo
bool isKnownToBeAPowerOfTwo(const Value *V, const DataLayout &DL, bool OrZero=false, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return true if the given value is known to have exactly one bit set when defined.
Definition: ValueTracking.cpp:295
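isKnownToBeAPowerOfTwo is the standard query when deciding whether a division can be strength-reduced to a shift. A hedged sketch; divisorIsPowerOfTwo is an invented wrapper name, and OrZero=true is one possible policy rather than necessarily what this pass uses.

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Value.h"

using namespace llvm;

// A divisor proven to be a power of two (or zero, with OrZero=true) can be
// lowered with shifts instead of a full divide.
static bool divisorIsPowerOfTwo(const Value *Den, const DataLayout &DL) {
  return isKnownToBeAPowerOfTwo(Den, DL, /*OrZero=*/true);
}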
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::AMDGPUAS::CONSTANT_ADDRESS
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
Definition: AMDGPU.h:376
llvm::Value::takeName
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:376
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
llvm::cl::desc
Definition: CommandLine.h:411
llvm::LegalizeActions::Lower
@ Lower
The operation itself must be expressed in terms of simpler actions on this target.
Definition: LegalizerInfo.h:76
InitializePasses.h
llvm::Value
LLVM Value Representation.
Definition: Value.h:75
AMDGPUTargetMachine.h
llvm::Function::iterator
BasicBlockListType::iterator iterator
Definition: Function.h:66
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations", false, false) INITIALIZE_PASS_END(AMDGPUCodeGenPrepare