1 //===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass does misc. AMDGPU optimizations on IR before instruction
11 /// selection.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPU.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "llvm/Analysis/AssumptionCache.h"
18 #include "llvm/Analysis/ConstantFolding.h"
19 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
20 #include "llvm/Analysis/ValueTracking.h"
21 #include "llvm/CodeGen/TargetPassConfig.h"
22 #include "llvm/IR/Dominators.h"
23 #include "llvm/IR/InstVisitor.h"
24 #include "llvm/IR/IntrinsicsAMDGPU.h"
25 #include "llvm/IR/IRBuilder.h"
26 #include "llvm/InitializePasses.h"
27 #include "llvm/Pass.h"
28 #include "llvm/Support/KnownBits.h"
29 #include "llvm/Transforms/Utils/IntegerDivision.h"
30 
31 #define DEBUG_TYPE "amdgpu-codegenprepare"
32 
33 using namespace llvm;
34 
35 namespace {
36 
38  "amdgpu-codegenprepare-widen-constant-loads",
39  cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
41  cl::init(false));
42 
43 static cl::opt<bool> Widen16BitOps(
44  "amdgpu-codegenprepare-widen-16-bit-ops",
45  cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
46  cl::ReallyHidden,
47  cl::init(true));
48 
49 static cl::opt<bool> UseMul24Intrin(
50  "amdgpu-codegenprepare-mul24",
51  cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
52  cl::ReallyHidden,
53  cl::init(true));
54 
55 // Legalize 64-bit division by using the generic IR expansion.
56 static cl::opt<bool> ExpandDiv64InIR(
57  "amdgpu-codegenprepare-expand-div64",
58  cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
59  cl::ReallyHidden,
60  cl::init(false));
61 
62 // Leave all division operations as they are. This supersedes ExpandDiv64InIR
63 // and is used for testing the legalizer.
64 static cl::opt<bool> DisableIDivExpand(
65  "amdgpu-codegenprepare-disable-idiv-expansion",
66  cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
67  cl::ReallyHidden,
68  cl::init(false));
69 
70 class AMDGPUCodeGenPrepare : public FunctionPass,
71  public InstVisitor<AMDGPUCodeGenPrepare, bool> {
72  const GCNSubtarget *ST = nullptr;
73  AssumptionCache *AC = nullptr;
74  DominatorTree *DT = nullptr;
75  LegacyDivergenceAnalysis *DA = nullptr;
76  Module *Mod = nullptr;
77  const DataLayout *DL = nullptr;
78  bool HasUnsafeFPMath = false;
79  bool HasFP32Denormals = false;
80 
81  /// \returns \p T's base element bit width.
82  ///
83  /// For a scalar integer type this is the type's own bit width; for a vector
84  /// type it is the bit width of the element type. For example, both i16 and
85  /// <3 x i16> have a base element bit width of 16.
86  unsigned getBaseElementBitWidth(const Type *T) const;
87 
88  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
89  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
90  /// is returned.
91  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;
92 
93  /// \returns True if binary operation \p I is a signed binary operation, false
94  /// otherwise.
95  bool isSigned(const BinaryOperator &I) const;
96 
97  /// \returns True if the condition of 'select' operation \p I comes from a
98  /// signed 'icmp' operation, false otherwise.
99  bool isSigned(const SelectInst &I) const;
100 
101  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
102  /// false otherwise.
103  bool needsPromotionToI32(const Type *T) const;
104 
105  /// Promotes uniform binary operation \p I to equivalent 32 bit binary
106  /// operation.
107  ///
108  /// \details \p I's base element bit width must be greater than 1 and less
109 /// than or equal to 16. Promotion is done by sign- or zero-extending operands to
110  /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
111  /// truncating the result of 32 bit binary operation back to \p I's original
112  /// type. Division operation is not promoted.
113  ///
114  /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
115  /// false otherwise.
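 /// For example, a uniform 16-bit add such as
 ///   %r = add i16 %a, %b
 /// becomes, roughly:
 ///   %a32 = zext i16 %a to i32
 ///   %b32 = zext i16 %b to i32
 ///   %r32 = add nuw nsw i32 %a32, %b32
 ///   %r   = trunc i32 %r32 to i16
 /// (sign extension is used instead for signed operations).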
116  bool promoteUniformOpToI32(BinaryOperator &I) const;
117 
118  /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
119  ///
120  /// \details \p I's base element bit width must be greater than 1 and less
121 /// than or equal to 16. Promotion is done by sign- or zero-extending operands to
122  /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
123  ///
124  /// \returns True.
125  bool promoteUniformOpToI32(ICmpInst &I) const;
126 
127  /// Promotes uniform 'select' operation \p I to 32 bit 'select'
128  /// operation.
129  ///
130  /// \details \p I's base element bit width must be greater than 1 and less
131 /// than or equal to 16. Promotion is done by sign- or zero-extending operands to
132  /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
133  /// result of 32 bit 'select' operation back to \p I's original type.
134  ///
135  /// \returns True.
136  bool promoteUniformOpToI32(SelectInst &I) const;
137 
138  /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
139  /// intrinsic.
140  ///
141  /// \details \p I's base element bit width must be greater than 1 and less
142 /// than or equal to 16. Promotion is done by zero-extending the operand to 32
143  /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
144  /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
145  /// shift amount is 32 minus \p I's base element bit width), and truncating
146  /// the result of the shift operation back to \p I's original type.
147  ///
148  /// \returns True.
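 /// For example, for a uniform i16 operand the promoted sequence is roughly:
 ///   %e = zext i16 %x to i32
 ///   %r = call i32 @llvm.bitreverse.i32(i32 %e)
 ///   %s = lshr i32 %r, 16    ; 32 minus the 16-bit base element width
 ///   %t = trunc i32 %s to i16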
149  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
150 
151 
152  unsigned numBitsUnsigned(Value *Op, unsigned ScalarSize) const;
153  unsigned numBitsSigned(Value *Op, unsigned ScalarSize) const;
154  bool isI24(Value *V, unsigned ScalarSize) const;
155  bool isU24(Value *V, unsigned ScalarSize) const;
156 
157  /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.i24
158  /// when both operands are known to fit in 24 bits.
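 /// For example, a divergent i32 multiply whose operands are known to fit in
 /// 24 unsigned bits, such as
 ///   %m = mul i32 %a, %b
 /// becomes, roughly:
 ///   %m = call i32 @llvm.amdgcn.mul.u24(i32 %a, i32 %b)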
159  bool replaceMulWithMul24(BinaryOperator &I) const;
160 
161  /// Perform the same fold as the equivalently named function in DAGCombiner.
162  /// Since we expand some divisions here, this must run before the expansion obscures the select.
163  bool foldBinOpIntoSelect(BinaryOperator &I) const;
164 
165  bool divHasSpecialOptimization(BinaryOperator &I,
166  Value *Num, Value *Den) const;
167  int getDivNumBits(BinaryOperator &I,
168  Value *Num, Value *Den,
169  unsigned AtLeast, bool Signed) const;
170 
171  /// Expands 24 bit div or rem.
172  Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
173  Value *Num, Value *Den,
174  bool IsDiv, bool IsSigned) const;
175 
176  Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
177  Value *Num, Value *Den, unsigned NumBits,
178  bool IsDiv, bool IsSigned) const;
179 
180  /// Expands 32 bit div or rem.
181  Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
182  Value *Num, Value *Den) const;
183 
184  Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
185  Value *Num, Value *Den) const;
186  void expandDivRem64(BinaryOperator &I) const;
187 
188  /// Check whether a scalar load can be widened.
189  ///
190  /// \details Sub-dword, uniform loads from constant memory are widened to a
191  /// full 32 bits and the result is truncated, allowing a scalar load to be
192  /// used instead of a vector load.
193  ///
194  /// \returns True if \p I can be widened.
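 /// For example, an aligned uniform load such as
 ///   %v = load i8, i8 addrspace(4)* %p, align 4
 /// is widened in visitLoadInst to roughly:
 ///   %c = bitcast i8 addrspace(4)* %p to i32 addrspace(4)*
 ///   %w = load i32, i32 addrspace(4)* %c, align 4
 ///   %v = trunc i32 %w to i8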
195 
196  bool canWidenScalarExtLoad(LoadInst &I) const;
197 
198 public:
199  static char ID;
200 
201  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
202 
203  bool visitFDiv(BinaryOperator &I);
204  bool visitXor(BinaryOperator &I);
205 
206  bool visitInstruction(Instruction &I) { return false; }
207  bool visitBinaryOperator(BinaryOperator &I);
208  bool visitLoadInst(LoadInst &I);
209  bool visitICmpInst(ICmpInst &I);
210  bool visitSelectInst(SelectInst &I);
211 
212  bool visitIntrinsicInst(IntrinsicInst &I);
213  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
214 
215  bool doInitialization(Module &M) override;
216  bool runOnFunction(Function &F) override;
217 
218  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
219 
220  void getAnalysisUsage(AnalysisUsage &AU) const override {
221  AU.addRequired<AssumptionCacheTracker>();
222  AU.addRequired<LegacyDivergenceAnalysis>();
223 
224  // FIXME: Division expansion needs to preserve the dominator tree.
225  if (!ExpandDiv64InIR)
226  AU.setPreservesAll();
227  }
228 };
229 
230 } // end anonymous namespace
231 
232 unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
233  assert(needsPromotionToI32(T) && "T does not need promotion to i32");
234 
235  if (T->isIntegerTy())
236  return T->getIntegerBitWidth();
237  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
238 }
239 
240 Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
241  assert(needsPromotionToI32(T) && "T does not need promotion to i32");
242 
243  if (T->isIntegerTy())
244  return B.getInt32Ty();
245  return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T));
246 }
247 
248 bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
249  return I.getOpcode() == Instruction::AShr ||
250  I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
251 }
252 
253 bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
254  return isa<ICmpInst>(I.getOperand(0)) ?
255  cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
256 }
257 
258 bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
259  if (!Widen16BitOps)
260  return false;
261 
262  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
263  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
264  return true;
265 
266  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
267  // TODO: The set of packed operations is more limited, so may want to
268  // promote some anyway.
269  if (ST->hasVOP3PInsts())
270  return false;
271 
272  return needsPromotionToI32(VT->getElementType());
273  }
274 
275  return false;
276 }
277 
278 // Return true if the op promoted to i32 should have nsw set.
279 static bool promotedOpIsNSW(const Instruction &I) {
280  switch (I.getOpcode()) {
281  case Instruction::Shl:
282  case Instruction::Add:
283  case Instruction::Sub:
284  return true;
285  case Instruction::Mul:
286  return I.hasNoUnsignedWrap();
287  default:
288  return false;
289  }
290 }
291 
292 // Return true if the op promoted to i32 should have nuw set.
293 static bool promotedOpIsNUW(const Instruction &I) {
294  switch (I.getOpcode()) {
295  case Instruction::Shl:
296  case Instruction::Add:
297  case Instruction::Mul:
298  return true;
299  case Instruction::Sub:
300  return I.hasNoUnsignedWrap();
301  default:
302  return false;
303  }
304 }
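// For example, an i16 add whose operands are zero-extended (each < 2^16)
// cannot wrap a 32-bit result, so the promoted add can carry nuw and nsw. A
// promoted sub of zero-extended operands may go negative, so it only keeps
// nuw if the original sub already had it.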
305 
306 bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
307  Type *Ty = I.getType();
308  const DataLayout &DL = Mod->getDataLayout();
309  int TySize = DL.getTypeSizeInBits(Ty);
310  Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);
311 
312  return I.isSimple() && TySize < 32 && Alignment >= 4 && DA->isUniform(&I);
313 }
314 
315 bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
316  assert(needsPromotionToI32(I.getType()) &&
317  "I does not need promotion to i32");
318 
319  if (I.getOpcode() == Instruction::SDiv ||
320  I.getOpcode() == Instruction::UDiv ||
321  I.getOpcode() == Instruction::SRem ||
322  I.getOpcode() == Instruction::URem)
323  return false;
324 
324 
325  IRBuilder<> Builder(&I);
326  Builder.SetCurrentDebugLocation(I.getDebugLoc());
327 
328  Type *I32Ty = getI32Ty(Builder, I.getType());
329  Value *ExtOp0 = nullptr;
330  Value *ExtOp1 = nullptr;
331  Value *ExtRes = nullptr;
332  Value *TruncRes = nullptr;
333 
334  if (isSigned(I)) {
335  ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
336  ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
337  } else {
338  ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
339  ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
340  }
341 
342  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
343  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
344  if (promotedOpIsNSW(cast<Instruction>(I)))
345  Inst->setHasNoSignedWrap();
346 
347  if (promotedOpIsNUW(cast<Instruction>(I)))
348  Inst->setHasNoUnsignedWrap();
349 
350  if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
351  Inst->setIsExact(ExactOp->isExact());
352  }
353 
354  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
355 
356  I.replaceAllUsesWith(TruncRes);
357  I.eraseFromParent();
358 
359  return true;
360 }
361 
362 bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
363  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
364  "I does not need promotion to i32");
365 
366  IRBuilder<> Builder(&I);
367  Builder.SetCurrentDebugLocation(I.getDebugLoc());
368 
369  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
370  Value *ExtOp0 = nullptr;
371  Value *ExtOp1 = nullptr;
372  Value *NewICmp = nullptr;
373 
374  if (I.isSigned()) {
375  ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
376  ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
377  } else {
378  ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
379  ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
380  }
381  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);
382 
383  I.replaceAllUsesWith(NewICmp);
384  I.eraseFromParent();
385 
386  return true;
387 }
388 
389 bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
390  assert(needsPromotionToI32(I.getType()) &&
391  "I does not need promotion to i32");
392 
393  IRBuilder<> Builder(&I);
394  Builder.SetCurrentDebugLocation(I.getDebugLoc());
395 
396  Type *I32Ty = getI32Ty(Builder, I.getType());
397  Value *ExtOp1 = nullptr;
398  Value *ExtOp2 = nullptr;
399  Value *ExtRes = nullptr;
400  Value *TruncRes = nullptr;
401 
402  if (isSigned(I)) {
403  ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
404  ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
405  } else {
406  ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
407  ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
408  }
409  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
410  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
411 
412  I.replaceAllUsesWith(TruncRes);
413  I.eraseFromParent();
414 
415  return true;
416 }
417 
418 bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
419  IntrinsicInst &I) const {
420  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
421  "I must be bitreverse intrinsic");
422  assert(needsPromotionToI32(I.getType()) &&
423  "I does not need promotion to i32");
424 
425  IRBuilder<> Builder(&I);
426  Builder.SetCurrentDebugLocation(I.getDebugLoc());
427 
428  Type *I32Ty = getI32Ty(Builder, I.getType());
429  Function *I32 =
430  Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
431  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
432  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
433  Value *LShrOp =
434  Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
435  Value *TruncRes =
436  Builder.CreateTrunc(LShrOp, I.getType());
437 
438  I.replaceAllUsesWith(TruncRes);
439  I.eraseFromParent();
440 
441  return true;
442 }
443 
444 unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op,
445  unsigned ScalarSize) const {
446  KnownBits Known = computeKnownBits(Op, *DL, 0, AC);
447  return ScalarSize - Known.countMinLeadingZeros();
448 }
449 
450 unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op,
451  unsigned ScalarSize) const {
452  // In order for this to be a signed 24-bit value, bit 23 must
453  // be a sign bit.
454  return ScalarSize - ComputeNumSignBits(Op, *DL, 0, AC);
455 }
456 
457 bool AMDGPUCodeGenPrepare::isI24(Value *V, unsigned ScalarSize) const {
458  return ScalarSize >= 24 && // Types less than 24-bit should be treated
459  // as unsigned 24-bit values.
460  numBitsSigned(V, ScalarSize) < 24;
461 }
462 
463 bool AMDGPUCodeGenPrepare::isU24(Value *V, unsigned ScalarSize) const {
464  return numBitsUnsigned(V, ScalarSize) <= 24;
465 }
466 
467 static void extractValues(IRBuilder<> &Builder,
468  SmallVectorImpl<Value *> &Values, Value *V) {
469  auto *VT = dyn_cast<FixedVectorType>(V->getType());
470  if (!VT) {
471  Values.push_back(V);
472  return;
473  }
474 
475  for (int I = 0, E = VT->getNumElements(); I != E; ++I)
476  Values.push_back(Builder.CreateExtractElement(V, I));
477 }
478 
479 static Value *insertValues(IRBuilder<> &Builder,
480  Type *Ty,
481  SmallVectorImpl<Value *> &Values) {
482  if (Values.size() == 1)
483  return Values[0];
484 
485  Value *NewVal = UndefValue::get(Ty);
486  for (int I = 0, E = Values.size(); I != E; ++I)
487  NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);
488 
489  return NewVal;
490 }
491 
492 bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
493  if (I.getOpcode() != Instruction::Mul)
494  return false;
495 
496  Type *Ty = I.getType();
497  unsigned Size = Ty->getScalarSizeInBits();
498  if (Size <= 16 && ST->has16BitInsts())
499  return false;
500 
501  // Prefer scalar if this could be s_mul_i32
502  if (DA->isUniform(&I))
503  return false;
504 
505  Value *LHS = I.getOperand(0);
506  Value *RHS = I.getOperand(1);
507  IRBuilder<> Builder(&I);
508  Builder.SetCurrentDebugLocation(I.getDebugLoc());
509 
510  Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
511 
512  // TODO: Should this try to match mulhi24?
513  if (ST->hasMulU24() && isU24(LHS, Size) && isU24(RHS, Size)) {
514  IntrID = Intrinsic::amdgcn_mul_u24;
515  } else if (ST->hasMulI24() && isI24(LHS, Size) && isI24(RHS, Size)) {
516  IntrID = Intrinsic::amdgcn_mul_i24;
517  } else
518  return false;
519 
520  SmallVector<Value *, 4> LHSVals;
521  SmallVector<Value *, 4> RHSVals;
522  SmallVector<Value *, 4> ResultVals;
523  extractValues(Builder, LHSVals, LHS);
524  extractValues(Builder, RHSVals, RHS);
525 
526 
527  IntegerType *I32Ty = Builder.getInt32Ty();
528  FunctionCallee Intrin = Intrinsic::getDeclaration(Mod, IntrID);
529  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
530  Value *LHS, *RHS;
531  if (IntrID == Intrinsic::amdgcn_mul_u24) {
532  LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
533  RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
534  } else {
535  LHS = Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty);
536  RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty);
537  }
538 
539  Value *Result = Builder.CreateCall(Intrin, {LHS, RHS});
540 
541  if (IntrID == Intrinsic::amdgcn_mul_u24) {
542  ResultVals.push_back(Builder.CreateZExtOrTrunc(Result,
543  LHSVals[I]->getType()));
544  } else {
545  ResultVals.push_back(Builder.CreateSExtOrTrunc(Result,
546  LHSVals[I]->getType()));
547  }
548  }
549 
550  Value *NewVal = insertValues(Builder, Ty, ResultVals);
551  NewVal->takeName(&I);
552  I.replaceAllUsesWith(NewVal);
553  I.eraseFromParent();
554 
555  return true;
556 }
557 
558 // Find a select instruction, which may have been cast. This is mostly to deal
559 // with cases where i16 selects were promoted here to i32.
560 static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
561  Cast = nullptr;
562  if (SelectInst *Sel = dyn_cast<SelectInst>(V))
563  return Sel;
564 
565  if ((Cast = dyn_cast<CastInst>(V))) {
566  if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
567  return Sel;
568  }
569 
570  return nullptr;
571 }
572 
573 bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
574  // Don't do this unless the old select is going away. We want to eliminate the
575  // binary operator, not replace a binop with a select.
576  int SelOpNo = 0;
577 
578  CastInst *CastOp;
579 
580  // TODO: Should probably try to handle some cases with multiple
581  // users. Duplicating the select may be profitable for division.
582  SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
583  if (!Sel || !Sel->hasOneUse()) {
584  SelOpNo = 1;
585  Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
586  }
587 
588  if (!Sel || !Sel->hasOneUse())
589  return false;
590 
591  Constant *CT = dyn_cast<Constant>(Sel->getTrueValue());
592  Constant *CF = dyn_cast<Constant>(Sel->getFalseValue());
593  Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
594  if (!CBO || !CT || !CF)
595  return false;
596 
597  if (CastOp) {
598  if (!CastOp->hasOneUse())
599  return false;
600  CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), *DL);
601  CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), *DL);
602  }
603 
604  // TODO: Handle special 0/-1 cases DAG combine does, although we only really
605  // need to handle divisions here.
606  Constant *FoldedT = SelOpNo ?
607  ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, *DL) :
608  ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, *DL);
609  if (isa<ConstantExpr>(FoldedT))
610  return false;
611 
612  Constant *FoldedF = SelOpNo ?
613  ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, *DL) :
614  ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, *DL);
615  if (isa<ConstantExpr>(FoldedF))
616  return false;
617 
618  IRBuilder<> Builder(&BO);
619  Builder.SetCurrentDebugLocation(BO.getDebugLoc());
620  if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
621  Builder.setFastMathFlags(FPOp->getFastMathFlags());
622 
623  Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
624  FoldedT, FoldedF);
625  NewSelect->takeName(&BO);
626  BO.replaceAllUsesWith(NewSelect);
627  BO.eraseFromParent();
628  if (CastOp)
629  CastOp->eraseFromParent();
630  Sel->eraseFromParent();
631  return true;
632 }
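// For example, a single-use select feeding a binop whose other operand is a
// constant, such as:
//   %s = select i1 %c, i32 3, i32 5
//   %r = mul i32 %s, 7
// is folded to:
//   %r = select i1 %c, i32 21, i32 35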
633 
634 // Optimize fdiv with rcp:
635 //
636 // 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
637 // allowed with unsafe-fp-math or afn.
638 //
639 // a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
640 static Value *optimizeWithRcp(Value *Num, Value *Den, bool AllowInaccurateRcp,
641  bool RcpIsAccurate, IRBuilder<> &Builder,
642  Module *Mod) {
643 
644  if (!AllowInaccurateRcp && !RcpIsAccurate)
645  return nullptr;
646 
647  Type *Ty = Den->getType();
648  if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
649  if (AllowInaccurateRcp || RcpIsAccurate) {
650  if (CLHS->isExactlyValue(1.0)) {
651  Function *Decl = Intrinsic::getDeclaration(
652  Mod, Intrinsic::amdgcn_rcp, Ty);
653 
654  // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
655  // the CI documentation has a worst case error of 1 ulp.
656  // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
657  // use it as long as we aren't trying to use denormals.
658  //
659  // v_rcp_f16 and v_rsq_f16 DO support denormals.
660 
661  // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
662  // insert rsq intrinsic here.
663 
664  // 1.0 / x -> rcp(x)
665  return Builder.CreateCall(Decl, { Den });
666  }
667 
668  // Same as for 1.0, but expand the sign out of the constant.
669  if (CLHS->isExactlyValue(-1.0)) {
670  Function *Decl = Intrinsic::getDeclaration(
671  Mod, Intrinsic::amdgcn_rcp, Ty);
672 
673  // -1.0 / x -> rcp (fneg x)
674  Value *FNeg = Builder.CreateFNeg(Den);
675  return Builder.CreateCall(Decl, { FNeg });
676  }
677  }
678  }
679 
680  if (AllowInaccurateRcp) {
681  Function *Decl = Intrinsic::getDeclaration(
682  Mod, Intrinsic::amdgcn_rcp, Ty);
683 
684  // Turn into multiply by the reciprocal.
685  // x / y -> x * (1.0 / y)
686  Value *Recip = Builder.CreateCall(Decl, { Den });
687  return Builder.CreateFMul(Num, Recip);
688  }
689  return nullptr;
690 }
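// For example, with the 'afn' flag (or unsafe-fp-math):
//   %r = fdiv afn float 1.0, %x
// becomes, roughly:
//   %r = call afn float @llvm.amdgcn.rcp.f32(float %x)
// and a general a/b becomes an fmul of a with rcp(b).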
691 
692 // optimize with fdiv.fast:
693 //
694 // a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
695 //
696 // 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
697 //
698 // NOTE: optimizeWithRcp should be tried first because rcp is the preference.
699 static Value *optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy,
700  bool HasDenormals, IRBuilder<> &Builder,
701  Module *Mod) {
702  // fdiv.fast can achieve 2.5 ULP accuracy.
703  if (ReqdAccuracy < 2.5f)
704  return nullptr;
705 
706  // Only have fdiv.fast for f32.
707  Type *Ty = Den->getType();
708  if (!Ty->isFloatTy())
709  return nullptr;
710 
711  bool NumIsOne = false;
712  if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
713  if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
714  NumIsOne = true;
715  }
716 
717  // fdiv.fast does not support denormals, but it is always fine to use for 1.0/x.
718  if (HasDenormals && !NumIsOne)
719  return nullptr;
720 
721  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
722  return Builder.CreateCall(Decl, { Num, Den });
723 }
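// For example, with f32 denormals flushed and !fpmath allowing 2.5 ULP:
//   %r = fdiv float %a, %b, !fpmath !0    ; !0 = !{float 2.5}
// becomes, roughly:
//   %r = call float @llvm.amdgcn.fdiv.fast(float %a, float %b)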
724 
725 // Optimization is performed based on fpmath, fast-math flags and the denormal
726 // mode, lowering fdiv to either rcp or fdiv.fast.
727 //
728 // With rcp:
729 // 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
730 // allowed with unsafe-fp-math or afn.
731 //
732 // a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
733 //
734 // With fdiv.fast:
735 // a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
736 //
737 // 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
738 //
739 // NOTE: rcp is the preference in cases that both are legal.
740 bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
741 
742  Type *Ty = FDiv.getType()->getScalarType();
743 
744  // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
745  // expansion around them in codegen.
746  if (Ty->isDoubleTy())
747  return false;
748 
749  // No intrinsic for fdiv16 if target does not support f16.
750  if (Ty->isHalfTy() && !ST->has16BitInsts())
751  return false;
752 
753  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
754  const float ReqdAccuracy = FPOp->getFPAccuracy();
755 
756  // Inaccurate rcp is allowed with unsafe-fp-math or afn.
757  FastMathFlags FMF = FPOp->getFastMathFlags();
758  const bool AllowInaccurateRcp = HasUnsafeFPMath || FMF.approxFunc();
759 
760  // rcp_f16 is accurate for !fpmath >= 1.0ulp.
761  // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
762  // rcp_f64 is never accurate.
763  const bool RcpIsAccurate = (Ty->isHalfTy() && ReqdAccuracy >= 1.0f) ||
764  (Ty->isFloatTy() && !HasFP32Denormals && ReqdAccuracy >= 1.0f);
765 
766  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
767  Builder.setFastMathFlags(FMF);
768  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
769 
770  Value *Num = FDiv.getOperand(0);
771  Value *Den = FDiv.getOperand(1);
772 
773  Value *NewFDiv = nullptr;
774  if (auto *VT = dyn_cast<FixedVectorType>(FDiv.getType())) {
775  NewFDiv = UndefValue::get(VT);
776 
777  // FIXME: Doesn't do the right thing for cases where the vector is partially
778  // constant. This works when the scalarizer pass is run first.
779  for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
780  Value *NumEltI = Builder.CreateExtractElement(Num, I);
781  Value *DenEltI = Builder.CreateExtractElement(Den, I);
782  // Try rcp first.
783  Value *NewElt = optimizeWithRcp(NumEltI, DenEltI, AllowInaccurateRcp,
784  RcpIsAccurate, Builder, Mod);
785  if (!NewElt) // Try fdiv.fast.
786  NewElt = optimizeWithFDivFast(NumEltI, DenEltI, ReqdAccuracy,
787  HasFP32Denormals, Builder, Mod);
788  if (!NewElt) // Keep the original.
789  NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
790 
791  NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
792  }
793  } else { // Scalar FDiv.
794  // Try rcp first.
795  NewFDiv = optimizeWithRcp(Num, Den, AllowInaccurateRcp, RcpIsAccurate,
796  Builder, Mod);
797  if (!NewFDiv) { // Try fdiv.fast.
798  NewFDiv = optimizeWithFDivFast(Num, Den, ReqdAccuracy, HasFP32Denormals,
799  Builder, Mod);
800  }
801  }
802 
803  if (NewFDiv) {
804  FDiv.replaceAllUsesWith(NewFDiv);
805  NewFDiv->takeName(&FDiv);
806  FDiv.eraseFromParent();
807  }
808 
809  return !!NewFDiv;
810 }
811 
812 bool AMDGPUCodeGenPrepare::visitXor(BinaryOperator &I) {
813  // Match the Xor instruction, its type and its operands
814  IntrinsicInst *IntrinsicCall = dyn_cast<IntrinsicInst>(I.getOperand(0));
815  ConstantInt *RHS = dyn_cast<ConstantInt>(I.getOperand(1));
816  if (!RHS || !IntrinsicCall || RHS->getSExtValue() != -1)
817  return visitBinaryOperator(I);
818 
819  // Check that the call is to the amdgcn.class intrinsic and that it has
820  // only one use.
821  if (IntrinsicCall->getIntrinsicID() != Intrinsic::amdgcn_class ||
822  !IntrinsicCall->hasOneUse())
823  return visitBinaryOperator(I);
824 
825  // "Not" the second argument of the intrinsic call
826  ConstantInt *Arg = dyn_cast<ConstantInt>(IntrinsicCall->getOperand(1));
827  if (!Arg)
828  return visitBinaryOperator(I);
829 
830  IntrinsicCall->setOperand(
831  1, ConstantInt::get(Arg->getType(), Arg->getZExtValue() ^ 0x3ff));
832  I.replaceAllUsesWith(IntrinsicCall);
833  I.eraseFromParent();
834  return true;
835 }
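// For example:
//   %c = call i1 @llvm.amdgcn.class.f32(float %x, i32 3)
//   %n = xor i1 %c, true
// becomes a single class call whose ten test bits are inverted (3 ^ 0x3ff),
// so the xor disappears.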
836 
837 static bool hasUnsafeFPMath(const Function &F) {
838  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
839  return Attr.getValueAsBool();
840 }
841 
842 static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
843  Value *LHS, Value *RHS) {
844  Type *I32Ty = Builder.getInt32Ty();
845  Type *I64Ty = Builder.getInt64Ty();
846 
847  Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
848  Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
849  Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
850  Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
851  Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
852  Hi = Builder.CreateTrunc(Hi, I32Ty);
853  return std::make_pair(Lo, Hi);
854 }
855 
856 static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
857  return getMul64(Builder, LHS, RHS).second;
858 }
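// For example, getMulHu(0x80000000, 8) yields (2^31 * 8) >> 32 = 4 via the
// 64-bit widening multiply above.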
859 
860 /// Figure out how many bits are really needed for this division. \p AtLeast is
861 /// an optimization hint to bypass the second ComputeNumSignBits call if the
862 /// first one is insufficient. Returns -1 on failure.
863 int AMDGPUCodeGenPrepare::getDivNumBits(BinaryOperator &I,
864  Value *Num, Value *Den,
865  unsigned AtLeast, bool IsSigned) const {
866  const DataLayout &DL = Mod->getDataLayout();
867  unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
868  if (LHSSignBits < AtLeast)
869  return -1;
870 
871  unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
872  if (RHSSignBits < AtLeast)
873  return -1;
874 
875  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
876  unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits;
877  if (IsSigned)
878  ++DivBits;
879  return DivBits;
880 }
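// For example, if both i64 operands have at least 40 known sign bits, then
// SignBits = 40 and DivBits = 64 - 40 = 24 (25 if signed), so the division
// fits comfortably in 32 bits.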
881 
882 // The fractional part of a float is enough to accurately represent up to
883 // a 24-bit signed integer.
884 Value *AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
885  BinaryOperator &I,
886  Value *Num, Value *Den,
887  bool IsDiv, bool IsSigned) const {
888  int DivBits = getDivNumBits(I, Num, Den, 9, IsSigned);
889  if (DivBits == -1)
890  return nullptr;
891  return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
892 }
893 
894 Value *AMDGPUCodeGenPrepare::expandDivRem24Impl(IRBuilder<> &Builder,
895  BinaryOperator &I,
896  Value *Num, Value *Den,
897  unsigned DivBits,
898  bool IsDiv, bool IsSigned) const {
899  Type *I32Ty = Builder.getInt32Ty();
900  Num = Builder.CreateTrunc(Num, I32Ty);
901  Den = Builder.CreateTrunc(Den, I32Ty);
902 
903  Type *F32Ty = Builder.getFloatTy();
904  ConstantInt *One = Builder.getInt32(1);
905  Value *JQ = One;
906 
907  if (IsSigned) {
908  // char|short jq = ia ^ ib;
909  JQ = Builder.CreateXor(Num, Den);
910 
911  // jq = jq >> (bitsize - 2)
912  JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));
913 
914  // jq = jq | 0x1
915  JQ = Builder.CreateOr(JQ, One);
916  }
917 
918  // int ia = (int)LHS;
919  Value *IA = Num;
920 
921  // int ib = (int)RHS;
922  Value *IB = Den;
923 
924  // float fa = (float)ia;
925  Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
926  : Builder.CreateUIToFP(IA, F32Ty);
927 
928  // float fb = (float)ib;
929  Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty)
930  : Builder.CreateUIToFP(IB,F32Ty);
931 
932  Function *RcpDecl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp,
933  Builder.getFloatTy());
934  Value *RCP = Builder.CreateCall(RcpDecl, { FB });
935  Value *FQM = Builder.CreateFMul(FA, RCP);
936 
937  // fq = trunc(fqm);
938  CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
939  FQ->copyFastMathFlags(Builder.getFastMathFlags());
940 
941  // float fqneg = -fq;
942  Value *FQNeg = Builder.CreateFNeg(FQ);
943 
944  // float fr = mad(fqneg, fb, fa);
945  auto FMAD = !ST->hasMadMacF32Insts()
946  ? Intrinsic::fma
947  : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
948  Value *FR = Builder.CreateIntrinsic(FMAD,
949  {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);
950 
951  // int iq = (int)fq;
952  Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
953  : Builder.CreateFPToUI(FQ, I32Ty);
954 
955  // fr = fabs(fr);
956  FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);
957 
958  // fb = fabs(fb);
959  FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);
960 
961  // int cv = fr >= fb;
962  Value *CV = Builder.CreateFCmpOGE(FR, FB);
963 
964  // jq = (cv ? jq : 0);
965  JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));
966 
967  // dst = iq + jq;
968  Value *Div = Builder.CreateAdd(IQ, JQ);
969 
970  Value *Res = Div;
971  if (!IsDiv) {
972  // Rem needs compensation, it's easier to recompute it
973  Value *Rem = Builder.CreateMul(Div, Den);
974  Res = Builder.CreateSub(Num, Rem);
975  }
976 
977  if (DivBits != 0 && DivBits < 32) {
978  // Extend in register from the number of bits this divide really is.
979  if (IsSigned) {
980  int InRegBits = 32 - DivBits;
981 
982  Res = Builder.CreateShl(Res, InRegBits);
983  Res = Builder.CreateAShr(Res, InRegBits);
984  } else {
985  ConstantInt *TruncMask
986  = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
987  Res = Builder.CreateAnd(Res, TruncMask);
988  }
989  }
990 
991  return Res;
992 }
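// Worked example (unsigned 100 / 7): fa = 100.0, fb = 7.0, rcp(fb) ~ 0.142857,
// fqm ~ 14.2857, fq = 14.0, iq = 14, fr = fma(-14.0, 7.0, 100.0) = 2.0.
// Since |fr| < |fb|, the +1 correction jq is dropped and the result is 14.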
993 
994 // Try to recognize special cases for which the DAG will emit special, better
995 // expansions than the general expansion we do here.
996 
997 // TODO: It would be better to just directly handle those optimizations here.
998 bool AMDGPUCodeGenPrepare::divHasSpecialOptimization(
999  BinaryOperator &I, Value *Num, Value *Den) const {
1000  if (Constant *C = dyn_cast<Constant>(Den)) {
1001  // Arbitrary constants get a better expansion as long as a wider mulhi is
1002  // legal.
1003  if (C->getType()->getScalarSizeInBits() <= 32)
1004  return true;
1005 
1006  // TODO: Sdiv check for not exact for some reason.
1007 
1008  // If there's no wider mulhi, there's only a better expansion for powers of
1009  // two.
1010  // TODO: Should really know for each vector element.
1011  if (isKnownToBeAPowerOfTwo(C, *DL, true, 0, AC, &I, DT))
1012  return true;
1013 
1014  return false;
1015  }
1016 
1017  if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
1018  // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
1019  if (BinOpDen->getOpcode() == Instruction::Shl &&
1020  isa<Constant>(BinOpDen->getOperand(0)) &&
1021  isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), *DL, true,
1022  0, AC, &I, DT)) {
1023  return true;
1024  }
1025  }
1026 
1027  return false;
1028 }
1029 
1029 
1030 static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout *DL) {
1031  // Check whether the sign can be determined statically.
1032  KnownBits Known = computeKnownBits(V, *DL);
1033  if (Known.isNegative())
1034  return Constant::getAllOnesValue(V->getType());
1035  if (Known.isNonNegative())
1036  return Constant::getNullValue(V->getType());
1037  return Builder.CreateAShr(V, Builder.getInt32(31));
1038 }
1039 
1040 Value *AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
1041  BinaryOperator &I, Value *X,
1042  Value *Y) const {
1043  Instruction::BinaryOps Opc = I.getOpcode();
1044  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
1045  Opc == Instruction::SRem || Opc == Instruction::SDiv);
1046 
1047  FastMathFlags FMF;
1048  FMF.setFast();
1049  Builder.setFastMathFlags(FMF);
1050 
1051  if (divHasSpecialOptimization(I, X, Y))
1052  return nullptr; // Keep it for later optimization.
1053 
1054  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
1055  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
1056 
1057  Type *Ty = X->getType();
1058  Type *I32Ty = Builder.getInt32Ty();
1059  Type *F32Ty = Builder.getFloatTy();
1060 
1061  if (Ty->getScalarSizeInBits() < 32) {
1062  if (IsSigned) {
1063  X = Builder.CreateSExt(X, I32Ty);
1064  Y = Builder.CreateSExt(Y, I32Ty);
1065  } else {
1066  X = Builder.CreateZExt(X, I32Ty);
1067  Y = Builder.CreateZExt(Y, I32Ty);
1068  }
1069  }
1070 
1071  if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
1072  return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) :
1073  Builder.CreateZExtOrTrunc(Res, Ty);
1074  }
1075 
1076  ConstantInt *Zero = Builder.getInt32(0);
1077  ConstantInt *One = Builder.getInt32(1);
1078 
1079  Value *Sign = nullptr;
1080  if (IsSigned) {
1081  Value *SignX = getSign32(X, Builder, DL);
1082  Value *SignY = getSign32(Y, Builder, DL);
1083  // Remainder sign is the same as LHS
1084  Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;
1085 
1086  X = Builder.CreateAdd(X, SignX);
1087  Y = Builder.CreateAdd(Y, SignY);
1088 
1089  X = Builder.CreateXor(X, SignX);
1090  Y = Builder.CreateXor(Y, SignY);
1091  }
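  // (x + sign) ^ sign is |x| in two's complement: for negative x, sign = -1
  // and (x - 1) ^ -1 == ~(x - 1) == -x; for non-negative x, sign = 0 and the
  // value is unchanged. The xor/sub pair after the core algorithm undoes this.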
1092 
1093  // The algorithm here is based on ideas from "Software Integer Division", Tom
1094  // Rodeheffer, August 2008.
1095  //
1096  // unsigned udiv(unsigned x, unsigned y) {
1097  // // Initial estimate of inv(y). The constant is less than 2^32 to ensure
1098  // // that this is a lower bound on inv(y), even if some of the calculations
1099  // // round up.
1100  // unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
1101  //
1102  // // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
1103  // // Empirically this is guaranteed to give a "two-y" lower bound on
1104  // // inv(y).
1105  // z += umulh(z, -y * z);
1106  //
1107  // // Quotient/remainder estimate.
1108  // unsigned q = umulh(x, z);
1109  // unsigned r = x - q * y;
1110  //
1111  // // Two rounds of quotient/remainder refinement.
1112  // if (r >= y) {
1113  // ++q;
1114  // r -= y;
1115  // }
1116  // if (r >= y) {
1117  // ++q;
1118  // r -= y;
1119  // }
1120  //
1121  // return q;
1122  // }
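  // The UNR step is Newton-Raphson for the reciprocal in 32-bit fixed point:
  // with z ~ 2^32 / y, the refinement z' = z * (2 - y*z / 2^32) becomes
  // z += umulh(z, -y*z), because -y*z == 2^32 - y*z (mod 2^32) and
  // umulh(a, b) == (a*b) >> 32.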
1123 
1124  // Initial estimate of inv(y).
1125  Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
1126  Function *Rcp = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, F32Ty);
1127  Value *RcpY = Builder.CreateCall(Rcp, {FloatY});
1128  Constant *Scale = ConstantFP::get(F32Ty, BitsToFloat(0x4F7FFFFE));
1129  Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
1130  Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);
1131 
1132  // One round of UNR.
1133  Value *NegY = Builder.CreateSub(Zero, Y);
1134  Value *NegYZ = Builder.CreateMul(NegY, Z);
1135  Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ));
1136 
1137  // Quotient/remainder estimate.
1138  Value *Q = getMulHu(Builder, X, Z);
1139  Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y));
1140 
1141  // First quotient/remainder refinement.
1142  Value *Cond = Builder.CreateICmpUGE(R, Y);
1143  if (IsDiv)
1144  Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1145  R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1146 
1147  // Second quotient/remainder refinement.
1148  Cond = Builder.CreateICmpUGE(R, Y);
1149  Value *Res;
1150  if (IsDiv)
1151  Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1152  else
1153  Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1154 
1155  if (IsSigned) {
1156  Res = Builder.CreateXor(Res, Sign);
1157  Res = Builder.CreateSub(Res, Sign);
1158  }
1159 
1160  Res = Builder.CreateTrunc(Res, Ty);
1161 
1162  return Res;
1163 }
1164 
1165 Value *AMDGPUCodeGenPrepare::shrinkDivRem64(IRBuilder<> &Builder,
1166  BinaryOperator &I,
1167  Value *Num, Value *Den) const {
1168  if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
1169  return nullptr; // Keep it for later optimization.
1170 
1171  Instruction::BinaryOps Opc = I.getOpcode();
1172 
1173  bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
1174  bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
1175 
1176  int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
1177  if (NumDivBits == -1)
1178  return nullptr;
1179 
1180  Value *Narrowed = nullptr;
1181  if (NumDivBits <= 24) {
1182  Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
1183  IsDiv, IsSigned);
1184  } else if (NumDivBits <= 32) {
1185  Narrowed = expandDivRem32(Builder, I, Num, Den);
1186  }
1187 
1188  if (Narrowed) {
1189  return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
1190  Builder.CreateZExt(Narrowed, Num->getType());
1191  }
1192 
1193  return nullptr;
1194 }
1195 
1196 void AMDGPUCodeGenPrepare::expandDivRem64(BinaryOperator &I) const {
1197  Instruction::BinaryOps Opc = I.getOpcode();
1198  // Do the general expansion.
1199  if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
1200  expandDivisionUpTo64Bits(&I);
1201  return;
1202  }
1203 
1204  if (Opc == Instruction::URem || Opc == Instruction::SRem) {
1205  expandRemainderUpTo64Bits(&I);
1206  return;
1207  }
1208 
1209  llvm_unreachable("not a division");
1210 }
1211 
1212 bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
1213  if (foldBinOpIntoSelect(I))
1214  return true;
1215 
1216  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
1217  DA->isUniform(&I) && promoteUniformOpToI32(I))
1218  return true;
1219 
1220  if (UseMul24Intrin && replaceMulWithMul24(I))
1221  return true;
1222 
1223  bool Changed = false;
1224  Instruction::BinaryOps Opc = I.getOpcode();
1225  Type *Ty = I.getType();
1226  Value *NewDiv = nullptr;
1227  unsigned ScalarSize = Ty->getScalarSizeInBits();
1228 
1229  SmallVector<BinaryOperator *, 8> Div64ToExpand;
1230 
1231  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
1232  Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
1233  ScalarSize <= 64 &&
1234  !DisableIDivExpand) {
1235  Value *Num = I.getOperand(0);
1236  Value *Den = I.getOperand(1);
1237  IRBuilder<> Builder(&I);
1238  Builder.SetCurrentDebugLocation(I.getDebugLoc());
1239 
1240  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1241  NewDiv = UndefValue::get(VT);
1242 
1243  for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
1244  Value *NumEltN = Builder.CreateExtractElement(Num, N);
1245  Value *DenEltN = Builder.CreateExtractElement(Den, N);
1246 
1247  Value *NewElt;
1248  if (ScalarSize <= 32) {
1249  NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
1250  if (!NewElt)
1251  NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1252  } else {
1253  // See if this 64-bit division can be shrunk to 32/24-bits before
1254  // producing the general expansion.
1255  NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
1256  if (!NewElt) {
1257  // The general 64-bit expansion introduces control flow and doesn't
1258  // return the new value. Just insert a scalar copy and defer
1259  // expanding it.
1260  NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1261  Div64ToExpand.push_back(cast<BinaryOperator>(NewElt));
1262  }
1263  }
1264 
1265  NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
1266  }
1267  } else {
1268  if (ScalarSize <= 32)
1269  NewDiv = expandDivRem32(Builder, I, Num, Den);
1270  else {
1271  NewDiv = shrinkDivRem64(Builder, I, Num, Den);
1272  if (!NewDiv)
1273  Div64ToExpand.push_back(&I);
1274  }
1275  }
1276 
1277  if (NewDiv) {
1278  I.replaceAllUsesWith(NewDiv);
1279  I.eraseFromParent();
1280  Changed = true;
1281  }
1282  }
1283 
1284  if (ExpandDiv64InIR) {
1285  // TODO: We get much worse code in specially handled constant cases.
1286  for (BinaryOperator *Div : Div64ToExpand) {
1287  expandDivRem64(*Div);
1288  Changed = true;
1289  }
1290  }
1291 
1292  return Changed;
1293 }
1294 
1295 bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
1296  if (!WidenLoads)
1297  return false;
1298 
1299  if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
1300  I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1301  canWidenScalarExtLoad(I)) {
1302  IRBuilder<> Builder(&I);
1303  Builder.SetCurrentDebugLocation(I.getDebugLoc());
1304 
1305  Type *I32Ty = Builder.getInt32Ty();
1306  Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
1307  Value *BitCast= Builder.CreateBitCast(I.getPointerOperand(), PT);
1308  LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, BitCast);
1309  WidenLoad->copyMetadata(I);
1310 
1311  // If we have range metadata, we need to convert the type, and not make
1312  // assumptions about the high bits.
1313  if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
1314  ConstantInt *Lower =
1315  mdconst::extract<ConstantInt>(Range->getOperand(0));
1316 
1317  if (Lower->isNullValue()) {
1318  WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
1319  } else {
1320  Metadata *LowAndHigh[] = {
1321  ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
1322  // Don't make assumptions about the high bits.
1323  ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
1324  };
1325 
1326  WidenLoad->setMetadata(LLVMContext::MD_range,
1327  MDNode::get(Mod->getContext(), LowAndHigh));
1328  }
1329  }
1330 
1331  int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
1332  Type *IntNTy = Builder.getIntNTy(TySize);
1333  Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
1334  Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
1335  I.replaceAllUsesWith(ValOrig);
1336  I.eraseFromParent();
1337  return true;
1338  }
1339 
1340  return false;
1341 }
1342 
1343 bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
1344  bool Changed = false;
1345 
1346  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
1347  DA->isUniform(&I))
1348  Changed |= promoteUniformOpToI32(I);
1349 
1350  return Changed;
1351 }
1352 
1353 bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
1354  bool Changed = false;
1355 
1356  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
1357  DA->isUniform(&I))
1358  Changed |= promoteUniformOpToI32(I);
1359 
1360  return Changed;
1361 }
1362 
1363 bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
1364  switch (I.getIntrinsicID()) {
1365  case Intrinsic::bitreverse:
1366  return visitBitreverseIntrinsicInst(I);
1367  default:
1368  return false;
1369  }
1370 }
1371 
1372 bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
1373  bool Changed = false;
1374 
1375  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
1376  DA->isUniform(&I))
1377  Changed |= promoteUniformBitreverseToI32(I);
1378 
1379  return Changed;
1380 }
1381 
1382 bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
1383  Mod = &M;
1384  DL = &Mod->getDataLayout();
1385  return false;
1386 }
1387 
1388 bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
1389  if (skipFunction(F))
1390  return false;
1391 
1392  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
1393  if (!TPC)
1394  return false;
1395 
1396  const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
1397  ST = &TM.getSubtarget<GCNSubtarget>(F);
1398  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1399  DA = &getAnalysis<LegacyDivergenceAnalysis>();
1400 
1401  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
1402  DT = DTWP ? &DTWP->getDomTree() : nullptr;
1403 
1404  HasUnsafeFPMath = hasUnsafeFPMath(F);
1405 
1406  AMDGPU::SIModeRegisterDefaults Mode(F);
1407  HasFP32Denormals = Mode.allFP32Denormals();
1408 
1409  bool MadeChange = false;
1410 
1411  Function::iterator NextBB;
1412  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
1413  BasicBlock *BB = &*FI;
1414  NextBB = std::next(FI);
1415 
1416  BasicBlock::iterator Next;
1417  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; I = Next) {
1418  Next = std::next(I);
1419 
1420  MadeChange |= visit(*I);
1421 
1422  if (Next != E) { // Control flow changed
1423  BasicBlock *NextInstBB = Next->getParent();
1424  if (NextInstBB != BB) {
1425  BB = NextInstBB;
1426  E = BB->end();
1427  FE = F.end();
1428  }
1429  }
1430  }
1431  }
1432 
1433  return MadeChange;
1434 }
1435 
1436 INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
1437  "AMDGPU IR optimizations", false, false)
1438 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
1439 INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
1440 INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
1441  false, false)
1442 
1443 char AMDGPUCodeGenPrepare::ID = 0;
1444 
1445 FunctionPass *llvm::createAMDGPUCodeGenPrepare() {
1446  return new AMDGPUCodeGenPrepare();
1447 }
llvm::Check::Size
@ Size
Definition: FileCheck.h:73
AssumptionCache.h
Signed
@ Signed
Definition: NVPTXISelLowering.cpp:4636
llvm
---------------------— PointerInfo ------------------------------------—
Definition: AllocatorList.h:23
llvm::wasm::ValType::I32
@ I32
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:112
llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:435
llvm::Intrinsic::getDeclaration
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1379
llvm::BasicBlock::iterator
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:90
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:107
llvm::Function
Definition: Function.h:61
llvm::Attribute
Definition: Attributes.h:52
isI24
static bool isI24(SDValue Op, SelectionDAG &DAG)
Definition: AMDGPUISelLowering.cpp:2889
Pass.h
llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:52
getMul64
static std::pair< Value *, Value * > getMul64(IRBuilder<> &Builder, Value *LHS, Value *RHS)
Definition: AMDGPUCodeGenPrepare.cpp:842
llvm::DataLayout::getTypeSizeInBits
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:659
llvm::PointerType::get
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Definition: Type.cpp:691
llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:319
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1168
llvm::MipsISD::Lo
@ Lo
Definition: MipsISelLowering.h:79
llvm::IRBuilder<>
ValueTracking.h
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:151
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
extractValues
static void extractValues(IRBuilder<> &Builder, SmallVectorImpl< Value * > &Values, Value *V)
Definition: AMDGPUCodeGenPrepare.cpp:467
llvm::AMDGPU::SIModeRegisterDefaults
Definition: AMDGPUBaseInfo.h:911
T
#define T
Definition: Mips16ISelLowering.cpp:341
llvm::ConstantAsMetadata::get
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:419
llvm::ComputeNumSignBits
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
Definition: ValueTracking.cpp:380
llvm::SelectInst::getFalseValue
const Value * getFalseValue() const
Definition: Instructions.h:1787
llvm::GCNSubtarget
Definition: GCNSubtarget.h:31
llvm::Instruction::copyMetadata
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Definition: Instruction.cpp:829
llvm::Intrinsic::not_intrinsic
@ not_intrinsic
Definition: Intrinsics.h:45
llvm::FastMathFlags
Convenience struct for specifying and reasoning about fast-math flags.
Definition: Operator.h:161
ConstantFolding.h
llvm::cl::ReallyHidden
@ ReallyHidden
Definition: CommandLine.h:144
llvm::Attribute::getValueAsBool
bool getValueAsBool() const
Return the attribute's value as a boolean.
Definition: Attributes.cpp:287
optimizeWithFDivFast
static Value * optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy, bool HasDenormals, IRBuilder<> &Builder, Module *Mod)
Definition: AMDGPUCodeGenPrepare.cpp:699
llvm::MDNode::get
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1208
F
#define F(x, y, z)
Definition: MD5.cpp:56
llvm::RISCVFenceField::R
@ R
Definition: RISCVBaseInfo.h:193
llvm::Instruction::setMetadata
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1336
KnownBits.h
insertValues
static Value * insertValues(IRBuilder<> &Builder, Type *Ty, SmallVectorImpl< Value * > &Values)
Definition: AMDGPUCodeGenPrepare.cpp:479
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
llvm::MipsISD::Hi
@ Hi
Definition: MipsISelLowering.h:75
Arg
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
Definition: AMDGPULibCalls.cpp:206
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
llvm::KnownBits::isNonNegative
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:99
DEBUG_TYPE
#define DEBUG_TYPE
Definition: AMDGPUCodeGenPrepare.cpp:31
llvm::AMDGPUTargetMachine
Definition: AMDGPUTargetMachine.h:29
llvm::SelectInst::getCondition
const Value * getCondition() const
Definition: Instructions.h:1785
llvm::FastMathFlags::approxFunc
bool approxFunc() const
Definition: Operator.h:212
llvm::AMDGPUAS::CONSTANT_ADDRESS_32BIT
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
Definition: AMDGPU.h:357
f
Itanium Name Demangler i e convert the string _Z1fv into f()". You can also use the CRTP base ManglingParser to perform some simple analysis on the mangled name
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
llvm::ConstantFoldBinaryOpOperands
Constant * ConstantFoldBinaryOpOperands(unsigned Opcode, Constant *LHS, Constant *RHS, const DataLayout &DL)
Attempt to constant fold a binary operation with the specified operands.
Definition: ConstantFolding.cpp:1330
llvm::ConstantFoldCastOperand
Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
Definition: ConstantFolding.cpp:1341
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
Y
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
getOpcode
static Optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:199
llvm::ms_demangle::QualifierMangleMode::Result
@ Result
false
Definition: StackSlotColoring.cpp:142
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
llvm::IntegerType
Class to represent integer types.
Definition: DerivedTypes.h:40
optimizeWithRcp
static Value * optimizeWithRcp(Value *Num, Value *Den, bool AllowInaccurateRcp, bool RcpIsAccurate, IRBuilder<> &Builder, Module *Mod)
Definition: AMDGPUCodeGenPrepare.cpp:640
llvm::Constant::getAllOnesValue
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:405
llvm::BinaryOperator::getOpcode
BinaryOps getOpcode() const
Definition: InstrTypes.h:393
llvm::Instruction
Definition: Instruction.h:45
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:153
llvm::M68kBeads::DA
@ DA
Definition: M68kBaseInfo.h:59
llvm::ConstantFP
ConstantFP - Floating Point Values [float, double].
Definition: Constants.h:257
llvm::UndefValue::get
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1771
llvm::ConstantInt::get
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:900
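ConstantInt::get and UndefValue::get behave uniformly across scalar and vector types, which keeps type-promotion code brief. A minimal sketch; `makeConstants` is an illustrative name:

```cpp
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"

using namespace llvm;

void makeConstants(LLVMContext &Ctx) {
  Type *I32 = Type::getInt32Ty(Ctx);
  Constant *ScalarOne = ConstantInt::get(I32, 1); // i32 1

  Type *V4I32 = FixedVectorType::get(I32, 4);
  Constant *SplatOne = ConstantInt::get(V4I32, 1); // <4 x i32> splat of 1
  Constant *Undef = UndefValue::get(V4I32);        // <4 x i32> undef
  (void)ScalarOne;
  (void)SplatOne;
  (void)Undef;
}
```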
llvm::LegacyDivergenceAnalysis
Definition: LegacyDivergenceAnalysis.h:31
IR
Statically lint-checks LLVM IR.
Definition: Lint.cpp:744
optimizations
AMDGPU IR optimizations
Definition: AMDGPUCodeGenPrepare.cpp:1440
llvm::KnownBits::isNegative
bool isNegative() const
Returns true if this value is known to be negative.
Definition: KnownBits.h:96
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:648
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::Metadata
Root of the metadata hierarchy.
Definition: Metadata.h:62
hasUnsafeFPMath
static bool hasUnsafeFPMath(const Function &F)
Definition: AMDGPUCodeGenPrepare.cpp:837
X
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
llvm::Instruction::getMetadata
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:282
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:389
llvm::LegacyLegalizeActions::Lower
@ Lower
The operation itself must be expressed in terms of simpler actions on this target.
Definition: LegacyLegalizerInfo.h:58
llvm::cl::opt< bool >
llvm::Constant
This is an important base class in LLVM.
Definition: Constant.h:41
llvm::Instruction::eraseFromParent
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:78
llvm::ISD::FMAD
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:470
llvm::ICmpInst
This instruction compares its operands according to the predicate given to the constructor.
Definition: Instructions.h:1203
llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
promotedOpIsNUW
static bool promotedOpIsNUW(const Instruction &I)
Definition: AMDGPUCodeGenPrepare.cpp:293
I
#define I(x, y, z)
Definition: MD5.cpp:59
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
llvm::SelectInst::getTrueValue
const Value * getTrueValue() const
Definition: Instructions.h:1786
TargetPassConfig.h
llvm::Type::isHalfTy
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:141
llvm::computeKnownBits
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, OptimizationRemarkEmitter *ORE=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne bit sets.
Definition: ValueTracking.cpp:213
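This analysis is what backs checks such as "does this operand fit in 24 bits". A minimal sketch using the value-returning overload of computeKnownBits; `isU24Sketch` is an illustrative name, not the pass's helper:

```cpp
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/KnownBits.h"

using namespace llvm;

// True if V is known to fit in 24 unsigned bits: the number of possibly-set
// low bits (width minus known leading zeros) is at most 24.
static bool isU24Sketch(Value *V, const DataLayout &DL) {
  KnownBits Known = computeKnownBits(V, DL);
  unsigned Width = V->getType()->getScalarSizeInBits();
  return Width - Known.countMinLeadingZeros() <= 24;
}
```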
IRBuilder.h
assert
assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef())
llvm::FPMathOperator
Utility class for floating point operations which can have information about relaxed accuracy requirements attached to them.
Definition: Operator.h:250
llvm::KnownBits::countMinLeadingZeros
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:236
llvm::SelectInst
This class represents the LLVM 'select' instruction.
Definition: Instructions.h:1738
Mode
SI Whole Quad Mode
Definition: SIWholeQuadMode.cpp:262
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
Builder
assume Assume Builder
Definition: AssumeBundleBuilder.cpp:650
llvm::User::setOperand
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
llvm::AssumptionCacheTracker
An immutable pass that tracks lazily created AssumptionCache objects.
Definition: AssumptionCache.h:200
llvm::BinaryOperator
Definition: InstrTypes.h:189
llvm::min
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:357
llvm::expandDivisionUpTo64Bits
bool expandDivisionUpTo64Bits(BinaryOperator *Div)
Generate code to divide two integers, replacing Div with the generated code.
Definition: IntegerDivision.cpp:631
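This utility is what the -amdgpu-codegenprepare-expand-div64 path leans on. A minimal sketch of invoking it on a qualifying division; `expandIfWideDiv` is an illustrative wrapper, not the pass's code:

```cpp
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/IntegerDivision.h"

using namespace llvm;

void expandIfWideDiv(BinaryOperator *Div) {
  unsigned Opc = Div->getOpcode();
  if ((Opc == Instruction::SDiv || Opc == Instruction::UDiv) &&
      Div->getType()->isIntegerTy() &&
      Div->getType()->getIntegerBitWidth() <= 64)
    expandDivisionUpTo64Bits(Div); // replaces Div with an expanded IR sequence
}
```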
Cond
SmallVector< MachineOperand, 4 > Cond
Definition: BasicBlockSections.cpp:179
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:58
AMDGPU.h
InstVisitor.h
llvm::AssumptionCache
A cache of @llvm.assume calls within a function.
Definition: AssumptionCache.h:41
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:136
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
llvm::AMDGPUAS::CONSTANT_ADDRESS
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
Definition: AMDGPU.h:353
llvm::Value::replaceAllUsesWith
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:532
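replaceAllUsesWith is the workhorse whenever a widened or expanded value supersedes the original instruction; the usual pattern also transfers the name and erases the dead instruction. A minimal sketch; `replaceInstWith` is an illustrative name:

```cpp
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"

using namespace llvm;

void replaceInstWith(Instruction *Old, Value *New) {
  New->takeName(Old);           // keep the readable IR name, if any
  Old->replaceAllUsesWith(New); // redirect every use to the new value
  Old->eraseFromParent();       // unlink and delete the dead original
}
```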
llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition: ilist_node.h:81
DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: AArch64SLSHardening.cpp:76
llvm::InstVisitor
Base class for instruction visitors.
Definition: InstVisitor.h:79
trunc
We have fiadd patterns now, but the following have the same cost and complexity; we need a way to specify that the latter is more profitable. The FP stackifier should handle simple permutations to reduce the number of shuffles, e.g. trunc
Definition: README-FPStack.txt:63
llvm::FastMathFlags::setFast
void setFast(bool B=true)
Definition: Operator.h:238
llvm::Instruction::copyFastMathFlags
void copyFastMathFlags(FastMathFlags FMF)
Convenience function for transferring all fast-math flag values to this instruction, which must be an operator which supports these flags.
Definition: Instruction.cpp:235
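setFast and copyFastMathFlags are how relaxed FP semantics are applied to newly built instructions. A minimal sketch assuming both instructions are floating-point operations; `relaxAndCopyFlags` is an illustrative name:

```cpp
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/Casting.h"

using namespace llvm;

// Src and Dst are assumed to be FP operations (FPMathOperator).
void relaxAndCopyFlags(Instruction *Src, Instruction *Dst) {
  FastMathFlags FMF;
  FMF.setFast();               // enable every fast-math flag (arcp, afn, ...)
  Src->copyFastMathFlags(FMF); // overwrite Src's FP flags with "fast"
  // Mirror Src's (now fully relaxed) flags onto Dst.
  Dst->copyFastMathFlags(cast<FPMathOperator>(Src)->getFastMathFlags());
}
```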
llvm::CastInst
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:430
llvm::ConstantInt::getSExtValue
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the type of this constant.
Definition: Constants.h:148
llvm::BitsToFloat
float BitsToFloat(uint32_t Bits)
This function takes a 32-bit integer and returns the bit equivalent float.
Definition: MathExtras.h:643
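BitsToFloat is handy when a lowering needs a float with an exact bit pattern rather than a rounded decimal literal. A tiny sketch; the constant shown is simply 1.0f:

```cpp
#include "llvm/Support/MathExtras.h"
#include <cassert>

void bitsToFloatExample() {
  // 0x3f800000 is the IEEE-754 single-precision encoding of 1.0f.
  float One = llvm::BitsToFloat(0x3f800000u);
  assert(One == 1.0f);
  (void)One;
}
```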
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:175
getMulHu
static Value * getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS)
Definition: AMDGPUCodeGenPrepare.cpp:856
runOnFunction
static bool runOnFunction(Function &F, bool PostInlining)
Definition: EntryExitInstrumenter.cpp:69
llvm::expandRemainderUpTo64Bits
bool expandRemainderUpTo64Bits(BinaryOperator *Rem)
Generate code to calculate the remainder of two integers, replacing Rem with the generated code.
Definition: IntegerDivision.cpp:534
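The remainder counterpart of the division expansion above. A minimal sketch; `expandRem` is an illustrative wrapper, and `Rem` is assumed to be a scalar srem/urem of width at most 64 bits:

```cpp
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/IntegerDivision.h"

using namespace llvm;

void expandRem(BinaryOperator *Rem) {
  unsigned Opc = Rem->getOpcode();
  if (Opc == Instruction::SRem || Opc == Instruction::URem)
    expandRemainderUpTo64Bits(Rem); // replaces Rem with shift/subtract IR
}
```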
llvm::AMDGPUISD::RCP
@ RCP
Definition: AMDGPUISelLowering.h:408
llvm::Constant::getNullValue
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:348
llvm::KnownBits
Definition: KnownBits.h:23
llvm::Type::isFloatTy
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:147
llvm::AnalysisUsage::setPreservesAll
void setPreservesAll()
Set by analyses that do not transform their input at all.
Definition: PassAnalysisSupport.h:130
llvm::FPMathOperator::getFastMathFlags
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition: Operator.h:368
findSelectThroughCast
static SelectInst * findSelectThroughCast(Value *V, CastInst *&Cast)
Definition: AMDGPUCodeGenPrepare.cpp:560
llvm::AMDGPU::SendMsg::Op
Op
Definition: SIDefines.h:321
llvm::ConstantFP::get
static Constant * get(Type *Ty, double V)
This returns a ConstantFP, or a vector containing a splat of a ConstantFP, for the specified value in the specified type.
Definition: Constants.cpp:947
llvm::Module::getContext
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:261
LowAndHigh
Metadata * LowAndHigh[]
Definition: NVVMIntrRange.cpp:68
llvm::Type::isDoubleTy
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:150
llvm::ARCCC::Z
@ Z
Definition: ARCInfo.h:41
llvm::MCID::Add
@ Add
Definition: MCInstrDesc.h:183
IntegerDivision.h
llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:45
promotedOpIsNSW
static bool promotedOpIsNSW(const Instruction &I)
Definition: AMDGPUCodeGenPrepare.cpp:279
llvm::createAMDGPUCodeGenPreparePass
FunctionPass * createAMDGPUCodeGenPreparePass()
Definition: AMDGPUCodeGenPrepare.cpp:1445
llvm::Instruction::BinaryOps
BinaryOps
Definition: Instruction.h:785
llvm::FPMathOperator::getFPAccuracy
float getFPAccuracy() const
Get the maximum error permitted by this operation in ULPs.
Definition: Instructions.cpp:2759
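getFPAccuracy reads the !fpmath metadata (returning 0.0 when none is attached), which is how an fdiv advertises how much error it tolerates. A minimal sketch of the kind of gate such a value feeds; `mayRelaxFDiv` and the 2.5 ULP threshold are illustrative:

```cpp
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/Casting.h"

using namespace llvm;

// FDiv is assumed to be a floating-point division.
bool mayRelaxFDiv(const BinaryOperator &FDiv) {
  const auto *FPOp = cast<FPMathOperator>(&FDiv);
  float ReqdAccuracy = FPOp->getFPAccuracy(); // 0.0 if no !fpmath attached
  return FPOp->getFastMathFlags().approxFunc() || ReqdAccuracy >= 2.5f;
}
```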
LegacyDivergenceAnalysis.h
WidenLoads
static cl::opt< bool > WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads", cl::desc("Widen sub-dword constant address space loads in " "AMDGPULateCodeGenPrepare"), cl::ReallyHidden, cl::init(true))
llvm::Instruction::getDebugLoc
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:370
isU24
static bool isU24(SDValue Op, SelectionDAG &DAG)
Definition: AMDGPUISelLowering.cpp:2885
Dominators.h
getSign32
static Value * getSign32(Value *V, IRBuilder<> &Builder, const DataLayout *DL)
Definition: AMDGPUCodeGenPrepare.cpp:1030
N
#define N
llvm::CastInst::getOpcode
Instruction::CastOps getOpcode() const
Return the opcode of this CastInst.
Definition: InstrTypes.h:677
llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:94
llvm::IntegerType::getBitWidth
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
Definition: DerivedTypes.h:72
llvm::FunctionCallee
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single entity.
Definition: DerivedTypes.h:165
llvm::SmallVectorImpl< Value * >
llvm::Module::getDataLayout
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:401
TM
const char LLVMTargetMachineRef TM
Definition: PassBuilderBindings.cpp:47
llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:298
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1475
BB
Successors according to LLVM BB (fragment of an ARM register-allocation note; the surrounding assembly listing is elided).
Definition: README.txt:39
llvm::isKnownToBeAPowerOfTwo
bool isKnownToBeAPowerOfTwo(const Value *V, const DataLayout &DL, bool OrZero=false, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return true if the given value is known to have exactly one bit set when defined.
Definition: ValueTracking.cpp:292
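A power-of-two divisor is the classic case where an integer division can be strength-reduced to a shift. A minimal sketch of querying ValueTracking; `divisorIsPow2` is an illustrative name:

```cpp
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Value.h"

using namespace llvm;

bool divisorIsPow2(Value *Den, const DataLayout &DL) {
  // OrZero=true also accepts values that may be zero but otherwise have
  // exactly one bit set.
  return isKnownToBeAPowerOfTwo(Den, DL, /*OrZero=*/true);
}
```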
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::Value::takeName
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:382
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
llvm::cl::desc
Definition: CommandLine.h:414
Mod
Module * Mod
Definition: PassBuilderBindings.cpp:54
InitializePasses.h
llvm::Value
LLVM Value Representation.
Definition: Value.h:75
AMDGPUTargetMachine.h
llvm::Function::iterator
BasicBlockListType::iterator iterator
Definition: Function.h:67
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:37
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations", false, false) INITIALIZE_PASS_END(AMDGPUCodeGenPrepare