LLVM  15.0.0git
AMDGPUCodeGenPrepare.cpp
1 //===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass does misc. AMDGPU optimizations on IR before instruction
11 /// selection.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPU.h"
16 #include "AMDGPUTargetMachine.h"
22 #include "llvm/IR/Dominators.h"
23 #include "llvm/IR/InstVisitor.h"
24 #include "llvm/IR/IntrinsicsAMDGPU.h"
25 #include "llvm/IR/IRBuilder.h"
26 #include "llvm/InitializePasses.h"
27 #include "llvm/Pass.h"
28 #include "llvm/Support/KnownBits.h"
30 
31 #define DEBUG_TYPE "amdgpu-codegenprepare"
32 
33 using namespace llvm;
34 
35 namespace {
36 
38  "amdgpu-codegenprepare-widen-constant-loads",
39  cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
41  cl::init(false));
42 
43 static cl::opt<bool> Widen16BitOps(
44  "amdgpu-codegenprepare-widen-16-bit-ops",
45  cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
46  cl::ReallyHidden,
47  cl::init(true));
48 
49 static cl::opt<bool> UseMul24Intrin(
50  "amdgpu-codegenprepare-mul24",
51  cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
52  cl::ReallyHidden,
53  cl::init(true));
54 
55 // Legalize 64-bit division by using the generic IR expansion.
56 static cl::opt<bool> ExpandDiv64InIR(
57  "amdgpu-codegenprepare-expand-div64",
58  cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
59  cl::ReallyHidden,
60  cl::init(false));
61 
62 // Leave all division operations as they are. This supersedes ExpandDiv64InIR
63 // and is used for testing the legalizer.
64 static cl::opt<bool> DisableIDivExpand(
65  "amdgpu-codegenprepare-disable-idiv-expansion",
66  cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
67  cl::ReallyHidden,
68  cl::init(false));
69 
70 class AMDGPUCodeGenPrepare : public FunctionPass,
71  public InstVisitor<AMDGPUCodeGenPrepare, bool> {
72  const GCNSubtarget *ST = nullptr;
73  AssumptionCache *AC = nullptr;
74  DominatorTree *DT = nullptr;
75  LegacyDivergenceAnalysis *DA = nullptr;
76  Module *Mod = nullptr;
77  const DataLayout *DL = nullptr;
78  bool HasUnsafeFPMath = false;
79  bool HasFP32Denormals = false;
80 
81  /// \returns \p T's base element bit width.
86  unsigned getBaseElementBitWidth(const Type *T) const;
87 
88  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
89  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
90  /// is returned.
91  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;
92 
93  /// \returns True if binary operation \p I is a signed binary operation, false
94  /// otherwise.
95  bool isSigned(const BinaryOperator &I) const;
96 
97  /// \returns True if the condition of 'select' operation \p I comes from a
98  /// signed 'icmp' operation, false otherwise.
99  bool isSigned(const SelectInst &I) const;
100 
101  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
102  /// false otherwise.
103  bool needsPromotionToI32(const Type *T) const;
104 
105  /// Promotes uniform binary operation \p I to equivalent 32 bit binary
106  /// operation.
107  ///
108  /// \details \p I's base element bit width must be greater than 1 and less
109 /// than or equal to 16. Promotion is done by sign or zero extending operands to
110  /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
111  /// truncating the result of 32 bit binary operation back to \p I's original
112 /// type. The 32 bit operation is given nsw/nuw/exact flags where they are known to hold. Division operations are not promoted.
113  ///
114  /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
115  /// false otherwise.
116  bool promoteUniformOpToI32(BinaryOperator &I) const;
117 
118  /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
119  ///
120  /// \details \p I's base element bit width must be greater than 1 and less
121 /// than or equal to 16. Promotion is done by sign or zero extending operands to
122  /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
123  ///
124  /// \returns True.
125  bool promoteUniformOpToI32(ICmpInst &I) const;
126 
127  /// Promotes uniform 'select' operation \p I to 32 bit 'select'
128  /// operation.
129  ///
130  /// \details \p I's base element bit width must be greater than 1 and less
131 /// than or equal to 16. Promotion is done by sign or zero extending operands to
132  /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
133  /// result of 32 bit 'select' operation back to \p I's original type.
134  ///
135  /// \returns True.
136  bool promoteUniformOpToI32(SelectInst &I) const;
137 
138  /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
139  /// intrinsic.
140  ///
141  /// \details \p I's base element bit width must be greater than 1 and less
142 /// than or equal to 16. Promotion is done by zero extending the operand to 32
143  /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
144  /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
145  /// shift amount is 32 minus \p I's base element bit width), and truncating
146  /// the result of the shift operation back to \p I's original type.
147  ///
148  /// \returns True.
149  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
150 
151  /// \returns The minimum number of bits needed to store the value of \p Op as an
152  /// unsigned integer. Truncating to this size and then zero-extending to
153  /// the original will not change the value.
154  unsigned numBitsUnsigned(Value *Op) const;
155 
156  /// \returns The minimum number of bits needed to store the value of \p Op as a
157  /// signed integer. Truncating to this size and then sign-extending to
158  /// the original size will not change the value.
159  unsigned numBitsSigned(Value *Op) const;
160 
161  /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.i24
162  /// when the operands are known to fit in 24 bits; SelectionDAG does not always recognize this from an 'and' asserting the known bits.
163  bool replaceMulWithMul24(BinaryOperator &I) const;
164 
165  /// Perform the same fold as the equivalently named function in DAGCombiner.
166  /// Since we expand some divisions here, we need to perform this before that expansion obscures the select.
167  bool foldBinOpIntoSelect(BinaryOperator &I) const;
168 
169  bool divHasSpecialOptimization(BinaryOperator &I,
170  Value *Num, Value *Den) const;
171  int getDivNumBits(BinaryOperator &I,
172  Value *Num, Value *Den,
173  unsigned AtLeast, bool Signed) const;
174 
175  /// Expands 24 bit div or rem.
176  Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
177  Value *Num, Value *Den,
178  bool IsDiv, bool IsSigned) const;
179 
180  Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
181  Value *Num, Value *Den, unsigned NumBits,
182  bool IsDiv, bool IsSigned) const;
183 
184  /// Expands 32 bit div or rem.
185  Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
186  Value *Num, Value *Den) const;
187 
188  Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
189  Value *Num, Value *Den) const;
190  void expandDivRem64(BinaryOperator &I) const;
191 
192  /// Widen a scalar load.
193  ///
194  /// \details Widen a uniform, small-type load from constant memory to a full
195  /// 32 bits and then truncate the result, so that a scalar load can be used
196  /// instead of a vector load.
197  ///
198  /// \returns True if the load can be widened.
199 
200  bool canWidenScalarExtLoad(LoadInst &I) const;
201 
202 public:
203  static char ID;
204 
205  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
206 
207  bool visitFDiv(BinaryOperator &I);
208  bool visitXor(BinaryOperator &I);
209 
210  bool visitInstruction(Instruction &I) { return false; }
211  bool visitBinaryOperator(BinaryOperator &I);
212  bool visitLoadInst(LoadInst &I);
213  bool visitICmpInst(ICmpInst &I);
214  bool visitSelectInst(SelectInst &I);
215 
216  bool visitIntrinsicInst(IntrinsicInst &I);
217  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
218 
219  bool doInitialization(Module &M) override;
220  bool runOnFunction(Function &F) override;
221 
222  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
223 
224  void getAnalysisUsage(AnalysisUsage &AU) const override {
225  AU.addRequired<AssumptionCacheTracker>();
226  AU.addRequired<LegacyDivergenceAnalysis>();
227 
228  // FIXME: Division expansion needs to preserve the dominator tree.
229  if (!ExpandDiv64InIR)
230  AU.setPreservesAll();
231  }
232 };
233 
234 } // end anonymous namespace
235 
236 unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
237  assert(needsPromotionToI32(T) && "T does not need promotion to i32");
238 
239  if (T->isIntegerTy())
240  return T->getIntegerBitWidth();
241  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
242 }
243 
244 Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
245  assert(needsPromotionToI32(T) && "T does not need promotion to i32");
246 
247  if (T->isIntegerTy())
248  return B.getInt32Ty();
249  return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T));
250 }
251 
252 bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
253  return I.getOpcode() == Instruction::AShr ||
254  I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
255 }
256 
257 bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
258  return isa<ICmpInst>(I.getOperand(0)) ?
259  cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
260 }
261 
262 bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
263  if (!Widen16BitOps)
264  return false;
265 
266  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
267  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
268  return true;
269 
270  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
271  // TODO: The set of packed operations is more limited, so may want to
272  // promote some anyway.
273  if (ST->hasVOP3PInsts())
274  return false;
275 
276  return needsPromotionToI32(VT->getElementType());
277  }
278 
279  return false;
280 }
281 
282 // Return true if the op promoted to i32 should have nsw set.
283 static bool promotedOpIsNSW(const Instruction &I) {
284  switch (I.getOpcode()) {
285  case Instruction::Shl:
286  case Instruction::Add:
287  case Instruction::Sub:
288  return true;
289  case Instruction::Mul:
290  return I.hasNoUnsignedWrap();
291  default:
292  return false;
293  }
294 }
295 
296 // Return true if the op promoted to i32 should have nuw set.
297 static bool promotedOpIsNUW(const Instruction &I) {
298  switch (I.getOpcode()) {
299  case Instruction::Shl:
300  case Instruction::Add:
301  case Instruction::Mul:
302  return true;
303  case Instruction::Sub:
304  return I.hasNoUnsignedWrap();
305  default:
306  return false;
307  }
308 }
309 
310 bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
311  Type *Ty = I.getType();
312  const DataLayout &DL = Mod->getDataLayout();
313  int TySize = DL.getTypeSizeInBits(Ty);
314  Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);
315 
316  return I.isSimple() && TySize < 32 && Alignment >= 4 && DA->isUniform(&I);
317 }
318 
319 bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
320  assert(needsPromotionToI32(I.getType()) &&
321  "I does not need promotion to i32");
322 
323  if (I.getOpcode() == Instruction::SDiv ||
324  I.getOpcode() == Instruction::UDiv ||
325  I.getOpcode() == Instruction::SRem ||
326  I.getOpcode() == Instruction::URem)
327  return false;
328 
329  IRBuilder<> Builder(&I);
330  Builder.SetCurrentDebugLocation(I.getDebugLoc());
331 
332  Type *I32Ty = getI32Ty(Builder, I.getType());
333  Value *ExtOp0 = nullptr;
334  Value *ExtOp1 = nullptr;
335  Value *ExtRes = nullptr;
336  Value *TruncRes = nullptr;
337 
338  if (isSigned(I)) {
339  ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
340  ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
341  } else {
342  ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
343  ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
344  }
345 
346  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
347  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
348  if (promotedOpIsNSW(cast<Instruction>(I)))
349  Inst->setHasNoSignedWrap();
350 
351  if (promotedOpIsNUW(cast<Instruction>(I)))
352  Inst->setHasNoUnsignedWrap();
353 
354  if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
355  Inst->setIsExact(ExactOp->isExact());
356  }
357 
358  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
359 
360  I.replaceAllUsesWith(TruncRes);
361  I.eraseFromParent();
362 
363  return true;
364 }
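// For illustration, a uniform 'add i16 %a, %b' is promoted roughly as:
//   %a32 = zext i16 %a to i32
//   %b32 = zext i16 %b to i32
//   %r32 = add nuw nsw i32 %a32, %b32
//   %r   = trunc i32 %r32 to i16
// An ashr would use sext instead of zext (see isSigned); div/rem bail out
// before reaching this point.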
365 
366 bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
367  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
368  "I does not need promotion to i32");
369 
370  IRBuilder<> Builder(&I);
371  Builder.SetCurrentDebugLocation(I.getDebugLoc());
372 
373  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
374  Value *ExtOp0 = nullptr;
375  Value *ExtOp1 = nullptr;
376  Value *NewICmp = nullptr;
377 
378  if (I.isSigned()) {
379  ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
380  ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
381  } else {
382  ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
383  ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
384  }
385  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);
386 
387  I.replaceAllUsesWith(NewICmp);
388  I.eraseFromParent();
389 
390  return true;
391 }
392 
393 bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
394  assert(needsPromotionToI32(I.getType()) &&
395  "I does not need promotion to i32");
396 
397  IRBuilder<> Builder(&I);
398  Builder.SetCurrentDebugLocation(I.getDebugLoc());
399 
400  Type *I32Ty = getI32Ty(Builder, I.getType());
401  Value *ExtOp1 = nullptr;
402  Value *ExtOp2 = nullptr;
403  Value *ExtRes = nullptr;
404  Value *TruncRes = nullptr;
405 
406  if (isSigned(I)) {
407  ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
408  ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
409  } else {
410  ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
411  ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
412  }
413  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
414  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
415 
416  I.replaceAllUsesWith(TruncRes);
417  I.eraseFromParent();
418 
419  return true;
420 }
421 
422 bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
423  IntrinsicInst &I) const {
424  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
425  "I must be bitreverse intrinsic");
426  assert(needsPromotionToI32(I.getType()) &&
427  "I does not need promotion to i32");
428 
429  IRBuilder<> Builder(&I);
430  Builder.SetCurrentDebugLocation(I.getDebugLoc());
431 
432  Type *I32Ty = getI32Ty(Builder, I.getType());
433  Function *I32 =
434  Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
435  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
436  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
437  Value *LShrOp =
438  Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
439  Value *TruncRes =
440  Builder.CreateTrunc(LShrOp, I.getType());
441 
442  I.replaceAllUsesWith(TruncRes);
443  I.eraseFromParent();
444 
445  return true;
446 }
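// For illustration, a uniform 'call i16 @llvm.bitreverse.i16(i16 %x)' is
// promoted roughly as:
//   %e = zext i16 %x to i32
//   %r = call i32 @llvm.bitreverse.i32(i32 %e)
//   %s = lshr i32 %r, 16        ; 32 minus the base element bit width
//   %t = trunc i32 %s to i16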
447 
448 unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op) const {
449  return computeKnownBits(Op, *DL, 0, AC).countMaxActiveBits();
450 }
451 
452 unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op) const {
453  return ComputeMaxSignificantBits(Op, *DL, 0, AC);
454 }
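// For example, given '%x = and i32 %y, 255', numBitsUnsigned(%x) returns 8,
// since computeKnownBits proves the upper 24 bits are zero.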
455 
456 static void extractValues(IRBuilder<> &Builder,
457  SmallVectorImpl<Value *> &Values, Value *V) {
458  auto *VT = dyn_cast<FixedVectorType>(V->getType());
459  if (!VT) {
460  Values.push_back(V);
461  return;
462  }
463 
464  for (int I = 0, E = VT->getNumElements(); I != E; ++I)
465  Values.push_back(Builder.CreateExtractElement(V, I));
466 }
467 
468 static Value *insertValues(IRBuilder<> &Builder,
469  Type *Ty,
470  SmallVectorImpl<Value *> &Values) {
471  if (Values.size() == 1)
472  return Values[0];
473 
474  Value *NewVal = UndefValue::get(Ty);
475  for (int I = 0, E = Values.size(); I != E; ++I)
476  NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);
477 
478  return NewVal;
479 }
480 
481 // Returns 24-bit or 48-bit (as per `NumBits` and `Size`) mul of `LHS` and
482 // `RHS`. `NumBits` is the known bit width of the result and `Size` is the
483 // width of the original destination.
484 static Value *getMul24(IRBuilder<> &Builder, Value *LHS, Value *RHS,
485  unsigned Size, unsigned NumBits, bool IsSigned) {
486  if (Size <= 32 || NumBits <= 32) {
487  Intrinsic::ID ID =
488  IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
489  return Builder.CreateIntrinsic(ID, {}, {LHS, RHS});
490  }
491 
492  assert(NumBits <= 48);
493 
494  Intrinsic::ID LoID =
495  IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
496  Intrinsic::ID HiID =
497  IsSigned ? Intrinsic::amdgcn_mulhi_i24 : Intrinsic::amdgcn_mulhi_u24;
498 
499  Value *Lo = Builder.CreateIntrinsic(LoID, {}, {LHS, RHS});
500  Value *Hi = Builder.CreateIntrinsic(HiID, {}, {LHS, RHS});
501 
502  IntegerType *I64Ty = Builder.getInt64Ty();
503  Lo = Builder.CreateZExtOrTrunc(Lo, I64Ty);
504  Hi = Builder.CreateZExtOrTrunc(Hi, I64Ty);
505 
506  return Builder.CreateOr(Lo, Builder.CreateShl(Hi, 32));
507 }
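// For illustration, when Size > 32 and NumBits <= 48, the unsigned case above
// assembles the product from the low and high 24-bit multiplies, roughly:
//   %lo   = call i32 @llvm.amdgcn.mul.u24(i32 %a, i32 %b)
//   %hi   = call i32 @llvm.amdgcn.mulhi.u24(i32 %a, i32 %b)
//   %lo64 = zext i32 %lo to i64
//   %hi64 = zext i32 %hi to i64
//   %shl  = shl i64 %hi64, 32
//   %r    = or i64 %lo64, %shl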
508 
509 bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
510  if (I.getOpcode() != Instruction::Mul)
511  return false;
512 
513  Type *Ty = I.getType();
514  unsigned Size = Ty->getScalarSizeInBits();
515  if (Size <= 16 && ST->has16BitInsts())
516  return false;
517 
518  // Prefer scalar if this could be s_mul_i32
519  if (DA->isUniform(&I))
520  return false;
521 
522  Value *LHS = I.getOperand(0);
523  Value *RHS = I.getOperand(1);
524  IRBuilder<> Builder(&I);
525  Builder.SetCurrentDebugLocation(I.getDebugLoc());
526 
527  unsigned LHSBits = 0, RHSBits = 0;
528  bool IsSigned = false;
529 
530  if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&
531  (RHSBits = numBitsUnsigned(RHS)) <= 24) {
532  IsSigned = false;
533 
534  } else if (ST->hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&
535  (RHSBits = numBitsSigned(RHS)) <= 24) {
536  IsSigned = true;
537 
538  } else
539  return false;
540 
541  SmallVector<Value *, 4> LHSVals;
542  SmallVector<Value *, 4> RHSVals;
543  SmallVector<Value *, 4> ResultVals;
544  extractValues(Builder, LHSVals, LHS);
545  extractValues(Builder, RHSVals, RHS);
546 
547  IntegerType *I32Ty = Builder.getInt32Ty();
548  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
549  Value *LHS, *RHS;
550  if (IsSigned) {
551  LHS = Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty);
552  RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty);
553  } else {
554  LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
555  RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
556  }
557 
558  Value *Result =
559  getMul24(Builder, LHS, RHS, Size, LHSBits + RHSBits, IsSigned);
560 
561  if (IsSigned) {
562  ResultVals.push_back(
563  Builder.CreateSExtOrTrunc(Result, LHSVals[I]->getType()));
564  } else {
565  ResultVals.push_back(
566  Builder.CreateZExtOrTrunc(Result, LHSVals[I]->getType()));
567  }
568  }
569 
570  Value *NewVal = insertValues(Builder, Ty, ResultVals);
571  NewVal->takeName(&I);
572  I.replaceAllUsesWith(NewVal);
573  I.eraseFromParent();
574 
575  return true;
576 }
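// E.g. a divergent 'mul i32 %x, %y' whose operands are both known to fit in
// 24 bits becomes a call to @llvm.amdgcn.mul.u24 (or @llvm.amdgcn.mul.i24 in
// the signed case), extended or truncated back to the original element type.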
577 
578 // Find a select instruction, which may have been cast. This is mostly to deal
579 // with cases where i16 selects were promoted here to i32.
580 static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
581  Cast = nullptr;
582  if (SelectInst *Sel = dyn_cast<SelectInst>(V))
583  return Sel;
584 
585  if ((Cast = dyn_cast<CastInst>(V))) {
586  if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
587  return Sel;
588  }
589 
590  return nullptr;
591 }
592 
593 bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
594  // Don't do this unless the old select is going away. We want to eliminate the
595  // binary operator, not replace a binop with a select.
596  int SelOpNo = 0;
597 
598  CastInst *CastOp;
599 
600  // TODO: Should probably try to handle some cases with multiple
601  // users. Duplicating the select may be profitable for division.
602  SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
603  if (!Sel || !Sel->hasOneUse()) {
604  SelOpNo = 1;
605  Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
606  }
607 
608  if (!Sel || !Sel->hasOneUse())
609  return false;
610 
611  Constant *CT = dyn_cast<Constant>(Sel->getTrueValue());
612  Constant *CF = dyn_cast<Constant>(Sel->getFalseValue());
613  Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
614  if (!CBO || !CT || !CF)
615  return false;
616 
617  if (CastOp) {
618  if (!CastOp->hasOneUse())
619  return false;
620  CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), *DL);
621  CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), *DL);
622  }
623 
624  // TODO: Handle special 0/-1 cases DAG combine does, although we only really
625  // need to handle divisions here.
626  Constant *FoldedT = SelOpNo ?
627  ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, *DL) :
628  ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, *DL);
629  if (isa<ConstantExpr>(FoldedT))
630  return false;
631 
632  Constant *FoldedF = SelOpNo ?
633  ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, *DL) :
634  ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, *DL);
635  if (isa<ConstantExpr>(FoldedF))
636  return false;
637 
638  IRBuilder<> Builder(&BO);
639  Builder.SetCurrentDebugLocation(BO.getDebugLoc());
640  if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
641  Builder.setFastMathFlags(FPOp->getFastMathFlags());
642 
643  Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
644  FoldedT, FoldedF);
645  NewSelect->takeName(&BO);
646  BO.replaceAllUsesWith(NewSelect);
647  BO.eraseFromParent();
648  if (CastOp)
649  CastOp->eraseFromParent();
650  Sel->eraseFromParent();
651  return true;
652 }
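// For illustration, with a single-use select feeding a binop whose other
// operand is constant:
//   %s = select i1 %c, i32 8, i32 16
//   %d = udiv i32 %s, 4
// folds to:
//   %d = select i1 %c, i32 2, i32 4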
653 
654 // Optimize fdiv with rcp:
655 //
656 // 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
657 // allowed with unsafe-fp-math or afn.
658 //
659 // a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
660 static Value *optimizeWithRcp(Value *Num, Value *Den, bool AllowInaccurateRcp,
661  bool RcpIsAccurate, IRBuilder<> &Builder,
662  Module *Mod) {
663 
664  if (!AllowInaccurateRcp && !RcpIsAccurate)
665  return nullptr;
666 
667  Type *Ty = Den->getType();
668  if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
669  if (AllowInaccurateRcp || RcpIsAccurate) {
670  if (CLHS->isExactlyValue(1.0)) {
671  Function *Decl = Intrinsic::getDeclaration(
672  Mod, Intrinsic::amdgcn_rcp, Ty);
673 
674  // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
675  // the CI documentation has a worst case error of 1 ulp.
676  // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
677  // use it as long as we aren't trying to use denormals.
678  //
679  // v_rcp_f16 and v_rsq_f16 DO support denormals.
680 
681  // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
682  // insert rsq intrinsic here.
683 
684  // 1.0 / x -> rcp(x)
685  return Builder.CreateCall(Decl, { Den });
686  }
687 
688  // Same as for 1.0, but expand the sign out of the constant.
689  if (CLHS->isExactlyValue(-1.0)) {
690  Function *Decl = Intrinsic::getDeclaration(
691  Mod, Intrinsic::amdgcn_rcp, Ty);
692 
693  // -1.0 / x -> rcp (fneg x)
694  Value *FNeg = Builder.CreateFNeg(Den);
695  return Builder.CreateCall(Decl, { FNeg });
696  }
697  }
698  }
699 
700  if (AllowInaccurateRcp) {
701  Function *Decl = Intrinsic::getDeclaration(
702  Mod, Intrinsic::amdgcn_rcp, Ty);
703 
704  // Turn into multiply by the reciprocal.
705  // x / y -> x * (1.0 / y)
706  Value *Recip = Builder.CreateCall(Decl, { Den });
707  return Builder.CreateFMul(Num, Recip);
708  }
709  return nullptr;
710 }
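// For illustration, assuming inaccurate rcp is allowed (afn/unsafe-fp-math)
// or, for the +/-1.0 numerators, rcp is accurate enough for !fpmath:
//   fdiv float 1.0, %x   ->  call float @llvm.amdgcn.rcp.f32(float %x)
//   fdiv float -1.0, %x  ->  rcp of (fneg float %x)
//   fdiv float %a, %b    ->  fmul float %a, rcp(%b)   ; inaccurate rcp only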
711 
712 // optimize with fdiv.fast:
713 //
714 // a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
715 //
716 // 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
717 //
718 // NOTE: optimizeWithRcp should be tried first because rcp is the preference.
719 static Value *optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy,
720  bool HasDenormals, IRBuilder<> &Builder,
721  Module *Mod) {
722  // fdiv.fast can achieve 2.5 ULP accuracy.
723  if (ReqdAccuracy < 2.5f)
724  return nullptr;
725 
726  // Only have fdiv.fast for f32.
727  Type *Ty = Den->getType();
728  if (!Ty->isFloatTy())
729  return nullptr;
730 
731  bool NumIsOne = false;
732  if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
733  if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
734  NumIsOne = true;
735  }
736 
737  // fdiv.fast does not support denormals, but 1.0/x is always fine to use.
738  if (HasDenormals && !NumIsOne)
739  return nullptr;
740 
741  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
742  return Builder.CreateCall(Decl, { Num, Den });
743 }
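// For illustration, an f32 fdiv carrying !fpmath metadata of 2.5 ulp or more
// (with f32 denormals flushed, or a +/-1.0 numerator) becomes roughly:
//   %q = call float @llvm.amdgcn.fdiv.fast(float %a, float %b)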
744 
745 // Optimization is performed based on fpmath, fast-math flags and the denormal
746 // mode to rewrite fdiv with either rcp or fdiv.fast.
747 //
748 // With rcp:
749 // 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
750 // allowed with unsafe-fp-math or afn.
751 //
752 // a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
753 //
754 // With fdiv.fast:
755 // a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
756 //
757 // 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
758 //
759 // NOTE: rcp is the preference in cases that both are legal.
760 bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
761 
762  Type *Ty = FDiv.getType()->getScalarType();
763 
764  // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
765  // expansion around them in codegen.
766  if (Ty->isDoubleTy())
767  return false;
768 
769  // No intrinsic for fdiv16 if target does not support f16.
770  if (Ty->isHalfTy() && !ST->has16BitInsts())
771  return false;
772 
773  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
774  const float ReqdAccuracy = FPOp->getFPAccuracy();
775 
776  // Inaccurate rcp is allowed with unsafe-fp-math or afn.
777  FastMathFlags FMF = FPOp->getFastMathFlags();
778  const bool AllowInaccurateRcp = HasUnsafeFPMath || FMF.approxFunc();
779 
780  // rcp_f16 is accurate for !fpmath >= 1.0ulp.
781  // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
782  // rcp_f64 is never accurate.
783  const bool RcpIsAccurate = (Ty->isHalfTy() && ReqdAccuracy >= 1.0f) ||
784  (Ty->isFloatTy() && !HasFP32Denormals && ReqdAccuracy >= 1.0f);
785 
786  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
787  Builder.setFastMathFlags(FMF);
788  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
789 
790  Value *Num = FDiv.getOperand(0);
791  Value *Den = FDiv.getOperand(1);
792 
793  Value *NewFDiv = nullptr;
794  if (auto *VT = dyn_cast<FixedVectorType>(FDiv.getType())) {
795  NewFDiv = UndefValue::get(VT);
796 
797  // FIXME: Doesn't do the right thing for cases where the vector is partially
798  // constant. This works when the scalarizer pass is run first.
799  for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
800  Value *NumEltI = Builder.CreateExtractElement(Num, I);
801  Value *DenEltI = Builder.CreateExtractElement(Den, I);
802  // Try rcp first.
803  Value *NewElt = optimizeWithRcp(NumEltI, DenEltI, AllowInaccurateRcp,
804  RcpIsAccurate, Builder, Mod);
805  if (!NewElt) // Try fdiv.fast.
806  NewElt = optimizeWithFDivFast(NumEltI, DenEltI, ReqdAccuracy,
807  HasFP32Denormals, Builder, Mod);
808  if (!NewElt) // Keep the original.
809  NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
810 
811  NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
812  }
813  } else { // Scalar FDiv.
814  // Try rcp first.
815  NewFDiv = optimizeWithRcp(Num, Den, AllowInaccurateRcp, RcpIsAccurate,
816  Builder, Mod);
817  if (!NewFDiv) { // Try fdiv.fast.
818  NewFDiv = optimizeWithFDivFast(Num, Den, ReqdAccuracy, HasFP32Denormals,
819  Builder, Mod);
820  }
821  }
822 
823  if (NewFDiv) {
824  FDiv.replaceAllUsesWith(NewFDiv);
825  NewFDiv->takeName(&FDiv);
826  FDiv.eraseFromParent();
827  }
828 
829  return !!NewFDiv;
830 }
831 
832 bool AMDGPUCodeGenPrepare::visitXor(BinaryOperator &I) {
833  // Match the Xor instruction, its type and its operands
834  IntrinsicInst *IntrinsicCall = dyn_cast<IntrinsicInst>(I.getOperand(0));
835  ConstantInt *RHS = dyn_cast<ConstantInt>(I.getOperand(1));
836  if (!RHS || !IntrinsicCall || RHS->getSExtValue() != -1)
837  return visitBinaryOperator(I);
838 
839  // Check that the call is to the amdgcn.class intrinsic and that it has
840  // only one use.
841  if (IntrinsicCall->getIntrinsicID() != Intrinsic::amdgcn_class ||
842  !IntrinsicCall->hasOneUse())
843  return visitBinaryOperator(I);
844 
845  // "Not" the second argument of the intrinsic call
846  ConstantInt *Arg = dyn_cast<ConstantInt>(IntrinsicCall->getOperand(1));
847  if (!Arg)
848  return visitBinaryOperator(I);
849 
850  IntrinsicCall->setOperand(
851  1, ConstantInt::get(Arg->getType(), Arg->getZExtValue() ^ 0x3ff));
852  I.replaceAllUsesWith(IntrinsicCall);
853  I.eraseFromParent();
854  return true;
855 }
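// For illustration, a 'not' of a single-use class test such as:
//   %c = call i1 @llvm.amdgcn.class.f32(float %x, i32 3)
//   %n = xor i1 %c, true
// is rewritten by inverting the 10-bit test mask instead:
//   %n = call i1 @llvm.amdgcn.class.f32(float %x, i32 1020)   ; 3 ^ 0x3ff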
856 
857 static bool hasUnsafeFPMath(const Function &F) {
858  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
859  return Attr.getValueAsBool();
860 }
861 
862 static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
863  Value *LHS, Value *RHS) {
864  Type *I32Ty = Builder.getInt32Ty();
865  Type *I64Ty = Builder.getInt64Ty();
866 
867  Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
868  Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
869  Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
870  Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
871  Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
872  Hi = Builder.CreateTrunc(Hi, I32Ty);
873  return std::make_pair(Lo, Hi);
874 }
875 
876 static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
877  return getMul64(Builder, LHS, RHS).second;
878 }
879 
880 /// Figure out how many bits are really needed for this division. \p AtLeast is
881 /// an optimization hint to bypass the second ComputeNumSignBits call if the
882 /// first one is insufficient. Returns -1 on failure.
883 int AMDGPUCodeGenPrepare::getDivNumBits(BinaryOperator &I,
884  Value *Num, Value *Den,
885  unsigned AtLeast, bool IsSigned) const {
886  const DataLayout &DL = Mod->getDataLayout();
887  unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
888  if (LHSSignBits < AtLeast)
889  return -1;
890 
891  unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
892  if (RHSSignBits < AtLeast)
893  return -1;
894 
895  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
896  unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits;
897  if (IsSigned)
898  ++DivBits;
899  return DivBits;
900 }
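// Worked example: for an i64 sdiv whose operands each have at least 40 known
// sign bits, SignBits >= 40 and DivBits = 64 - 40 + 1 = 25, so shrinkDivRem64
// can use the 32-bit expansion instead of the general 64-bit lowering.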
901 
902 // The fractional part of a float is enough to accurately represent up to
903 // a 24-bit signed integer.
904 Value *AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
905  BinaryOperator &I,
906  Value *Num, Value *Den,
907  bool IsDiv, bool IsSigned) const {
908  int DivBits = getDivNumBits(I, Num, Den, 9, IsSigned);
909  if (DivBits == -1)
910  return nullptr;
911  return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
912 }
913 
914 Value *AMDGPUCodeGenPrepare::expandDivRem24Impl(IRBuilder<> &Builder,
915  BinaryOperator &I,
916  Value *Num, Value *Den,
917  unsigned DivBits,
918  bool IsDiv, bool IsSigned) const {
919  Type *I32Ty = Builder.getInt32Ty();
920  Num = Builder.CreateTrunc(Num, I32Ty);
921  Den = Builder.CreateTrunc(Den, I32Ty);
922 
923  Type *F32Ty = Builder.getFloatTy();
924  ConstantInt *One = Builder.getInt32(1);
925  Value *JQ = One;
926 
927  if (IsSigned) {
928  // char|short jq = ia ^ ib;
929  JQ = Builder.CreateXor(Num, Den);
930 
931  // jq = jq >> (bitsize - 2)
932  JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));
933 
934  // jq = jq | 0x1
935  JQ = Builder.CreateOr(JQ, One);
936  }
937 
938  // int ia = (int)LHS;
939  Value *IA = Num;
940 
941  // int ib = (int)RHS;
942  Value *IB = Den;
943 
944  // float fa = (float)ia;
945  Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
946  : Builder.CreateUIToFP(IA, F32Ty);
947 
948  // float fb = (float)ib;
949  Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty)
950  : Builder.CreateUIToFP(IB,F32Ty);
951 
952  Function *RcpDecl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp,
953  Builder.getFloatTy());
954  Value *RCP = Builder.CreateCall(RcpDecl, { FB });
955  Value *FQM = Builder.CreateFMul(FA, RCP);
956 
957  // fq = trunc(fqm);
958  CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
959  FQ->copyFastMathFlags(Builder.getFastMathFlags());
960 
961  // float fqneg = -fq;
962  Value *FQNeg = Builder.CreateFNeg(FQ);
963 
964  // float fr = mad(fqneg, fb, fa);
965  auto FMAD = !ST->hasMadMacF32Insts()
966  ? Intrinsic::fma
967  : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
968  Value *FR = Builder.CreateIntrinsic(FMAD,
969  {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);
970 
971  // int iq = (int)fq;
972  Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
973  : Builder.CreateFPToUI(FQ, I32Ty);
974 
975  // fr = fabs(fr);
976  FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);
977 
978  // fb = fabs(fb);
979  FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);
980 
981  // int cv = fr >= fb;
982  Value *CV = Builder.CreateFCmpOGE(FR, FB);
983 
984  // jq = (cv ? jq : 0);
985  JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));
986 
987  // dst = iq + jq;
988  Value *Div = Builder.CreateAdd(IQ, JQ);
989 
990  Value *Res = Div;
991  if (!IsDiv) {
992  // Rem needs compensation, it's easier to recompute it
993  Value *Rem = Builder.CreateMul(Div, Den);
994  Res = Builder.CreateSub(Num, Rem);
995  }
996 
997  if (DivBits != 0 && DivBits < 32) {
998  // Extend in register from the number of bits this divide really is.
999  if (IsSigned) {
1000  int InRegBits = 32 - DivBits;
1001 
1002  Res = Builder.CreateShl(Res, InRegBits);
1003  Res = Builder.CreateAShr(Res, InRegBits);
1004  } else {
1005  ConstantInt *TruncMask
1006  = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
1007  Res = Builder.CreateAnd(Res, TruncMask);
1008  }
1009  }
1010 
1011  return Res;
1012 }
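// Note: the sequence above is the classic reciprocal-based small divide,
// q = (int)(fa * rcp(fb)) plus a one-unit correction 'jq' applied when
// |fr| >= |fb|; it is only exact when the operands fit in 24 bits, which the
// getDivNumBits checks on the callers guarantee.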
1013 
1014 // Try to recognize special cases for which the DAG will emit better
1015 // expansions than the general expansion we do here.
1016 
1017 // TODO: It would be better to just directly handle those optimizations here.
1018 bool AMDGPUCodeGenPrepare::divHasSpecialOptimization(
1019  BinaryOperator &I, Value *Num, Value *Den) const {
1020  if (Constant *C = dyn_cast<Constant>(Den)) {
1021  // Arbitrary constants get a better expansion as long as a wider mulhi is
1022  // legal.
1023  if (C->getType()->getScalarSizeInBits() <= 32)
1024  return true;
1025 
1026  // TODO: Sdiv check for not exact for some reason.
1027 
1028  // If there's no wider mulhi, there's only a better expansion for powers of
1029  // two.
1030  // TODO: Should really know for each vector element.
1031  if (isKnownToBeAPowerOfTwo(C, *DL, true, 0, AC, &I, DT))
1032  return true;
1033 
1034  return false;
1035  }
1036 
1037  if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
1038  // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
1039  if (BinOpDen->getOpcode() == Instruction::Shl &&
1040  isa<Constant>(BinOpDen->getOperand(0)) &&
1041  isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), *DL, true,
1042  0, AC, &I, DT)) {
1043  return true;
1044  }
1045  }
1046 
1047  return false;
1048 }
1049 
1050 static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout *DL) {
1051  // Check whether the sign can be determined statically.
1052  KnownBits Known = computeKnownBits(V, *DL);
1053  if (Known.isNegative())
1054  return Constant::getAllOnesValue(V->getType());
1055  if (Known.isNonNegative())
1056  return Constant::getNullValue(V->getType());
1057  return Builder.CreateAShr(V, Builder.getInt32(31));
1058 }
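// Note: getSign32 yields 0 for a value known non-negative, -1 for a value
// known negative, and otherwise 'ashr %v, 31', i.e. a sign mask suitable for
// the branchless absolute value and negation used below.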
1059 
1060 Value *AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
1061  BinaryOperator &I, Value *X,
1062  Value *Y) const {
1063  Instruction::BinaryOps Opc = I.getOpcode();
1064  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
1065  Opc == Instruction::SRem || Opc == Instruction::SDiv);
1066 
1067  FastMathFlags FMF;
1068  FMF.setFast();
1069  Builder.setFastMathFlags(FMF);
1070 
1071  if (divHasSpecialOptimization(I, X, Y))
1072  return nullptr; // Keep it for later optimization.
1073 
1074  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
1075  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
1076 
1077  Type *Ty = X->getType();
1078  Type *I32Ty = Builder.getInt32Ty();
1079  Type *F32Ty = Builder.getFloatTy();
1080 
1081  if (Ty->getScalarSizeInBits() < 32) {
1082  if (IsSigned) {
1083  X = Builder.CreateSExt(X, I32Ty);
1084  Y = Builder.CreateSExt(Y, I32Ty);
1085  } else {
1086  X = Builder.CreateZExt(X, I32Ty);
1087  Y = Builder.CreateZExt(Y, I32Ty);
1088  }
1089  }
1090 
1091  if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
1092  return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) :
1093  Builder.CreateZExtOrTrunc(Res, Ty);
1094  }
1095 
1096  ConstantInt *Zero = Builder.getInt32(0);
1097  ConstantInt *One = Builder.getInt32(1);
1098 
1099  Value *Sign = nullptr;
1100  if (IsSigned) {
1101  Value *SignX = getSign32(X, Builder, DL);
1102  Value *SignY = getSign32(Y, Builder, DL);
1103  // Remainder sign is the same as LHS
1104  Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;
1105 
1106  X = Builder.CreateAdd(X, SignX);
1107  Y = Builder.CreateAdd(Y, SignY);
1108 
1109  X = Builder.CreateXor(X, SignX);
1110  Y = Builder.CreateXor(Y, SignY);
1111  }
1112 
1113  // The algorithm here is based on ideas from "Software Integer Division", Tom
1114  // Rodeheffer, August 2008.
1115  //
1116  // unsigned udiv(unsigned x, unsigned y) {
1117  // // Initial estimate of inv(y). The constant is less than 2^32 to ensure
1118  // // that this is a lower bound on inv(y), even if some of the calculations
1119  // // round up.
1120  // unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
1121  //
1122  // // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
1123  // // Empirically this is guaranteed to give a "two-y" lower bound on
1124  // // inv(y).
1125  // z += umulh(z, -y * z);
1126  //
1127  // // Quotient/remainder estimate.
1128  // unsigned q = umulh(x, z);
1129  // unsigned r = x - q * y;
1130  //
1131  // // Two rounds of quotient/remainder refinement.
1132  // if (r >= y) {
1133  // ++q;
1134  // r -= y;
1135  // }
1136  // if (r >= y) {
1137  // ++q;
1138  // r -= y;
1139  // }
1140  //
1141  // return q;
1142  // }
1143 
1144  // Initial estimate of inv(y).
1145  Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
1146  Function *Rcp = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, F32Ty);
1147  Value *RcpY = Builder.CreateCall(Rcp, {FloatY});
1148  Constant *Scale = ConstantFP::get(F32Ty, BitsToFloat(0x4F7FFFFE));
1149  Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
1150  Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);
1151 
1152  // One round of UNR.
1153  Value *NegY = Builder.CreateSub(Zero, Y);
1154  Value *NegYZ = Builder.CreateMul(NegY, Z);
1155  Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ));
1156 
1157  // Quotient/remainder estimate.
1158  Value *Q = getMulHu(Builder, X, Z);
1159  Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y));
1160 
1161  // First quotient/remainder refinement.
1162  Value *Cond = Builder.CreateICmpUGE(R, Y);
1163  if (IsDiv)
1164  Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1165  R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1166 
1167  // Second quotient/remainder refinement.
1168  Cond = Builder.CreateICmpUGE(R, Y);
1169  Value *Res;
1170  if (IsDiv)
1171  Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1172  else
1173  Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1174 
1175  if (IsSigned) {
1176  Res = Builder.CreateXor(Res, Sign);
1177  Res = Builder.CreateSub(Res, Sign);
1178  }
1179 
1180  Res = Builder.CreateTrunc(Res, Ty);
1181 
1182  return Res;
1183 }
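// Note on the signed path above: operands are converted to magnitudes with the
// branchless abs  x = (x + sign) ^ sign  (sign being 0 or -1 from getSign32),
// the unsigned Newton-Raphson sequence runs, and the result is conditionally
// negated the same way, using sign(x) ^ sign(y) for the quotient and sign(x)
// for the remainder.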
1184 
1185 Value *AMDGPUCodeGenPrepare::shrinkDivRem64(IRBuilder<> &Builder,
1186  BinaryOperator &I,
1187  Value *Num, Value *Den) const {
1188  if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
1189  return nullptr; // Keep it for later optimization.
1190 
1191  Instruction::BinaryOps Opc = I.getOpcode();
1192 
1193  bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
1194  bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
1195 
1196  int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
1197  if (NumDivBits == -1)
1198  return nullptr;
1199 
1200  Value *Narrowed = nullptr;
1201  if (NumDivBits <= 24) {
1202  Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
1203  IsDiv, IsSigned);
1204  } else if (NumDivBits <= 32) {
1205  Narrowed = expandDivRem32(Builder, I, Num, Den);
1206  }
1207 
1208  if (Narrowed) {
1209  return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
1210  Builder.CreateZExt(Narrowed, Num->getType());
1211  }
1212 
1213  return nullptr;
1214 }
1215 
1216 void AMDGPUCodeGenPrepare::expandDivRem64(BinaryOperator &I) const {
1217  Instruction::BinaryOps Opc = I.getOpcode();
1218  // Do the general expansion.
1219  if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
1220  expandDivisionUpTo64Bits(&I);
1221  return;
1222  }
1223 
1224  if (Opc == Instruction::URem || Opc == Instruction::SRem) {
1225  expandRemainderUpTo64Bits(&I);
1226  return;
1227  }
1228 
1229  llvm_unreachable("not a division");
1230 }
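// Note: expandDivisionUpTo64Bits / expandRemainderUpTo64Bits come from
// llvm/Transforms/Utils/IntegerDivision.h and emit the generic iterative
// expansion, which introduces control flow; that is why visitBinaryOperator
// defers these and expands them only after the main rewrite.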
1231 
1232 bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
1233  if (foldBinOpIntoSelect(I))
1234  return true;
1235 
1236  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
1237  DA->isUniform(&I) && promoteUniformOpToI32(I))
1238  return true;
1239 
1240  if (UseMul24Intrin && replaceMulWithMul24(I))
1241  return true;
1242 
1243  bool Changed = false;
1244  Instruction::BinaryOps Opc = I.getOpcode();
1245  Type *Ty = I.getType();
1246  Value *NewDiv = nullptr;
1247  unsigned ScalarSize = Ty->getScalarSizeInBits();
1248 
1249  SmallVector<BinaryOperator *, 8> Div64ToExpand;
1250 
1251  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
1252  Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
1253  ScalarSize <= 64 &&
1254  !DisableIDivExpand) {
1255  Value *Num = I.getOperand(0);
1256  Value *Den = I.getOperand(1);
1257  IRBuilder<> Builder(&I);
1258  Builder.SetCurrentDebugLocation(I.getDebugLoc());
1259 
1260  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1261  NewDiv = UndefValue::get(VT);
1262 
1263  for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
1264  Value *NumEltN = Builder.CreateExtractElement(Num, N);
1265  Value *DenEltN = Builder.CreateExtractElement(Den, N);
1266 
1267  Value *NewElt;
1268  if (ScalarSize <= 32) {
1269  NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
1270  if (!NewElt)
1271  NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1272  } else {
1273  // See if this 64-bit division can be shrunk to 32/24-bits before
1274  // producing the general expansion.
1275  NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
1276  if (!NewElt) {
1277  // The general 64-bit expansion introduces control flow and doesn't
1278  // return the new value. Just insert a scalar copy and defer
1279  // expanding it.
1280  NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1281  Div64ToExpand.push_back(cast<BinaryOperator>(NewElt));
1282  }
1283  }
1284 
1285  NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
1286  }
1287  } else {
1288  if (ScalarSize <= 32)
1289  NewDiv = expandDivRem32(Builder, I, Num, Den);
1290  else {
1291  NewDiv = shrinkDivRem64(Builder, I, Num, Den);
1292  if (!NewDiv)
1293  Div64ToExpand.push_back(&I);
1294  }
1295  }
1296 
1297  if (NewDiv) {
1298  I.replaceAllUsesWith(NewDiv);
1299  I.eraseFromParent();
1300  Changed = true;
1301  }
1302  }
1303 
1304  if (ExpandDiv64InIR) {
1305  // TODO: We get much worse code in specially handled constant cases.
1306  for (BinaryOperator *Div : Div64ToExpand) {
1307  expandDivRem64(*Div);
1308  Changed = true;
1309  }
1310  }
1311 
1312  return Changed;
1313 }
1314 
1315 bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
1316  if (!WidenLoads)
1317  return false;
1318 
1319  if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
1320  I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1321  canWidenScalarExtLoad(I)) {
1322  IRBuilder<> Builder(&I);
1323  Builder.SetCurrentDebugLocation(I.getDebugLoc());
1324 
1325  Type *I32Ty = Builder.getInt32Ty();
1326  Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
1327  Value *BitCast= Builder.CreateBitCast(I.getPointerOperand(), PT);
1328  LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, BitCast);
1329  WidenLoad->copyMetadata(I);
1330 
1331  // If we have range metadata, we need to convert the type, and not make
1332  // assumptions about the high bits.
1333  if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
1334  ConstantInt *Lower =
1335  mdconst::extract<ConstantInt>(Range->getOperand(0));
1336 
1337  if (Lower->isNullValue()) {
1338  WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
1339  } else {
1340  Metadata *LowAndHigh[] = {
1341  ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
1342  // Don't make assumptions about the high bits.
1343  ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
1344  };
1345 
1346  WidenLoad->setMetadata(LLVMContext::MD_range,
1347  MDNode::get(Mod->getContext(), LowAndHigh));
1348  }
1349  }
1350 
1351  int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
1352  Type *IntNTy = Builder.getIntNTy(TySize);
1353  Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
1354  Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
1355  I.replaceAllUsesWith(ValOrig);
1356  I.eraseFromParent();
1357  return true;
1358  }
1359 
1360  return false;
1361 }
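// For illustration (with -amdgpu-codegenprepare-widen-constant-loads), a
// uniform, 4-byte aligned sub-dword constant load such as:
//   %v = load i8, i8 addrspace(4)* %p, align 4
// is widened to roughly:
//   %c = bitcast i8 addrspace(4)* %p to i32 addrspace(4)*
//   %w = load i32, i32 addrspace(4)* %c, align 4
//   %t = trunc i32 %w to i8
// so that a scalar load can be selected instead of a vector load.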
1362 
1363 bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
1364  bool Changed = false;
1365 
1366  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
1367  DA->isUniform(&I))
1368  Changed |= promoteUniformOpToI32(I);
1369 
1370  return Changed;
1371 }
1372 
1373 bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
1374  bool Changed = false;
1375 
1376  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
1377  DA->isUniform(&I))
1378  Changed |= promoteUniformOpToI32(I);
1379 
1380  return Changed;
1381 }
1382 
1383 bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
1384  switch (I.getIntrinsicID()) {
1385  case Intrinsic::bitreverse:
1386  return visitBitreverseIntrinsicInst(I);
1387  default:
1388  return false;
1389  }
1390 }
1391 
1392 bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
1393  bool Changed = false;
1394 
1395  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
1396  DA->isUniform(&I))
1397  Changed |= promoteUniformBitreverseToI32(I);
1398 
1399  return Changed;
1400 }
1401 
1402 bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
1403  Mod = &M;
1404  DL = &Mod->getDataLayout();
1405  return false;
1406 }
1407 
1408 bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
1409  if (skipFunction(F))
1410  return false;
1411 
1412  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
1413  if (!TPC)
1414  return false;
1415 
1416  const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
1417  ST = &TM.getSubtarget<GCNSubtarget>(F);
1418  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1419  DA = &getAnalysis<LegacyDivergenceAnalysis>();
1420 
1421  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
1422  DT = DTWP ? &DTWP->getDomTree() : nullptr;
1423 
1424  HasUnsafeFPMath = hasUnsafeFPMath(F);
1425 
1426  AMDGPU::SIModeRegisterDefaults Mode(F);
1427  HasFP32Denormals = Mode.allFP32Denormals();
1428 
1429  bool MadeChange = false;
1430 
1431  Function::iterator NextBB;
1432  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
1433  BasicBlock *BB = &*FI;
1434  NextBB = std::next(FI);
1435 
1436  BasicBlock::iterator Next;
1437  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; I = Next) {
1438  Next = std::next(I);
1439 
1440  MadeChange |= visit(*I);
1441 
1442  if (Next != E) { // Control flow changed
1443  BasicBlock *NextInstBB = Next->getParent();
1444  if (NextInstBB != BB) {
1445  BB = NextInstBB;
1446  E = BB->end();
1447  FE = F.end();
1448  }
1449  }
1450  }
1451  }
1452 
1453  return MadeChange;
1454 }
1455 
1456 INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
1457  "AMDGPU IR optimizations", false, false)
1458 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
1459 INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
1460 INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
1461  "AMDGPU IR optimizations", false, false)
1462 
1463 char AMDGPUCodeGenPrepare::ID = 0;
1464 
1465 FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
1466  return new AMDGPUCodeGenPrepare();
1467 }
llvm::Check::Size
@ Size
Definition: FileCheck.h:77
AssumptionCache.h
Signed
@ Signed
Definition: NVPTXISelLowering.cpp:4637
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:17
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:113
llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
llvm::Intrinsic::getDeclaration
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1418
llvm::BasicBlock::iterator
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:87
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:104
T
llvm::Function
Definition: Function.h:60
llvm::Attribute
Definition: Attributes.h:65
Pass.h
llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:53
getMul64
static std::pair< Value *, Value * > getMul64(IRBuilder<> &Builder, Value *LHS, Value *RHS)
Definition: AMDGPUCodeGenPrepare.cpp:862
llvm::DataLayout::getTypeSizeInBits
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:673
llvm::PointerType::get
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Definition: Type.cpp:727
llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:309
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1185
llvm::IRBuilder<>
ValueTracking.h
llvm::ComputeMaxSignificantBits
unsigned ComputeMaxSignificantBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr)
Get the upper bound on bit size for this Value Op as a signed integer.
Definition: ValueTracking.cpp:423
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
extractValues
static void extractValues(IRBuilder<> &Builder, SmallVectorImpl< Value * > &Values, Value *V)
Definition: AMDGPUCodeGenPrepare.cpp:456
llvm::ConstantAsMetadata::get
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:420
llvm::ComputeNumSignBits
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
Definition: ValueTracking.cpp:415
llvm::SelectInst::getFalseValue
const Value * getFalseValue() const
Definition: Instructions.h:1773
llvm::GCNSubtarget
Definition: GCNSubtarget.h:31
llvm::Instruction::copyMetadata
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Definition: Instruction.cpp:841
RHS
Value * RHS
Definition: X86PartialReduction.cpp:76
llvm::FastMathFlags
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:21
ConstantFolding.h
llvm::cl::ReallyHidden
@ ReallyHidden
Definition: CommandLine.h:140
llvm::Attribute::getValueAsBool
bool getValueAsBool() const
Return the attribute's value as a boolean.
Definition: Attributes.cpp:290
optimizeWithFDivFast
static Value * optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy, bool HasDenormals, IRBuilder<> &Builder, Module *Mod)
Definition: AMDGPUCodeGenPrepare.cpp:719
llvm::MDNode::get
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1384
F
#define F(x, y, z)
Definition: MD5.cpp:55
llvm::RISCVFenceField::R
@ R
Definition: RISCVBaseInfo.h:240
llvm::Instruction::setMetadata
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1434
KnownBits.h
insertValues
static Value * insertValues(IRBuilder<> &Builder, Type *Ty, SmallVectorImpl< Value * > &Values)
Definition: AMDGPUCodeGenPrepare.cpp:468
llvm::AMDGPUAS::CONSTANT_ADDRESS_32BIT
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
Definition: AMDGPU.h:366
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:55
Arg
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
Definition: AMDGPULibCalls.cpp:186
LHS
Value * LHS
Definition: X86PartialReduction.cpp:75
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
llvm::KnownBits::isNonNegative
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:99
DEBUG_TYPE
#define DEBUG_TYPE
Definition: AMDGPUCodeGenPrepare.cpp:31
llvm::AMDGPUTargetMachine
Definition: AMDGPUTargetMachine.h:28
llvm::SelectInst::getCondition
const Value * getCondition() const
Definition: Instructions.h:1771
llvm::FastMathFlags::approxFunc
bool approxFunc() const
Definition: FMF.h:72
f
Itanium Name Demangler i e convert the string _Z1fv into f()". You can also use the CRTP base ManglingParser to perform some simple analysis on the mangled name
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
llvm::ConstantFoldBinaryOpOperands
Constant * ConstantFoldBinaryOpOperands(unsigned Opcode, Constant *LHS, Constant *RHS, const DataLayout &DL)
Attempt to constant fold a binary operation with the specified operands.
Definition: ConstantFolding.cpp:1334
llvm::ConstantFoldCastOperand
Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
Definition: ConstantFolding.cpp:1402
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
Y
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
getOpcode
static Optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:190
llvm::ms_demangle::QualifierMangleMode::Result
@ Result
false
Definition: StackSlotColoring.cpp:141
llvm::M68kBeads::DA
@ DA
Definition: M68kBaseInfo.h:59
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
llvm::IntegerType
Class to represent integer types.
Definition: DerivedTypes.h:40
optimizeWithRcp
static Value * optimizeWithRcp(Value *Num, Value *Den, bool AllowInaccurateRcp, bool RcpIsAccurate, IRBuilder<> &Builder, Module *Mod)
Definition: AMDGPUCodeGenPrepare.cpp:660
llvm::Constant::getAllOnesValue
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:395
AMDGPU
Definition: AMDGPUReplaceLDSUseWithPointer.cpp:114
llvm::BinaryOperator::getOpcode
BinaryOps getOpcode() const
Definition: InstrTypes.h:392
llvm::Instruction
Definition: Instruction.h:42
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:189
llvm::ConstantFP
ConstantFP - Floating Point Values [float, double].
Definition: Constants.h:257
llvm::UndefValue::get
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1777
llvm::ConstantInt::get
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:928
llvm::LegacyDivergenceAnalysis
Definition: LegacyDivergenceAnalysis.h:31
IR
Statically lint-checks LLVM IR
Definition: Lint.cpp:751
optimizations
AMDGPU IR optimizations
Definition: AMDGPUCodeGenPrepare.cpp:1460
llvm::KnownBits::isNegative
bool isNegative() const
Returns true if this value is known to be negative.
Definition: KnownBits.h:96
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:684
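An illustrative pairing of FixedVectorType::get() with the splat behaviour of ConstantInt::get() described above; the type and value are arbitrary.

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"

static llvm::Constant *splatOfSeven(llvm::LLVMContext &Ctx) {
  llvm::Type *V4I32 =
      llvm::FixedVectorType::get(llvm::Type::getInt32Ty(Ctx), 4);
  // Requesting a ConstantInt for a vector type yields a splat vector.
  return llvm::ConstantInt::get(V4I32, 7); // <4 x i32> of i32 7
}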
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::Metadata
Root of the metadata hierarchy.
Definition: Metadata.h:62
hasUnsafeFPMath
static bool hasUnsafeFPMath(const Function &F)
Definition: AMDGPUCodeGenPrepare.cpp:857
llvm::CallingConv::ID
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
X
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
llvm::Instruction::getMetadata
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:279
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:389
llvm::LegacyLegalizeActions::Lower
@ Lower
The operation itself must be expressed in terms of simpler actions on this target.
Definition: LegacyLegalizerInfo.h:58
llvm::SPIRV::Decoration::Alignment
@ Alignment
llvm::cl::opt< bool >
llvm::Constant
This is an important base class in LLVM.
Definition: Constant.h:41
llvm::Instruction::eraseFromParent
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:77
llvm::ISD::FMAD
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:486
llvm::ICmpInst
This instruction compares its operands according to the predicate given to the constructor.
Definition: Instructions.h:1173
llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
promotedOpIsNUW
static bool promotedOpIsNUW(const Instruction &I)
Definition: AMDGPUCodeGenPrepare.cpp:297
llvm::AMDGPUAS::CONSTANT_ADDRESS
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
Definition: AMDGPU.h:362
I
#define I(x, y, z)
Definition: MD5.cpp:58
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:432
llvm::SelectInst::getTrueValue
const Value * getTrueValue() const
Definition: Instructions.h:1772
TargetPassConfig.h
llvm::Type::isHalfTy
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
llvm::computeKnownBits
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, OptimizationRemarkEmitter *ORE=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne bit sets.
Definition: ValueTracking.cpp:222
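A hedged sketch of the computeKnownBits()/KnownBits sign queries listed above, the kind of check a transform can use to decide whether a value's sign is statically known; the helper name is illustrative.

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/KnownBits.h"

static int classifyKnownSign(const llvm::Value *V, const llvm::DataLayout &DL) {
  llvm::KnownBits Known(V->getType()->getScalarSizeInBits());
  llvm::computeKnownBits(V, Known, DL);
  if (Known.isNonNegative())
    return 1;   // sign bit known to be clear
  if (Known.isNegative())
    return -1;  // sign bit known to be set
  return 0;     // sign not statically known
}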
IRBuilder.h
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::FPMathOperator
Utility class for floating point operations which can have information about relaxed accuracy requirements attached to them.
Definition: Operator.h:167
llvm::SelectInst
This class represents the LLVM 'select' instruction.
Definition: Instructions.h:1724
Mode
SI Whole Quad Mode
Definition: SIWholeQuadMode.cpp:262
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
Builder
Assume Builder
Definition: AssumeBundleBuilder.cpp:651
llvm::User::setOperand
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
llvm::AssumptionCacheTracker
An immutable pass that tracks lazily created AssumptionCache objects.
Definition: AssumptionCache.h:202
llvm::BinaryOperator
Definition: InstrTypes.h:188
llvm::min
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:357
llvm::expandDivisionUpTo64Bits
bool expandDivisionUpTo64Bits(BinaryOperator *Div)
Generate code to divide two integers, replacing Div with the generated code.
Definition: IntegerDivision.cpp:630
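A minimal sketch of driving the IntegerDivision.h expansion helpers (this one and expandRemainderUpTo64Bits further down) from an integer div/rem instruction, in the spirit of the ExpandDiv64InIR option; the wrapper is illustrative, not this pass's actual code path.

#include "llvm/IR/InstrTypes.h"
#include "llvm/Transforms/Utils/IntegerDivision.h"

static bool expandIntegerDivRem(llvm::BinaryOperator *I) {
  switch (I->getOpcode()) {
  case llvm::Instruction::SDiv:
  case llvm::Instruction::UDiv:
    // Replaces I with an inline expansion for widths up to 64 bits.
    return llvm::expandDivisionUpTo64Bits(I);
  case llvm::Instruction::SRem:
  case llvm::Instruction::URem:
    return llvm::expandRemainderUpTo64Bits(I);
  default:
    return false;
  }
}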
Mul
BinaryOperator * Mul
Definition: X86PartialReduction.cpp:70
Cond
SmallVector< MachineOperand, 4 > Cond
Definition: BasicBlockSections.cpp:137
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:58
AMDGPU.h
InstVisitor.h
llvm::AssumptionCache
A cache of @llvm.assume calls within a function.
Definition: AssumptionCache.h:42
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:143
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
llvm::Value::replaceAllUsesWith
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:529
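A minimal sketch of the common replace-and-erase idiom built from replaceAllUsesWith() and eraseFromParent(); Old and New are hypothetical values.

#include "llvm/IR/Instruction.h"

static void replaceAndErase(llvm::Instruction *Old, llvm::Value *New) {
  Old->replaceAllUsesWith(New); // redirect every use of Old to New
  Old->eraseFromParent();       // unlink and delete the now-dead instruction
}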
llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition: ilist_node.h:82
DL
DebugLoc DL
Definition: AArch64SLSHardening.cpp:76
llvm::InstVisitor
Base class for instruction visitors.
Definition: InstVisitor.h:78
trunc
We have fiadd patterns now, but the following have the same cost and complexity; we need a way to specify that the latter is more profitable. The FP stackifier should handle simple permutes to reduce the number of shuffles, e.g. trunc.
Definition: README-FPStack.txt:63
llvm::FastMathFlags::setFast
void setFast(bool B=true)
Definition: FMF.h:98
llvm::Instruction::copyFastMathFlags
void copyFastMathFlags(FastMathFlags FMF)
Convenience function for transferring all fast-math flag values to this instruction,...
Definition: Instruction.cpp:244
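A hedged sketch of transferring fast-math flags from an existing FP operation to a newly built one via getFastMathFlags()/copyFastMathFlags(); both arguments are assumed to be floating-point instructions and the helper name is illustrative.

#include "llvm/IR/Instruction.h"
#include "llvm/IR/Operator.h"

static void propagateFastMathFlags(llvm::Instruction *NewFPOp,
                                   const llvm::Instruction *OldFPOp) {
  // Copying flags is only valid when NewFPOp is itself an FP operation.
  if (const auto *Op = llvm::dyn_cast<llvm::FPMathOperator>(OldFPOp))
    NewFPOp->copyFastMathFlags(Op->getFastMathFlags());
}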
llvm::CastInst
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:429
llvm::BitsToFloat
float BitsToFloat(uint32_t Bits)
This function takes a 32-bit integer and returns the bit equivalent float.
Definition: MathExtras.h:670
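An illustrative call to BitsToFloat(); the bit pattern is arbitrary.

#include "llvm/Support/MathExtras.h"

static float onePointZero() {
  // 0x3f800000 is the IEEE-754 single-precision bit pattern of 1.0f.
  return llvm::BitsToFloat(0x3f800000u);
}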
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:173
I32
@ I32
Definition: DXILOpLowering.cpp:40
getMulHu
static Value * getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS)
Definition: AMDGPUCodeGenPrepare.cpp:876
runOnFunction
static bool runOnFunction(Function &F, bool PostInlining)
Definition: EntryExitInstrumenter.cpp:69
llvm::expandRemainderUpTo64Bits
bool expandRemainderUpTo64Bits(BinaryOperator *Rem)
Generate code to calculate the remainder of two integers, replacing Rem with the generated code.
Definition: IntegerDivision.cpp:533
llvm::AMDGPUISD::RCP
@ RCP
Definition: AMDGPUISelLowering.h:416
llvm::Constant::getNullValue
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:350
llvm::KnownBits
Definition: KnownBits.h:23
llvm::Type::isFloatTy
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:148
llvm::AnalysisUsage::setPreservesAll
void setPreservesAll()
Set by analyses that do not transform their input at all.
Definition: PassAnalysisSupport.h:130
llvm::FPMathOperator::getFastMathFlags
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition: Operator.h:285
findSelectThroughCast
static SelectInst * findSelectThroughCast(Value *V, CastInst *&Cast)
Definition: AMDGPUCodeGenPrepare.cpp:580
llvm::AMDGPU::SendMsg::Op
Op
Definition: SIDefines.h:345
llvm::ConstantFP::get
static Constant * get(Type *Ty, double V)
This returns a ConstantFP, or a vector containing a splat of a ConstantFP, for the specified value in the specified type.
Definition: Constants.cpp:975
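An illustrative use of ConstantFP::get() to build a 1.0 constant of an arbitrary FP type; for vector types it produces a splat, as noted above. The helper name is hypothetical.

#include "llvm/IR/Constants.h"

static llvm::Constant *oneLike(llvm::Type *FPTy) {
  // Works for half/float/double scalars as well as vectors of those types.
  return llvm::ConstantFP::get(FPTy, 1.0);
}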
llvm::Module::getContext
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:262
LowAndHigh
Metadata * LowAndHigh[]
Definition: NVVMIntrRange.cpp:68
llvm::Type::isDoubleTy
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:151
llvm::ARCCC::Z
@ Z
Definition: ARCInfo.h:41
llvm::MCID::Add
@ Add
Definition: MCInstrDesc.h:185
IntegerDivision.h
llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:46
promotedOpIsNSW
static bool promotedOpIsNSW(const Instruction &I)
Definition: AMDGPUCodeGenPrepare.cpp:283
llvm::createAMDGPUCodeGenPreparePass
FunctionPass * createAMDGPUCodeGenPreparePass()
Definition: AMDGPUCodeGenPrepare.cpp:1465
llvm::Instruction::BinaryOps
BinaryOps
Definition: Instruction.h:786
llvm::FPMathOperator::getFPAccuracy
float getFPAccuracy() const
Get the maximum error permitted by this operation in ULPs.
Definition: Instructions.cpp:2863
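A hedged sketch of reading the !fpmath accuracy with getFPAccuracy() and comparing it against the 2.5 ULP accuracy that the fdiv.fast lowering in this file requires; the wrapper name is illustrative.

#include "llvm/IR/Operator.h"

static bool relaxedDivAllowed(const llvm::Instruction *FDiv) {
  const auto *FPOp = llvm::cast<llvm::FPMathOperator>(FDiv);
  // getFPAccuracy() returns 0.0 when no !fpmath metadata is attached,
  // i.e. full precision is required.
  return FPOp->getFPAccuracy() >= 2.5f;
}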
LegacyDivergenceAnalysis.h
WidenLoads
static cl::opt< bool > WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads", cl::desc("Widen sub-dword constant address space loads in " "AMDGPULateCodeGenPrepare"), cl::ReallyHidden, cl::init(true))
llvm::Instruction::getDebugLoc
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:367
Dominators.h
getSign32
static Value * getSign32(Value *V, IRBuilder<> &Builder, const DataLayout *DL)
Definition: AMDGPUCodeGenPrepare.cpp:1050
N
#define N
llvm::CastInst::getOpcode
Instruction::CastOps getOpcode() const
Return the opcode of this CastInst.
Definition: InstrTypes.h:676
llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:91
llvm::IntegerType::getBitWidth
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
Definition: DerivedTypes.h:72
llvm::SmallVectorImpl< Value * >
llvm::Module::getDataLayout
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:398
TM
LLVMTargetMachineRef TM
Definition: PassBuilderBindings.cpp:47
llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:308
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1461
BB
Common register allocation / spilling example from the ARM README (str/ldr/sxth/mla sequences that could be merged); merging also increases the likelihood that the store may become dead. Successors according to LLVM: BB
Definition: README.txt:39
llvm::isKnownToBeAPowerOfTwo
bool isKnownToBeAPowerOfTwo(const Value *V, const DataLayout &DL, bool OrZero=false, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return true if the given value is known to have exactly one bit set when defined.
Definition: ValueTracking.cpp:327
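A minimal sketch of isKnownToBeAPowerOfTwo() on a hypothetical divisor, the kind of query a division transform can use to prefer a cheaper shift-based expansion.

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"

static bool isPowerOfTwoDivisor(const llvm::Value *Den,
                                const llvm::DataLayout &DL) {
  // OrZero=false: the value must have exactly one bit set.
  return llvm::isKnownToBeAPowerOfTwo(Den, DL, /*OrZero=*/false);
}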
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::Value::takeName
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:378
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
llvm::cl::desc
Definition: CommandLine.h:405
Mod
Module * Mod
Definition: PassBuilderBindings.cpp:54
InitializePasses.h
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
AMDGPUTargetMachine.h
getMul24
static Value * getMul24(IRBuilder<> &Builder, Value *LHS, Value *RHS, unsigned Size, unsigned NumBits, bool IsSigned)
Definition: AMDGPUCodeGenPrepare.cpp:484
llvm::Function::iterator
BasicBlockListType::iterator iterator
Definition: Function.h:66
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations", false, false) INITIALIZE_PASS_END(AMDGPUCodeGenPrepare