AMDGPUCodeGenPrepare.cpp
1 //===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass does misc. AMDGPU optimizations on IR before instruction
11 /// selection.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPU.h"
16 #include "AMDGPUTargetMachine.h"
22 #include "llvm/IR/Dominators.h"
23 #include "llvm/IR/InstVisitor.h"
24 #include "llvm/IR/IntrinsicsAMDGPU.h"
25 #include "llvm/IR/IRBuilder.h"
26 #include "llvm/InitializePasses.h"
27 #include "llvm/Pass.h"
28 #include "llvm/Support/KnownBits.h"
29 #include "llvm/Transforms/Utils/IntegerDivision.h"
30 
31 #define DEBUG_TYPE "amdgpu-codegenprepare"
32 
33 using namespace llvm;
34 
35 namespace {
36 
38  "amdgpu-codegenprepare-widen-constant-loads",
39  cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
40  cl::ReallyHidden,
41  cl::init(false));
42 
43 static cl::opt<bool> Widen16BitOps(
44  "amdgpu-codegenprepare-widen-16-bit-ops",
45  cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
46  cl::ReallyHidden,
47  cl::init(true));
48 
49 static cl::opt<bool> UseMul24Intrin(
50  "amdgpu-codegenprepare-mul24",
51  cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
52  cl::ReallyHidden,
53  cl::init(true));
54 
55 // Legalize 64-bit division by using the generic IR expansion.
56 static cl::opt<bool> ExpandDiv64InIR(
57  "amdgpu-codegenprepare-expand-div64",
58  cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
59  cl::ReallyHidden,
60  cl::init(false));
61 
62 // Leave all division operations as they are. This supersedes ExpandDiv64InIR
63 // and is used for testing the legalizer.
64 static cl::opt<bool> DisableIDivExpand(
65  "amdgpu-codegenprepare-disable-idiv-expansion",
66  cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
67  cl::ReallyHidden,
68  cl::init(false));
69 
70 class AMDGPUCodeGenPrepare : public FunctionPass,
71  public InstVisitor<AMDGPUCodeGenPrepare, bool> {
72  const GCNSubtarget *ST = nullptr;
73  AssumptionCache *AC = nullptr;
74  DominatorTree *DT = nullptr;
75  LegacyDivergenceAnalysis *DA = nullptr;
76  Module *Mod = nullptr;
77  const DataLayout *DL = nullptr;
78  bool HasUnsafeFPMath = false;
79  bool HasFP32Denormals = false;
80 
81  /// \returns \p T's base element bit width: the integer bit width for
82  /// integer types, or the element's integer bit width for vector-of-integer
83  /// types.
84  ///
85  /// \details \p T must need promotion to i32 (see needsPromotionToI32).
86  unsigned getBaseElementBitWidth(const Type *T) const;
87 
88  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
89  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
90  /// is returned.
91  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;
92 
93  /// \returns True if binary operation \p I is a signed binary operation, false
94  /// otherwise.
95  bool isSigned(const BinaryOperator &I) const;
96 
97  /// \returns True if the condition of 'select' operation \p I comes from a
98  /// signed 'icmp' operation, false otherwise.
99  bool isSigned(const SelectInst &I) const;
100 
101  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
102  /// false otherwise.
103  bool needsPromotionToI32(const Type *T) const;
104 
105  /// Promotes uniform binary operation \p I to equivalent 32 bit binary
106  /// operation.
107  ///
108  /// \details \p I's base element bit width must be greater than 1 and less
109  /// than or equal to 16. Promotion is done by sign- or zero-extending operands to
110  /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
111  /// truncating the result of 32 bit binary operation back to \p I's original
112  /// type. Division operation is not promoted.
113  ///
114  /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
115  /// false otherwise.
116  bool promoteUniformOpToI32(BinaryOperator &I) const;
117 
118  /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
119  ///
120  /// \details \p I's base element bit width must be greater than 1 and less
121  /// than or equal to 16. Promotion is done by sign- or zero-extending operands to
122  /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
123  ///
124  /// \returns True.
125  bool promoteUniformOpToI32(ICmpInst &I) const;
126 
127  /// Promotes uniform 'select' operation \p I to 32 bit 'select'
128  /// operation.
129  ///
130  /// \details \p I's base element bit width must be greater than 1 and less
131  /// than or equal to 16. Promotion is done by sign- or zero-extending operands to
132  /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
133  /// result of 32 bit 'select' operation back to \p I's original type.
134  ///
135  /// \returns True.
136  bool promoteUniformOpToI32(SelectInst &I) const;
137 
138  /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
139  /// intrinsic.
140  ///
141  /// \details \p I's base element bit width must be greater than 1 and less
142  /// than or equal to 16. Promotion is done by zero-extending the operand to 32
143  /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
144  /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
145  /// shift amount is 32 minus \p I's base element bit width), and truncating
146  /// the result of the shift operation back to \p I's original type.
147  ///
148  /// \returns True.
149  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
150 
151  /// \returns The minimum number of bits needed to store the value of \p Op as an
152  /// unsigned integer. Truncating to this size and then zero-extending to
153  /// the original will not change the value.
154  unsigned numBitsUnsigned(Value *Op) const;
155 
156  /// \returns The minimum number of bits needed to store the value of \p Op as a
157  /// signed integer. Truncating to this size and then sign-extending to
158  /// the original size will not change the value.
159  unsigned numBitsSigned(Value *Op) const;
160 
161  /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.i24.
162  /// Only done for divergent multiplies whose operands are known to fit in 24 bits.
163  bool replaceMulWithMul24(BinaryOperator &I) const;
164 
165  /// Perform the same fold as the equivalently named function in DAGCombiner.
166  /// Since we expand some divisions here, do this before the expansion obscures the select.
167  bool foldBinOpIntoSelect(BinaryOperator &I) const;
168 
169  bool divHasSpecialOptimization(BinaryOperator &I,
170  Value *Num, Value *Den) const;
171  int getDivNumBits(BinaryOperator &I,
172  Value *Num, Value *Den,
173  unsigned AtLeast, bool Signed) const;
174 
175  /// Expands 24 bit div or rem.
176  Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
177  Value *Num, Value *Den,
178  bool IsDiv, bool IsSigned) const;
179 
180  Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
181  Value *Num, Value *Den, unsigned NumBits,
182  bool IsDiv, bool IsSigned) const;
183 
184  /// Expands 32 bit div or rem.
185  Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
186  Value *Num, Value *Den) const;
187 
188  Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
189  Value *Num, Value *Den) const;
190  void expandDivRem64(BinaryOperator &I) const;
191 
192  /// Check whether a scalar load can be widened.
193  ///
194  /// \details A uniform, sub-32-bit load from constant memory is widened to a
195  /// full 32-bit load and then truncated, allowing a scalar load to be used
196  /// instead of a vector load.
197  ///
198  /// \returns True if the load can be widened.
199 
200  bool canWidenScalarExtLoad(LoadInst &I) const;
201 
202 public:
203  static char ID;
204 
205  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
206 
207  bool visitFDiv(BinaryOperator &I);
208  bool visitXor(BinaryOperator &I);
209 
210  bool visitInstruction(Instruction &I) { return false; }
211  bool visitBinaryOperator(BinaryOperator &I);
212  bool visitLoadInst(LoadInst &I);
213  bool visitICmpInst(ICmpInst &I);
214  bool visitSelectInst(SelectInst &I);
215 
216  bool visitIntrinsicInst(IntrinsicInst &I);
217  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
218 
219  bool doInitialization(Module &M) override;
220  bool runOnFunction(Function &F) override;
221 
222  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
223 
224  void getAnalysisUsage(AnalysisUsage &AU) const override {
225  AU.addRequired<AssumptionCacheTracker>();
226  AU.addRequired<LegacyDivergenceAnalysis>();
227 
228  // FIXME: Division expansion needs to preserve the dominator tree.
229  if (!ExpandDiv64InIR)
230  AU.setPreservesAll();
231  }
232 };
233 
234 } // end anonymous namespace
235 
236 unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
237  assert(needsPromotionToI32(T) && "T does not need promotion to i32");
238 
239  if (T->isIntegerTy())
240  return T->getIntegerBitWidth();
241  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
242 }
243 
244 Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
245  assert(needsPromotionToI32(T) && "T does not need promotion to i32");
246 
247  if (T->isIntegerTy())
248  return B.getInt32Ty();
249  return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T));
250 }
251 
252 bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
253  return I.getOpcode() == Instruction::AShr ||
254  I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
255 }
256 
257 bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
258  return isa<ICmpInst>(I.getOperand(0)) ?
259  cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
260 }
261 
262 bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
263  if (!Widen16BitOps)
264  return false;
265 
266  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
267  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
268  return true;
269 
270  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
271  // TODO: The set of packed operations is more limited, so may want to
272  // promote some anyway.
273  if (ST->hasVOP3PInsts())
274  return false;
275 
276  return needsPromotionToI32(VT->getElementType());
277  }
278 
279  return false;
280 }
281 
282 // Return true if the op promoted to i32 should have nsw set.
283 static bool promotedOpIsNSW(const Instruction &I) {
284  switch (I.getOpcode()) {
285  case Instruction::Shl:
286  case Instruction::Add:
287  case Instruction::Sub:
288  return true;
289  case Instruction::Mul:
290  return I.hasNoUnsignedWrap();
291  default:
292  return false;
293  }
294 }
295 
296 // Return true if the op promoted to i32 should have nuw set.
297 static bool promotedOpIsNUW(const Instruction &I) {
298  switch (I.getOpcode()) {
299  case Instruction::Shl:
300  case Instruction::Add:
301  case Instruction::Mul:
302  return true;
303  case Instruction::Sub:
304  return I.hasNoUnsignedWrap();
305  default:
306  return false;
307  }
308 }
309 
310 bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
311  Type *Ty = I.getType();
312  const DataLayout &DL = Mod->getDataLayout();
313  int TySize = DL.getTypeSizeInBits(Ty);
314  Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);
315 
316  return I.isSimple() && TySize < 32 && Alignment >= 4 && DA->isUniform(&I);
317 }
318 
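// Illustrative sketch of the rewrite performed below (value names are made up,
// not from the pass itself): a uniform 16-bit add such as
//   %r = add i16 %a, %b
// becomes roughly
//   %a32 = zext i16 %a to i32
//   %b32 = zext i16 %b to i32
//   %r32 = add nuw nsw i32 %a32, %b32
//   %r   = trunc i32 %r32 to i16
// (signed operations such as ashr use sext instead of zext; divisions are not
// promoted at all).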
319 bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
320  assert(needsPromotionToI32(I.getType()) &&
321  "I does not need promotion to i32");
322 
323  if (I.getOpcode() == Instruction::SDiv ||
324  I.getOpcode() == Instruction::UDiv ||
325  I.getOpcode() == Instruction::SRem ||
326  I.getOpcode() == Instruction::URem)
327  return false;
328 
329  IRBuilder<> Builder(&I);
330  Builder.SetCurrentDebugLocation(I.getDebugLoc());
331 
332  Type *I32Ty = getI32Ty(Builder, I.getType());
333  Value *ExtOp0 = nullptr;
334  Value *ExtOp1 = nullptr;
335  Value *ExtRes = nullptr;
336  Value *TruncRes = nullptr;
337 
338  if (isSigned(I)) {
339  ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
340  ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
341  } else {
342  ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
343  ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
344  }
345 
346  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
347  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
348  if (promotedOpIsNSW(cast<Instruction>(I)))
349  Inst->setHasNoSignedWrap();
350 
351  if (promotedOpIsNUW(cast<Instruction>(I)))
352  Inst->setHasNoUnsignedWrap();
353 
354  if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
355  Inst->setIsExact(ExactOp->isExact());
356  }
357 
358  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
359 
360  I.replaceAllUsesWith(TruncRes);
361  I.eraseFromParent();
362 
363  return true;
364 }
365 
366 bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
367  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
368  "I does not need promotion to i32");
369 
370  IRBuilder<> Builder(&I);
371  Builder.SetCurrentDebugLocation(I.getDebugLoc());
372 
373  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
374  Value *ExtOp0 = nullptr;
375  Value *ExtOp1 = nullptr;
376  Value *NewICmp = nullptr;
377 
378  if (I.isSigned()) {
379  ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
380  ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
381  } else {
382  ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
383  ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
384  }
385  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);
386 
387  I.replaceAllUsesWith(NewICmp);
388  I.eraseFromParent();
389 
390  return true;
391 }
392 
393 bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
394  assert(needsPromotionToI32(I.getType()) &&
395  "I does not need promotion to i32");
396 
397  IRBuilder<> Builder(&I);
398  Builder.SetCurrentDebugLocation(I.getDebugLoc());
399 
400  Type *I32Ty = getI32Ty(Builder, I.getType());
401  Value *ExtOp1 = nullptr;
402  Value *ExtOp2 = nullptr;
403  Value *ExtRes = nullptr;
404  Value *TruncRes = nullptr;
405 
406  if (isSigned(I)) {
407  ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
408  ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
409  } else {
410  ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
411  ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
412  }
413  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
414  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
415 
416  I.replaceAllUsesWith(TruncRes);
417  I.eraseFromParent();
418 
419  return true;
420 }
421 
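// Illustrative sketch of the rewrite below (names made up): a uniform
//   %r = call i16 @llvm.bitreverse.i16(i16 %x)
// becomes roughly
//   %x32 = zext i16 %x to i32
//   %r32 = call i32 @llvm.bitreverse.i32(i32 %x32)
//   %sh  = lshr i32 %r32, 16
//   %r   = trunc i32 %sh to i16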
422 bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
423  IntrinsicInst &I) const {
424  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
425  "I must be bitreverse intrinsic");
426  assert(needsPromotionToI32(I.getType()) &&
427  "I does not need promotion to i32");
428 
429  IRBuilder<> Builder(&I);
430  Builder.SetCurrentDebugLocation(I.getDebugLoc());
431 
432  Type *I32Ty = getI32Ty(Builder, I.getType());
433  Function *I32 =
434  Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
435  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
436  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
437  Value *LShrOp =
438  Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
439  Value *TruncRes =
440  Builder.CreateTrunc(LShrOp, I.getType());
441 
442  I.replaceAllUsesWith(TruncRes);
443  I.eraseFromParent();
444 
445  return true;
446 }
447 
448 unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op) const {
449  return computeKnownBits(Op, *DL, 0, AC).countMaxActiveBits();
450 }
451 
452 unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op) const {
453  return ComputeMaxSignificantBits(Op, *DL, 0, AC);
454 }
455 
456 static void extractValues(IRBuilder<> &Builder,
457                           SmallVectorImpl<Value *> &Values, Value *V) {
458  auto *VT = dyn_cast<FixedVectorType>(V->getType());
459  if (!VT) {
460  Values.push_back(V);
461  return;
462  }
463 
464  for (int I = 0, E = VT->getNumElements(); I != E; ++I)
465  Values.push_back(Builder.CreateExtractElement(V, I));
466 }
467 
468 static Value *insertValues(IRBuilder<> &Builder,
469                            Type *Ty,
470  SmallVectorImpl<Value *> &Values) {
471  if (!Ty->isVectorTy()) {
472  assert(Values.size() == 1);
473  return Values[0];
474  }
475 
476  Value *NewVal = PoisonValue::get(Ty);
477  for (int I = 0, E = Values.size(); I != E; ++I)
478  NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);
479 
480  return NewVal;
481 }
482 
483 // Returns 24-bit or 48-bit (as per `NumBits` and `Size`) mul of `LHS` and
484 // `RHS`. `NumBits` is the number of KnownBits of the result and `Size` is the
485 // width of the original destination.
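// For illustration (operands assumed to be known to fit in 24 bits): when the
// result needs at most 32 bits this emits a single
//   %r = call i32 @llvm.amdgcn.mul.u24(i32 %lhs, i32 %rhs)   ; or mul.i24 if signed
// and for wider (up to 48-bit) results it combines the low and high halves,
// roughly:
//   %lo = call i32 @llvm.amdgcn.mul.u24(i32 %lhs, i32 %rhs)
//   %hi = call i32 @llvm.amdgcn.mulhi.u24(i32 %lhs, i32 %rhs)
//   %r  = or i64 (zext %lo to i64), (shl (zext %hi to i64), 32)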
486 static Value *getMul24(IRBuilder<> &Builder, Value *LHS, Value *RHS,
487                        unsigned Size, unsigned NumBits, bool IsSigned) {
488  if (Size <= 32 || NumBits <= 32) {
489  Intrinsic::ID ID =
490  IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
491  return Builder.CreateIntrinsic(ID, {}, {LHS, RHS});
492  }
493 
494  assert(NumBits <= 48);
495 
496  Intrinsic::ID LoID =
497  IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
498  Intrinsic::ID HiID =
499  IsSigned ? Intrinsic::amdgcn_mulhi_i24 : Intrinsic::amdgcn_mulhi_u24;
500 
501  Value *Lo = Builder.CreateIntrinsic(LoID, {}, {LHS, RHS});
502  Value *Hi = Builder.CreateIntrinsic(HiID, {}, {LHS, RHS});
503 
504  IntegerType *I64Ty = Builder.getInt64Ty();
505  Lo = Builder.CreateZExtOrTrunc(Lo, I64Ty);
506  Hi = Builder.CreateZExtOrTrunc(Hi, I64Ty);
507 
508  return Builder.CreateOr(Lo, Builder.CreateShl(Hi, 32));
509 }
510 
511 bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
512  if (I.getOpcode() != Instruction::Mul)
513  return false;
514 
515  Type *Ty = I.getType();
516  unsigned Size = Ty->getScalarSizeInBits();
517  if (Size <= 16 && ST->has16BitInsts())
518  return false;
519 
520  // Prefer scalar if this could be s_mul_i32
521  if (DA->isUniform(&I))
522  return false;
523 
524  Value *LHS = I.getOperand(0);
525  Value *RHS = I.getOperand(1);
526  IRBuilder<> Builder(&I);
527  Builder.SetCurrentDebugLocation(I.getDebugLoc());
528 
529  unsigned LHSBits = 0, RHSBits = 0;
530  bool IsSigned = false;
531 
532  if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&
533  (RHSBits = numBitsUnsigned(RHS)) <= 24) {
534  IsSigned = false;
535 
536  } else if (ST->hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&
537  (RHSBits = numBitsSigned(RHS)) <= 24) {
538  IsSigned = true;
539 
540  } else
541  return false;
542 
543  SmallVector<Value *, 4> LHSVals;
544  SmallVector<Value *, 4> RHSVals;
545  SmallVector<Value *, 4> ResultVals;
546  extractValues(Builder, LHSVals, LHS);
547  extractValues(Builder, RHSVals, RHS);
548 
549  IntegerType *I32Ty = Builder.getInt32Ty();
550  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
551  Value *LHS, *RHS;
552  if (IsSigned) {
553  LHS = Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty);
554  RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty);
555  } else {
556  LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
557  RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
558  }
559 
560  Value *Result =
561  getMul24(Builder, LHS, RHS, Size, LHSBits + RHSBits, IsSigned);
562 
563  if (IsSigned) {
564  ResultVals.push_back(
565  Builder.CreateSExtOrTrunc(Result, LHSVals[I]->getType()));
566  } else {
567  ResultVals.push_back(
568  Builder.CreateZExtOrTrunc(Result, LHSVals[I]->getType()));
569  }
570  }
571 
572  Value *NewVal = insertValues(Builder, Ty, ResultVals);
573  NewVal->takeName(&I);
574  I.replaceAllUsesWith(NewVal);
575  I.eraseFromParent();
576 
577  return true;
578 }
579 
580 // Find a select instruction, which may have been cast. This is mostly to deal
581 // with cases where i16 selects were promoted here to i32.
582 static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
583  Cast = nullptr;
584  if (SelectInst *Sel = dyn_cast<SelectInst>(V))
585  return Sel;
586 
587  if ((Cast = dyn_cast<CastInst>(V))) {
588  if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
589  return Sel;
590  }
591 
592  return nullptr;
593 }
594 
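// Illustrative sketch of the fold below (constants chosen arbitrarily):
//   %s = select i1 %c, i32 16, i32 8
//   %r = udiv i32 %s, 4
// folds to
//   %r = select i1 %c, i32 4, i32 2
// eliminating the division entirely, provided the select (and its cast, if any)
// has a single use.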
595 bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
596  // Don't do this unless the old select is going away. We want to eliminate the
597  // binary operator, not replace a binop with a select.
598  int SelOpNo = 0;
599 
600  CastInst *CastOp;
601 
602  // TODO: Should probably try to handle some cases with multiple
603  // users. Duplicating the select may be profitable for division.
604  SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
605  if (!Sel || !Sel->hasOneUse()) {
606  SelOpNo = 1;
607  Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
608  }
609 
610  if (!Sel || !Sel->hasOneUse())
611  return false;
612 
613  Constant *CT = dyn_cast<Constant>(Sel->getTrueValue());
614  Constant *CF = dyn_cast<Constant>(Sel->getFalseValue());
615  Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
616  if (!CBO || !CT || !CF)
617  return false;
618 
619  if (CastOp) {
620  if (!CastOp->hasOneUse())
621  return false;
622  CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), *DL);
623  CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), *DL);
624  }
625 
626  // TODO: Handle special 0/-1 cases DAG combine does, although we only really
627  // need to handle divisions here.
628  Constant *FoldedT = SelOpNo ?
629  ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, *DL) :
630  ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, *DL);
631  if (!FoldedT || isa<ConstantExpr>(FoldedT))
632  return false;
633 
634  Constant *FoldedF = SelOpNo ?
635  ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, *DL) :
636  ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, *DL);
637  if (!FoldedF || isa<ConstantExpr>(FoldedF))
638  return false;
639 
640  IRBuilder<> Builder(&BO);
641  Builder.SetCurrentDebugLocation(BO.getDebugLoc());
642  if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
643  Builder.setFastMathFlags(FPOp->getFastMathFlags());
644 
645  Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
646  FoldedT, FoldedF);
647  NewSelect->takeName(&BO);
648  BO.replaceAllUsesWith(NewSelect);
649  BO.eraseFromParent();
650  if (CastOp)
651  CastOp->eraseFromParent();
652  Sel->eraseFromParent();
653  return true;
654 }
655 
656 // Optimize fdiv with rcp:
657 //
658 // 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
659 // allowed with unsafe-fp-math or afn.
660 //
661 // a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
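// Illustrative sketch (assuming afn/unsafe-fp-math permits the inaccurate form):
//   %r = fdiv afn float 1.0, %x   -->  %r = call float @llvm.amdgcn.rcp.f32(float %x)
//   %r = fdiv afn float %a, %b    -->  %t = call float @llvm.amdgcn.rcp.f32(float %b)
//                                      %r = fmul afn float %a, %t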
662 static Value *optimizeWithRcp(Value *Num, Value *Den, bool AllowInaccurateRcp,
663  bool RcpIsAccurate, IRBuilder<> &Builder,
664  Module *Mod) {
665 
666  if (!AllowInaccurateRcp && !RcpIsAccurate)
667  return nullptr;
668 
669  Type *Ty = Den->getType();
670  if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
671  if (AllowInaccurateRcp || RcpIsAccurate) {
672  if (CLHS->isExactlyValue(1.0)) {
673        Function *Decl = Intrinsic::getDeclaration(
674          Mod, Intrinsic::amdgcn_rcp, Ty);
675 
676  // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
677  // the CI documentation has a worst case error of 1 ulp.
678  // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
679  // use it as long as we aren't trying to use denormals.
680  //
681  // v_rcp_f16 and v_rsq_f16 DO support denormals.
682 
683  // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
684  // insert rsq intrinsic here.
685 
686  // 1.0 / x -> rcp(x)
687  return Builder.CreateCall(Decl, { Den });
688  }
689 
690  // Same as for 1.0, but expand the sign out of the constant.
691  if (CLHS->isExactlyValue(-1.0)) {
692        Function *Decl = Intrinsic::getDeclaration(
693          Mod, Intrinsic::amdgcn_rcp, Ty);
694 
695  // -1.0 / x -> rcp (fneg x)
696  Value *FNeg = Builder.CreateFNeg(Den);
697  return Builder.CreateCall(Decl, { FNeg });
698  }
699  }
700  }
701 
702  if (AllowInaccurateRcp) {
703    Function *Decl = Intrinsic::getDeclaration(
704      Mod, Intrinsic::amdgcn_rcp, Ty);
705 
706  // Turn into multiply by the reciprocal.
707  // x / y -> x * (1.0 / y)
708  Value *Recip = Builder.CreateCall(Decl, { Den });
709  return Builder.CreateFMul(Num, Recip);
710  }
711  return nullptr;
712 }
713 
714 // optimize with fdiv.fast:
715 //
716 // a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
717 //
718 // 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
719 //
720 // NOTE: optimizeWithRcp should be tried first because rcp is the preference.
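// Illustrative sketch (f32 only, and assuming denormals are flushed or the
// numerator is exactly +/-1.0):
//   %r = fdiv float %a, %b, !fpmath !0     ; !0 = !{float 2.500000e+00}
// becomes
//   %r = call float @llvm.amdgcn.fdiv.fast(float %a, float %b)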
721 static Value *optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy,
722  bool HasDenormals, IRBuilder<> &Builder,
723  Module *Mod) {
724  // fdiv.fast can achieve 2.5 ULP accuracy.
725  if (ReqdAccuracy < 2.5f)
726  return nullptr;
727 
728  // Only have fdiv.fast for f32.
729  Type *Ty = Den->getType();
730  if (!Ty->isFloatTy())
731  return nullptr;
732 
733  bool NumIsOne = false;
734  if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
735  if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
736  NumIsOne = true;
737  }
738 
739  // fdiv.fast does not support denormals, but for 1.0/x it is always fine to use.
740  if (HasDenormals && !NumIsOne)
741  return nullptr;
742 
743  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
744  return Builder.CreateCall(Decl, { Num, Den });
745 }
746 
747 // Optimization is performed based on fpmath, fast-math flags and the denormal
748 // mode, lowering fdiv with either rcp or fdiv.fast.
749 //
750 // With rcp:
751 // 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
752 // allowed with unsafe-fp-math or afn.
753 //
754 // a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
755 //
756 // With fdiv.fast:
757 // a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
758 //
759 // 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
760 //
761 // NOTE: rcp is the preference in cases that both are legal.
762 bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
763 
764  Type *Ty = FDiv.getType()->getScalarType();
765 
766  // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
767  // expansion around them in codegen.
768  if (Ty->isDoubleTy())
769  return false;
770 
771  // No intrinsic for fdiv16 if target does not support f16.
772  if (Ty->isHalfTy() && !ST->has16BitInsts())
773  return false;
774 
775  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
776  const float ReqdAccuracy = FPOp->getFPAccuracy();
777 
778  // Inaccurate rcp is allowed with unsafe-fp-math or afn.
779  FastMathFlags FMF = FPOp->getFastMathFlags();
780  const bool AllowInaccurateRcp = HasUnsafeFPMath || FMF.approxFunc();
781 
782  // rcp_f16 is accurate for !fpmath >= 1.0ulp.
783  // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
784  // rcp_f64 is never accurate.
785  const bool RcpIsAccurate = (Ty->isHalfTy() && ReqdAccuracy >= 1.0f) ||
786  (Ty->isFloatTy() && !HasFP32Denormals && ReqdAccuracy >= 1.0f);
787 
788  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
789  Builder.setFastMathFlags(FMF);
790  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
791 
792  Value *Num = FDiv.getOperand(0);
793  Value *Den = FDiv.getOperand(1);
794 
795  Value *NewFDiv = nullptr;
796  if (auto *VT = dyn_cast<FixedVectorType>(FDiv.getType())) {
797  NewFDiv = PoisonValue::get(VT);
798 
799  // FIXME: Doesn't do the right thing for cases where the vector is partially
800  // constant. This works when the scalarizer pass is run first.
801  for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
802  Value *NumEltI = Builder.CreateExtractElement(Num, I);
803  Value *DenEltI = Builder.CreateExtractElement(Den, I);
804  // Try rcp first.
805  Value *NewElt = optimizeWithRcp(NumEltI, DenEltI, AllowInaccurateRcp,
806  RcpIsAccurate, Builder, Mod);
807  if (!NewElt) // Try fdiv.fast.
808  NewElt = optimizeWithFDivFast(NumEltI, DenEltI, ReqdAccuracy,
809  HasFP32Denormals, Builder, Mod);
810  if (!NewElt) // Keep the original.
811  NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
812 
813  NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
814  }
815  } else { // Scalar FDiv.
816  // Try rcp first.
817  NewFDiv = optimizeWithRcp(Num, Den, AllowInaccurateRcp, RcpIsAccurate,
818  Builder, Mod);
819  if (!NewFDiv) { // Try fdiv.fast.
820  NewFDiv = optimizeWithFDivFast(Num, Den, ReqdAccuracy, HasFP32Denormals,
821  Builder, Mod);
822  }
823  }
824 
825  if (NewFDiv) {
826  FDiv.replaceAllUsesWith(NewFDiv);
827  NewFDiv->takeName(&FDiv);
828  FDiv.eraseFromParent();
829  }
830 
831  return !!NewFDiv;
832 }
833 
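// Illustrative sketch of the xor fold below: a negated class test such as
//   %c = call i1 @llvm.amdgcn.class.f32(float %x, i32 3)
//   %n = xor i1 %c, true
// is folded by inverting the 10-bit class mask in the call itself (3 ^ 0x3ff),
// so the xor disappears.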
834 bool AMDGPUCodeGenPrepare::visitXor(BinaryOperator &I) {
835  // Match the Xor instruction, its type and its operands
836  IntrinsicInst *IntrinsicCall = dyn_cast<IntrinsicInst>(I.getOperand(0));
837  ConstantInt *RHS = dyn_cast<ConstantInt>(I.getOperand(1));
838  if (!RHS || !IntrinsicCall || RHS->getSExtValue() != -1)
839  return visitBinaryOperator(I);
840 
841  // Check that the call is to the amdgcn.class intrinsic and that it
842  // has only one use.
843  if (IntrinsicCall->getIntrinsicID() != Intrinsic::amdgcn_class ||
844  !IntrinsicCall->hasOneUse())
845  return visitBinaryOperator(I);
846 
847  // "Not" the second argument of the intrinsic call
848  ConstantInt *Arg = dyn_cast<ConstantInt>(IntrinsicCall->getOperand(1));
849  if (!Arg)
850  return visitBinaryOperator(I);
851 
852  IntrinsicCall->setOperand(
853  1, ConstantInt::get(Arg->getType(), Arg->getZExtValue() ^ 0x3ff));
854  I.replaceAllUsesWith(IntrinsicCall);
855  I.eraseFromParent();
856  return true;
857 }
858 
859 static bool hasUnsafeFPMath(const Function &F) {
860  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
861  return Attr.getValueAsBool();
862 }
863 
864 static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
865  Value *LHS, Value *RHS) {
866  Type *I32Ty = Builder.getInt32Ty();
867  Type *I64Ty = Builder.getInt64Ty();
868 
869  Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
870  Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
871  Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
872  Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
873  Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
874  Hi = Builder.CreateTrunc(Hi, I32Ty);
875  return std::make_pair(Lo, Hi);
876 }
877 
878 static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
879  return getMul64(Builder, LHS, RHS).second;
880 }
881 
882 /// Figure out how many bits are really needed for this division. \p AtLeast is
883 /// an optimization hint to bypass the second ComputeNumSignBits call if the
884 /// first one is insufficient. Returns -1 on failure.
885 int AMDGPUCodeGenPrepare::getDivNumBits(BinaryOperator &I,
886  Value *Num, Value *Den,
887  unsigned AtLeast, bool IsSigned) const {
888  const DataLayout &DL = Mod->getDataLayout();
889  unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
890  if (LHSSignBits < AtLeast)
891  return -1;
892 
893  unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
894  if (RHSSignBits < AtLeast)
895  return -1;
896 
897  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
898  unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits;
899  if (IsSigned)
900  ++DivBits;
901  return DivBits;
902 }
903 
904 // The fractional part of a float is enough to accurately represent up to
905 // a 24-bit signed integer.
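// (A float has a 24-bit significand, so quotients of this size round-trip
// exactly through the float computation below: roughly q = trunc(float(a) *
// rcp(float(b))), followed by a +/-1 correction when the remainder estimate is
// still at least |b|.)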
906 Value *AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
907  BinaryOperator &I,
908  Value *Num, Value *Den,
909  bool IsDiv, bool IsSigned) const {
910  int DivBits = getDivNumBits(I, Num, Den, 9, IsSigned);
911  if (DivBits == -1)
912  return nullptr;
913  return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
914 }
915 
916 Value *AMDGPUCodeGenPrepare::expandDivRem24Impl(IRBuilder<> &Builder,
917  BinaryOperator &I,
918  Value *Num, Value *Den,
919  unsigned DivBits,
920  bool IsDiv, bool IsSigned) const {
921  Type *I32Ty = Builder.getInt32Ty();
922  Num = Builder.CreateTrunc(Num, I32Ty);
923  Den = Builder.CreateTrunc(Den, I32Ty);
924 
925  Type *F32Ty = Builder.getFloatTy();
926  ConstantInt *One = Builder.getInt32(1);
927  Value *JQ = One;
928 
929  if (IsSigned) {
930  // char|short jq = ia ^ ib;
931  JQ = Builder.CreateXor(Num, Den);
932 
933  // jq = jq >> (bitsize - 2)
934  JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));
935 
936  // jq = jq | 0x1
937  JQ = Builder.CreateOr(JQ, One);
938  }
939 
940  // int ia = (int)LHS;
941  Value *IA = Num;
942 
943  // int ib = (int)RHS;
944  Value *IB = Den;
945 
946  // float fa = (float)ia;
947  Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
948  : Builder.CreateUIToFP(IA, F32Ty);
949 
950  // float fb = (float)ib;
951  Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty)
952  : Builder.CreateUIToFP(IB,F32Ty);
953 
954  Function *RcpDecl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp,
955  Builder.getFloatTy());
956  Value *RCP = Builder.CreateCall(RcpDecl, { FB });
957  Value *FQM = Builder.CreateFMul(FA, RCP);
958 
959  // fq = trunc(fqm);
960  CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
961  FQ->copyFastMathFlags(Builder.getFastMathFlags());
962 
963  // float fqneg = -fq;
964  Value *FQNeg = Builder.CreateFNeg(FQ);
965 
966  // float fr = mad(fqneg, fb, fa);
967  auto FMAD = !ST->hasMadMacF32Insts()
968  ? Intrinsic::fma
969  : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
970  Value *FR = Builder.CreateIntrinsic(FMAD,
971  {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);
972 
973  // int iq = (int)fq;
974  Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
975  : Builder.CreateFPToUI(FQ, I32Ty);
976 
977  // fr = fabs(fr);
978  FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);
979 
980  // fb = fabs(fb);
981  FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);
982 
983  // int cv = fr >= fb;
984  Value *CV = Builder.CreateFCmpOGE(FR, FB);
985 
986  // jq = (cv ? jq : 0);
987  JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));
988 
989  // dst = iq + jq;
990  Value *Div = Builder.CreateAdd(IQ, JQ);
991 
992  Value *Res = Div;
993  if (!IsDiv) {
994  // Rem needs compensation; it's easier to recompute it.
995  Value *Rem = Builder.CreateMul(Div, Den);
996  Res = Builder.CreateSub(Num, Rem);
997  }
998 
999  if (DivBits != 0 && DivBits < 32) {
1000  // Extend in register from the number of bits this divide really is.
1001  if (IsSigned) {
1002  int InRegBits = 32 - DivBits;
1003 
1004  Res = Builder.CreateShl(Res, InRegBits);
1005  Res = Builder.CreateAShr(Res, InRegBits);
1006  } else {
1007  ConstantInt *TruncMask
1008  = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
1009  Res = Builder.CreateAnd(Res, TruncMask);
1010  }
1011  }
1012 
1013  return Res;
1014 }
1015 
1016 // Try to recognize special cases for which the DAG will emit better expansions
1017 // than the general expansion we do here.
1018 
1019 // TODO: It would be better to just directly handle those optimizations here.
1020 bool AMDGPUCodeGenPrepare::divHasSpecialOptimization(
1021  BinaryOperator &I, Value *Num, Value *Den) const {
1022  if (Constant *C = dyn_cast<Constant>(Den)) {
1023  // Arbitrary constants get a better expansion as long as a wider mulhi is
1024  // legal.
1025  if (C->getType()->getScalarSizeInBits() <= 32)
1026  return true;
1027 
1028  // TODO: Sdiv check for not exact for some reason.
1029 
1030  // If there's no wider mulhi, there's only a better expansion for powers of
1031  // two.
1032  // TODO: Should really know for each vector element.
1033  if (isKnownToBeAPowerOfTwo(C, *DL, true, 0, AC, &I, DT))
1034  return true;
1035 
1036  return false;
1037  }
1038 
1039  if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
1040  // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
1041  if (BinOpDen->getOpcode() == Instruction::Shl &&
1042  isa<Constant>(BinOpDen->getOperand(0)) &&
1043  isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), *DL, true,
1044  0, AC, &I, DT)) {
1045  return true;
1046  }
1047  }
1048 
1049  return false;
1050 }
1051 
1052 static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout *DL) {
1053  // Check whether the sign can be determined statically.
1054  KnownBits Known = computeKnownBits(V, *DL);
1055  if (Known.isNegative())
1056  return Constant::getAllOnesValue(V->getType());
1057  if (Known.isNonNegative())
1058  return Constant::getNullValue(V->getType());
1059  return Builder.CreateAShr(V, Builder.getInt32(31));
1060 }
1061 
1062 Value *AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
1063  BinaryOperator &I, Value *X,
1064  Value *Y) const {
1065  Instruction::BinaryOps Opc = I.getOpcode();
1066  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
1067  Opc == Instruction::SRem || Opc == Instruction::SDiv);
1068 
1069  FastMathFlags FMF;
1070  FMF.setFast();
1071  Builder.setFastMathFlags(FMF);
1072 
1073  if (divHasSpecialOptimization(I, X, Y))
1074  return nullptr; // Keep it for later optimization.
1075 
1076  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
1077  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
1078 
1079  Type *Ty = X->getType();
1080  Type *I32Ty = Builder.getInt32Ty();
1081  Type *F32Ty = Builder.getFloatTy();
1082 
1083  if (Ty->getScalarSizeInBits() < 32) {
1084  if (IsSigned) {
1085  X = Builder.CreateSExt(X, I32Ty);
1086  Y = Builder.CreateSExt(Y, I32Ty);
1087  } else {
1088  X = Builder.CreateZExt(X, I32Ty);
1089  Y = Builder.CreateZExt(Y, I32Ty);
1090  }
1091  }
1092 
1093  if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
1094  return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) :
1095  Builder.CreateZExtOrTrunc(Res, Ty);
1096  }
1097 
1098  ConstantInt *Zero = Builder.getInt32(0);
1099  ConstantInt *One = Builder.getInt32(1);
1100 
1101  Value *Sign = nullptr;
1102  if (IsSigned) {
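   // (Branch-free abs, for reference: with s = x >> 31 being all-ones for
   // negative x and zero otherwise, (x + s) ^ s == |x|. Both operands are made
   // non-negative here and the sign is re-applied to the result further down.)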
1103  Value *SignX = getSign32(X, Builder, DL);
1104  Value *SignY = getSign32(Y, Builder, DL);
1105  // Remainder sign is the same as LHS
1106  Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;
1107 
1108  X = Builder.CreateAdd(X, SignX);
1109  Y = Builder.CreateAdd(Y, SignY);
1110 
1111  X = Builder.CreateXor(X, SignX);
1112  Y = Builder.CreateXor(Y, SignY);
1113  }
1114 
1115  // The algorithm here is based on ideas from "Software Integer Division", Tom
1116  // Rodeheffer, August 2008.
1117  //
1118  // unsigned udiv(unsigned x, unsigned y) {
1119  // // Initial estimate of inv(y). The constant is less than 2^32 to ensure
1120  // // that this is a lower bound on inv(y), even if some of the calculations
1121  // // round up.
1122  // unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
1123  //
1124  // // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
1125  // // Empirically this is guaranteed to give a "two-y" lower bound on
1126  // // inv(y).
1127  // z += umulh(z, -y * z);
1128  //
1129  // // Quotient/remainder estimate.
1130  // unsigned q = umulh(x, z);
1131  // unsigned r = x - q * y;
1132  //
1133  // // Two rounds of quotient/remainder refinement.
1134  // if (r >= y) {
1135  // ++q;
1136  // r -= y;
1137  // }
1138  // if (r >= y) {
1139  // ++q;
1140  // r -= y;
1141  // }
1142  //
1143  // return q;
1144  // }
1145 
1146  // Initial estimate of inv(y).
1147  Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
1148  Function *Rcp = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, F32Ty);
1149  Value *RcpY = Builder.CreateCall(Rcp, {FloatY});
1150  Constant *Scale = ConstantFP::get(F32Ty, BitsToFloat(0x4F7FFFFE));
1151  Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
1152  Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);
1153 
1154  // One round of UNR.
1155  Value *NegY = Builder.CreateSub(Zero, Y);
1156  Value *NegYZ = Builder.CreateMul(NegY, Z);
1157  Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ));
1158 
1159  // Quotient/remainder estimate.
1160  Value *Q = getMulHu(Builder, X, Z);
1161  Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y));
1162 
1163  // First quotient/remainder refinement.
1164  Value *Cond = Builder.CreateICmpUGE(R, Y);
1165  if (IsDiv)
1166  Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1167  R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1168 
1169  // Second quotient/remainder refinement.
1170  Cond = Builder.CreateICmpUGE(R, Y);
1171  Value *Res;
1172  if (IsDiv)
1173  Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1174  else
1175  Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1176 
1177  if (IsSigned) {
1178  Res = Builder.CreateXor(Res, Sign);
1179  Res = Builder.CreateSub(Res, Sign);
1180  }
1181 
1182  Res = Builder.CreateTrunc(Res, Ty);
1183 
1184  return Res;
1185 }
1186 
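// Illustrative example for the shrinking below: if both operands of a 64-bit
// udiv are known to fit in 32 bits (e.g. they were zero-extended from i32),
// the division is performed with the 32-bit expansion above and the result is
// zero-extended (sign-extended for sdiv/srem) back to i64.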
1187 Value *AMDGPUCodeGenPrepare::shrinkDivRem64(IRBuilder<> &Builder,
1188  BinaryOperator &I,
1189  Value *Num, Value *Den) const {
1190  if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
1191  return nullptr; // Keep it for later optimization.
1192 
1193  Instruction::BinaryOps Opc = I.getOpcode();
1194 
1195  bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
1196  bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
1197 
1198  int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
1199  if (NumDivBits == -1)
1200  return nullptr;
1201 
1202  Value *Narrowed = nullptr;
1203  if (NumDivBits <= 24) {
1204  Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
1205  IsDiv, IsSigned);
1206  } else if (NumDivBits <= 32) {
1207  Narrowed = expandDivRem32(Builder, I, Num, Den);
1208  }
1209 
1210  if (Narrowed) {
1211  return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
1212  Builder.CreateZExt(Narrowed, Num->getType());
1213  }
1214 
1215  return nullptr;
1216 }
1217 
1218 void AMDGPUCodeGenPrepare::expandDivRem64(BinaryOperator &I) const {
1219  Instruction::BinaryOps Opc = I.getOpcode();
1220  // Do the general expansion.
1221  if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
1222  expandDivisionUpTo64Bits(&I);
1223  return;
1224  }
1225 
1226  if (Opc == Instruction::URem || Opc == Instruction::SRem) {
1227  expandRemainderUpTo64Bits(&I);
1228  return;
1229  }
1230 
1231  llvm_unreachable("not a division");
1232 }
1233 
1234 bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
1235  if (foldBinOpIntoSelect(I))
1236  return true;
1237 
1238  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
1239  DA->isUniform(&I) && promoteUniformOpToI32(I))
1240  return true;
1241 
1242  if (UseMul24Intrin && replaceMulWithMul24(I))
1243  return true;
1244 
1245  bool Changed = false;
1246  Instruction::BinaryOps Opc = I.getOpcode();
1247  Type *Ty = I.getType();
1248  Value *NewDiv = nullptr;
1249  unsigned ScalarSize = Ty->getScalarSizeInBits();
1250 
1251  SmallVector<BinaryOperator *, 8> Div64ToExpand;
1252 
1253  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
1254  Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
1255  ScalarSize <= 64 &&
1256  !DisableIDivExpand) {
1257  Value *Num = I.getOperand(0);
1258  Value *Den = I.getOperand(1);
1259  IRBuilder<> Builder(&I);
1260  Builder.SetCurrentDebugLocation(I.getDebugLoc());
1261 
1262  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1263  NewDiv = PoisonValue::get(VT);
1264 
1265  for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
1266  Value *NumEltN = Builder.CreateExtractElement(Num, N);
1267  Value *DenEltN = Builder.CreateExtractElement(Den, N);
1268 
1269  Value *NewElt;
1270  if (ScalarSize <= 32) {
1271  NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
1272  if (!NewElt)
1273  NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1274  } else {
1275  // See if this 64-bit division can be shrunk to 32/24-bits before
1276  // producing the general expansion.
1277  NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
1278  if (!NewElt) {
1279  // The general 64-bit expansion introduces control flow and doesn't
1280  // return the new value. Just insert a scalar copy and defer
1281  // expanding it.
1282  NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1283  Div64ToExpand.push_back(cast<BinaryOperator>(NewElt));
1284  }
1285  }
1286 
1287  NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
1288  }
1289  } else {
1290  if (ScalarSize <= 32)
1291  NewDiv = expandDivRem32(Builder, I, Num, Den);
1292  else {
1293  NewDiv = shrinkDivRem64(Builder, I, Num, Den);
1294  if (!NewDiv)
1295  Div64ToExpand.push_back(&I);
1296  }
1297  }
1298 
1299  if (NewDiv) {
1300  I.replaceAllUsesWith(NewDiv);
1301  I.eraseFromParent();
1302  Changed = true;
1303  }
1304  }
1305 
1306  if (ExpandDiv64InIR) {
1307  // TODO: We get much worse code in specially handled constant cases.
1308  for (BinaryOperator *Div : Div64ToExpand) {
1309  expandDivRem64(*Div);
1310  Changed = true;
1311  }
1312  }
1313 
1314  return Changed;
1315 }
1316 
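// Illustrative sketch of the widening below (addrspace(4) is constant memory):
//   %v = load i8, ptr addrspace(4) %p, align 4
// becomes roughly
//   %w = load i32, ptr addrspace(4) %p, align 4
//   %t = trunc i32 %w to i8
// which lets the backend select a scalar load instead of a vector load.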
1317 bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
1318  if (!WidenLoads)
1319  return false;
1320 
1321  if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
1322  I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1323  canWidenScalarExtLoad(I)) {
1324  IRBuilder<> Builder(&I);
1325  Builder.SetCurrentDebugLocation(I.getDebugLoc());
1326 
1327  Type *I32Ty = Builder.getInt32Ty();
1328  Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
1329  Value *BitCast= Builder.CreateBitCast(I.getPointerOperand(), PT);
1330  LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, BitCast);
1331  WidenLoad->copyMetadata(I);
1332 
1333  // If we have range metadata, we need to convert the type, and not make
1334  // assumptions about the high bits.
1335  if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
1336  ConstantInt *Lower =
1337  mdconst::extract<ConstantInt>(Range->getOperand(0));
1338 
1339  if (Lower->isNullValue()) {
1340  WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
1341  } else {
1342  Metadata *LowAndHigh[] = {
1343  ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
1344  // Don't make assumptions about the high bits.
1345  ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
1346  };
1347 
1348  WidenLoad->setMetadata(LLVMContext::MD_range,
1349  MDNode::get(Mod->getContext(), LowAndHigh));
1350  }
1351  }
1352 
1353  int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
1354  Type *IntNTy = Builder.getIntNTy(TySize);
1355  Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
1356  Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
1357  I.replaceAllUsesWith(ValOrig);
1358  I.eraseFromParent();
1359  return true;
1360  }
1361 
1362  return false;
1363 }
1364 
1365 bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
1366  bool Changed = false;
1367 
1368  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
1369  DA->isUniform(&I))
1370  Changed |= promoteUniformOpToI32(I);
1371 
1372  return Changed;
1373 }
1374 
1375 bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
1376  bool Changed = false;
1377 
1378  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
1379  DA->isUniform(&I))
1380  Changed |= promoteUniformOpToI32(I);
1381 
1382  return Changed;
1383 }
1384 
1385 bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
1386  switch (I.getIntrinsicID()) {
1387  case Intrinsic::bitreverse:
1388  return visitBitreverseIntrinsicInst(I);
1389  default:
1390  return false;
1391  }
1392 }
1393 
1394 bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
1395  bool Changed = false;
1396 
1397  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
1398  DA->isUniform(&I))
1399  Changed |= promoteUniformBitreverseToI32(I);
1400 
1401  return Changed;
1402 }
1403 
1404 bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
1405  Mod = &M;
1406  DL = &Mod->getDataLayout();
1407  return false;
1408 }
1409 
1410 bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
1411  if (skipFunction(F))
1412  return false;
1413 
1414  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
1415  if (!TPC)
1416  return false;
1417 
1418  const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
1419  ST = &TM.getSubtarget<GCNSubtarget>(F);
1420  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1421  DA = &getAnalysis<LegacyDivergenceAnalysis>();
1422 
1423  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
1424  DT = DTWP ? &DTWP->getDomTree() : nullptr;
1425 
1426  HasUnsafeFPMath = hasUnsafeFPMath(F);
1427 
1428  AMDGPU::SIModeRegisterDefaults Mode(F);
1429  HasFP32Denormals = Mode.allFP32Denormals();
1430 
1431  bool MadeChange = false;
1432 
1433  Function::iterator NextBB;
1434  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
1435  BasicBlock *BB = &*FI;
1436  NextBB = std::next(FI);
1437 
1438  BasicBlock::iterator Next;
1439  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; I = Next) {
1440  Next = std::next(I);
1441 
1442  MadeChange |= visit(*I);
1443 
1444  if (Next != E) { // Control flow changed
1445  BasicBlock *NextInstBB = Next->getParent();
1446  if (NextInstBB != BB) {
1447  BB = NextInstBB;
1448  E = BB->end();
1449  FE = F.end();
1450  }
1451  }
1452  }
1453  }
1454 
1455  return MadeChange;
1456 }
1457 
1458 INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
1459  "AMDGPU IR optimizations", false, false)
1460 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
1461 INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
1462 INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
1463                     false, false)
1464 
1465 char AMDGPUCodeGenPrepare::ID = 0;
1466 
1467 FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
1468  return new AMDGPUCodeGenPrepare();
1469 }
llvm::Check::Size
@ Size
Definition: FileCheck.h:77
AssumptionCache.h
Signed
@ Signed
Definition: NVPTXISelLowering.cpp:4715
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
llvm::wasm::ValType::I32
@ I32
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:113
llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
llvm::BasicBlock::iterator
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:87
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:104
T
llvm::Function
Definition: Function.h:60
llvm::Attribute
Definition: Attributes.h:66
Pass.h
llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:54
getMul64
static std::pair< Value *, Value * > getMul64(IRBuilder<> &Builder, Value *LHS, Value *RHS)
Definition: AMDGPUCodeGenPrepare.cpp:864
llvm::DataLayout::getTypeSizeInBits
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:673
llvm::PointerType::get
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Definition: Type.cpp:727
llvm::Type::getScalarType
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:328
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1199
llvm::IRBuilder<>
ValueTracking.h
llvm::ComputeMaxSignificantBits
unsigned ComputeMaxSignificantBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr)
Get the upper bound on bit size for this Value Op as a signed integer.
Definition: ValueTracking.cpp:393
llvm::DominatorTree
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
extractValues
static void extractValues(IRBuilder<> &Builder, SmallVectorImpl< Value * > &Values, Value *V)
Definition: AMDGPUCodeGenPrepare.cpp:456
llvm::ConstantAsMetadata::get
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:420
llvm::ComputeNumSignBits
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
Definition: ValueTracking.cpp:385
llvm::SelectInst::getFalseValue
const Value * getFalseValue() const
Definition: Instructions.h:1784
llvm::GCNSubtarget
Definition: GCNSubtarget.h:31
llvm::Instruction::copyMetadata
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Definition: Instruction.cpp:878
RHS
Value * RHS
Definition: X86PartialReduction.cpp:76
llvm::FastMathFlags
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:21
ConstantFolding.h
llvm::cl::ReallyHidden
@ ReallyHidden
Definition: CommandLine.h:141
llvm::Attribute::getValueAsBool
bool getValueAsBool() const
Return the attribute's value as a boolean.
Definition: Attributes.cpp:298
optimizeWithFDivFast
static Value * optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy, bool HasDenormals, IRBuilder<> &Builder, Module *Mod)
Definition: AMDGPUCodeGenPrepare.cpp:721
llvm::MDNode::get
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1400
F
#define F(x, y, z)
Definition: MD5.cpp:55
llvm::RISCVFenceField::R
@ R
Definition: RISCVBaseInfo.h:265
llvm::Instruction::setMetadata
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1456
KnownBits.h
insertValues
static Value * insertValues(IRBuilder<> &Builder, Type *Ty, SmallVectorImpl< Value * > &Values)
Definition: AMDGPUCodeGenPrepare.cpp:468
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:55
Arg
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
Definition: AMDGPULibCalls.cpp:187
LHS
Value * LHS
Definition: X86PartialReduction.cpp:75
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
llvm::KnownBits::isNonNegative
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:99
DEBUG_TYPE
#define DEBUG_TYPE
Definition: AMDGPUCodeGenPrepare.cpp:31
llvm::AMDGPUTargetMachine
Definition: AMDGPUTargetMachine.h:29
llvm::SelectInst::getCondition
const Value * getCondition() const
Definition: Instructions.h:1782
llvm::FastMathFlags::approxFunc
bool approxFunc() const
Definition: FMF.h:72
f
Itanium Name Demangler i e convert the string _Z1fv into f()". You can also use the CRTP base ManglingParser to perform some simple analysis on the mangled name
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
llvm::ConstantFoldBinaryOpOperands
Constant * ConstantFoldBinaryOpOperands(unsigned Opcode, Constant *LHS, Constant *RHS, const DataLayout &DL)
Attempt to constant fold a binary operation with the specified operands.
Definition: ConstantFolding.cpp:1339
llvm::ConstantFoldCastOperand
Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
Definition: ConstantFolding.cpp:1411
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
Y
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
llvm::ms_demangle::QualifierMangleMode::Result
@ Result
llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:246
false
Definition: StackSlotColoring.cpp:141
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
llvm::IntegerType
Class to represent integer types.
Definition: DerivedTypes.h:40
optimizeWithRcp
static Value * optimizeWithRcp(Value *Num, Value *Den, bool AllowInaccurateRcp, bool RcpIsAccurate, IRBuilder<> &Builder, Module *Mod)
Definition: AMDGPUCodeGenPrepare.cpp:662
llvm::Constant::getAllOnesValue
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:395
AMDGPU
Definition: AMDGPUReplaceLDSUseWithPointer.cpp:114
llvm::BinaryOperator::getOpcode
BinaryOps getOpcode() const
Definition: InstrTypes.h:392
llvm::Instruction
Definition: Instruction.h:42
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:189
llvm::ConstantFP
ConstantFP - Floating Point Values [float, double].
Definition: Constants.h:257
llvm::ConstantInt::get
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:879
llvm::LegacyDivergenceAnalysis
Definition: LegacyDivergenceAnalysis.h:31
llvm::AArch64PACKey::IA
@ IA
Definition: AArch64BaseInfo.h:819
IR
Statically lint-checks LLVM IR
Definition: Lint.cpp:746
optimizations
AMDGPU IR optimizations
Definition: AMDGPUCodeGenPrepare.cpp:1462
llvm::KnownBits::isNegative
bool isNegative() const
Returns true if this value is known to be negative.
Definition: KnownBits.h:96
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:684
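A small illustrative sketch of FixedVectorType::get together with the splat behaviour of ConstantInt::get noted above; the function name is an assumption.

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"

using namespace llvm;

// Illustrative helper: build the <4 x i32> type and a constant with 7 splat
// across all four lanes.
static Constant *makeSplatSeven(LLVMContext &Ctx) {
  auto *VecTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
  return ConstantInt::get(VecTy, 7);
}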
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::Metadata
Root of the metadata hierarchy.
Definition: Metadata.h:62
hasUnsafeFPMath
static bool hasUnsafeFPMath(const Function &F)
Definition: AMDGPUCodeGenPrepare.cpp:859
llvm::CallingConv::ID
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
X
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
llvm::Instruction::getMetadata
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:276
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:389
llvm::LegacyLegalizeActions::Lower
@ Lower
The operation itself must be expressed in terms of simpler actions on this target.
Definition: LegacyLegalizerInfo.h:58
llvm::cl::opt< bool >
llvm::AArch64PACKey::DA
@ DA
Definition: AArch64BaseInfo.h:821
llvm::AMDGPUAS::CONSTANT_ADDRESS
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
Definition: AMDGPU.h:375
llvm::Constant
This is an important base class in LLVM.
Definition: Constant.h:41
llvm::Instruction::eraseFromParent
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:81
llvm::ISD::FMAD
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:486
llvm::ICmpInst
This instruction compares its operands according to the predicate given to the constructor.
Definition: Instructions.h:1186
llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
promotedOpIsNUW
static bool promotedOpIsNUW(const Instruction &I)
Definition: AMDGPUCodeGenPrepare.cpp:297
I
#define I(x, y, z)
Definition: MD5.cpp:58
llvm::AArch64PACKey::IB
@ IB
Definition: AArch64BaseInfo.h:820
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:447
llvm::SelectInst::getTrueValue
const Value * getTrueValue() const
Definition: Instructions.h:1783
TargetPassConfig.h
llvm::Type::isHalfTy
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:142
llvm::computeKnownBits
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, OptimizationRemarkEmitter *ORE=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
Definition: ValueTracking.cpp:197
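A hedged sketch of a typical computeKnownBits query; the helper name and parameter set are illustrative.

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/KnownBits.h"

using namespace llvm;

// Illustrative helper: compute the known bits of V and report whether its
// sign bit is known to be clear.
static bool signBitKnownClear(const Value *V, const DataLayout &DL,
                              AssumptionCache *AC, const Instruction *CxtI,
                              const DominatorTree *DT) {
  KnownBits Known(V->getType()->getScalarSizeInBits());
  computeKnownBits(V, Known, DL, /*Depth=*/0, AC, CxtI, DT);
  return Known.isNonNegative();
}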
IRBuilder.h
assert
assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef())
llvm::FPMathOperator
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition: Operator.h:167
llvm::SelectInst
This class represents the LLVM 'select' instruction.
Definition: Instructions.h:1735
Mode
SI Whole Quad Mode
Definition: SIWholeQuadMode.cpp:264
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:66
getOpcode
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
Builder
Assume Builder
Definition: AssumeBundleBuilder.cpp:651
llvm::User::setOperand
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
llvm::AssumptionCacheTracker
An immutable pass that tracks lazily created AssumptionCache objects.
Definition: AssumptionCache.h:202
llvm::BinaryOperator
Definition: InstrTypes.h:188
llvm::min
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:357
llvm::expandDivisionUpTo64Bits
bool expandDivisionUpTo64Bits(BinaryOperator *Div)
Generate code to divide two integers, replacing Div with the generated code.
Definition: IntegerDivision.cpp:600
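A hedged sketch of how expandDivisionUpTo64Bits and its remainder counterpart can be dispatched on the opcode; the dispatch below is illustrative, not this pass's exact code.

#include "llvm/IR/InstrTypes.h"
#include "llvm/Transforms/Utils/IntegerDivision.h"

using namespace llvm;

// Illustrative helper: expand an integer division or remainder of at most
// 64 bits into plain IR, rewriting the instruction in place.
static void expandDivRemInIR(BinaryOperator &I) {
  switch (I.getOpcode()) {
  case Instruction::UDiv:
  case Instruction::SDiv:
    expandDivisionUpTo64Bits(&I);
    break;
  case Instruction::URem:
  case Instruction::SRem:
    expandRemainderUpTo64Bits(&I);
    break;
  default:
    break; // not a division or remainder
  }
}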
Mul
BinaryOperator * Mul
Definition: X86PartialReduction.cpp:70
Cond
SmallVector< MachineOperand, 4 > Cond
Definition: BasicBlockSections.cpp:138
llvm::logicalview::LVAttributeKind::Range
@ Range
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
AMDGPU.h
InstVisitor.h
llvm::AssumptionCache
A cache of @llvm.assume calls within a function.
Definition: AssumptionCache.h:42
llvm::logicalview::LVAttributeKind::Zero
@ Zero
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:143
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
llvm::Value::replaceAllUsesWith
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:532
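The entries for replaceAllUsesWith, takeName and eraseFromParent combine into a common rewrite idiom; the sketch below is illustrative rather than code quoted from this pass.

#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"

using namespace llvm;

// Illustrative helper: make every user of Old refer to New, then delete the
// now-dead instruction.
static void replaceAndErase(Instruction *Old, Value *New) {
  New->takeName(Old);           // keep the original name for readable IR
  Old->replaceAllUsesWith(New); // redirect all uses
  Old->eraseFromParent();       // unlink from the basic block and delete
}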
llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition: ilist_node.h:82
DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: AArch64SLSHardening.cpp:76
llvm::InstVisitor
Base class for instruction visitors.
Definition: InstVisitor.h:78
trunc
We have fiadd patterns now, but the following have the same cost and complexity; we need a way to specify that the latter is more profitable. The FP stackifier should handle simple permutates to reduce the number of shuffles, e.g. trunc.
Definition: README-FPStack.txt:63
llvm::FastMathFlags::setFast
void setFast(bool B=true)
Definition: FMF.h:98
llvm::Instruction::copyFastMathFlags
void copyFastMathFlags(FastMathFlags FMF)
Convenience function for transferring all fast-math flag values to this instruction,...
Definition: Instruction.cpp:281
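A hedged sketch of copyFastMathFlags in use: when an FP operation is rewritten, the original instruction's fast-math flags are carried over to the replacement; the helper name is illustrative.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Operator.h"

using namespace llvm;

// Illustrative helper: build a new fmul and copy the fast-math flags of the
// instruction it replaces.
static Value *createFMulLike(IRBuilder<> &Builder, Instruction *Orig,
                             Value *A, Value *B) {
  Value *NewV = Builder.CreateFMul(A, B);
  if (auto *NewI = dyn_cast<Instruction>(NewV))
    NewI->copyFastMathFlags(cast<FPMathOperator>(Orig)->getFastMathFlags());
  return NewV;
}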
llvm::CastInst
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:429
llvm::BitsToFloat
float BitsToFloat(uint32_t Bits)
This function takes a 32-bit integer and returns the bit equivalent float.
Definition: MathExtras.h:577
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:174
llvm::Intrinsic::getDeclaration
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1481
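A hedged sketch of Intrinsic::getDeclaration: materialise a declaration of llvm.amdgcn.rcp overloaded on the operand's FP type and call it; the helper name is an assumption.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Illustrative helper: emit a call to llvm.amdgcn.rcp on Den.
static Value *emitRcpCall(IRBuilder<> &Builder, Module *M, Value *Den) {
  Function *Decl =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_rcp, Den->getType());
  return Builder.CreateCall(Decl, {Den});
}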
getMulHu
static Value * getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS)
Definition: AMDGPUCodeGenPrepare.cpp:878
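Only the signature of getMulHu is shown above; a hedged sketch of the general technique it suggests (widen to i64, multiply, keep the top 32 bits) follows. The real helper in this file may be structured differently.

#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Illustrative sketch: high 32 bits of an unsigned 32-bit multiply.
static Value *mulhu32Sketch(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
  Type *I64 = Builder.getInt64Ty();
  Value *Wide = Builder.CreateMul(Builder.CreateZExt(LHS, I64),
                                  Builder.CreateZExt(RHS, I64));
  return Builder.CreateTrunc(Builder.CreateLShr(Wide, 32),
                             Builder.getInt32Ty());
}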
runOnFunction
static bool runOnFunction(Function &F, bool PostInlining)
Definition: EntryExitInstrumenter.cpp:85
llvm::expandRemainderUpTo64Bits
bool expandRemainderUpTo64Bits(BinaryOperator *Rem)
Generate code to calculate the remainder of two integers, replacing Rem with the generated code.
Definition: IntegerDivision.cpp:505
llvm::AMDGPUISD::RCP
@ RCP
Definition: AMDGPUISelLowering.h:419
llvm::Constant::getNullValue
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:350
llvm::KnownBits
Definition: KnownBits.h:23
llvm::Type::isFloatTy
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
llvm::AnalysisUsage::setPreservesAll
void setPreservesAll()
Set by analyses that do not transform their input at all.
Definition: PassAnalysisSupport.h:130
llvm::FPMathOperator::getFastMathFlags
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition: Operator.h:285
findSelectThroughCast
static SelectInst * findSelectThroughCast(Value *V, CastInst *&Cast)
Definition: AMDGPUCodeGenPrepare.cpp:582
llvm::AMDGPU::SendMsg::Op
Op
Definition: SIDefines.h:348
llvm::ConstantFP::get
static Constant * get(Type *Ty, double V)
This returns a ConstantFP, or a vector containing a splat of a ConstantFP, for the specified value in...
Definition: Constants.cpp:926
llvm::Module::getContext
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:263
LowAndHigh
Metadata * LowAndHigh[]
Definition: NVVMIntrRange.cpp:68
llvm::Type::isDoubleTy
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:156
llvm::ARCCC::Z
@ Z
Definition: ARCInfo.h:41
llvm::MCID::Add
@ Add
Definition: MCInstrDesc.h:185
IntegerDivision.h
llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
promotedOpIsNSW
static bool promotedOpIsNSW(const Instruction &I)
Definition: AMDGPUCodeGenPrepare.cpp:283
llvm::createAMDGPUCodeGenPreparePass
FunctionPass * createAMDGPUCodeGenPreparePass()
Definition: AMDGPUCodeGenPrepare.cpp:1467
llvm::Instruction::BinaryOps
BinaryOps
Definition: Instruction.h:793
isSigned
static bool isSigned(unsigned int Opcode)
Definition: ExpandLargeDivRem.cpp:52
llvm::FPMathOperator::getFPAccuracy
float getFPAccuracy() const
Get the maximum error permitted by this operation in ULPs.
Definition: Instructions.cpp:3023
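A hedged example of getFPAccuracy: the accuracy requested by !fpmath metadata decides whether a cheaper, less precise lowering is acceptable; the threshold and helper name are illustrative.

#include "llvm/IR/Instruction.h"
#include "llvm/IR/Operator.h"

using namespace llvm;

// Illustrative helper: true when the operation tolerates at least 2.5 ULP of
// error; getFPAccuracy() returns 0.0 when no !fpmath metadata is attached.
static bool permitsFastLowering(const Instruction &I) {
  return cast<FPMathOperator>(&I)->getFPAccuracy() >= 2.5f;
}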
LegacyDivergenceAnalysis.h
WidenLoads
static cl::opt< bool > WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads", cl::desc("Widen sub-dword constant address space loads in " "AMDGPULateCodeGenPrepare"), cl::ReallyHidden, cl::init(true))
llvm::Instruction::getDebugLoc
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:359
Dominators.h
getSign32
static Value * getSign32(Value *V, IRBuilder<> &Builder, const DataLayout *DL)
Definition: AMDGPUCodeGenPrepare.cpp:1052
N
#define N
llvm::CastInst::getOpcode
Instruction::CastOps getOpcode() const
Return the opcode of this CastInst.
Definition: InstrTypes.h:676
llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:91
llvm::IntegerType::getBitWidth
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
Definition: DerivedTypes.h:72
llvm::SmallVectorImpl< Value * >
llvm::Module::getDataLayout
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:399
TM
const char LLVMTargetMachineRef TM
Definition: PassBuilderBindings.cpp:47
llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:308
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1474
BB
Common register allocation / spilling: a mul / str / ldr / sxth r3 / ldr / mla r4 sequence can become mul / mov / str / ldr / sxth r3 / mla r4, and the mul and mov can then be merged. It also increases the likelihood that the store may become dead (bb27). Successors according to LLVM: BB
Definition: README.txt:39
llvm::isKnownToBeAPowerOfTwo
bool isKnownToBeAPowerOfTwo(const Value *V, const DataLayout &DL, bool OrZero=false, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return true if the given value is known to have exactly one bit set when defined.
Definition: ValueTracking.cpp:302
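A hedged example of isKnownToBeAPowerOfTwo; the helper name is illustrative, and the strength reduction it would enable (division by a power of two becoming a shift) is not shown.

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Value.h"

using namespace llvm;

// Illustrative helper: is the divisor provably a power of two?
static bool divisorIsPowerOfTwo(const Value *Den, const DataLayout &DL) {
  return isKnownToBeAPowerOfTwo(Den, DL, /*OrZero=*/false);
}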
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::Value::takeName
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:381
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
llvm::cl::desc
Definition: CommandLine.h:413
Mod
Module * Mod
Definition: PassBuilderBindings.cpp:54
InitializePasses.h
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
AMDGPUTargetMachine.h
getMul24
static Value * getMul24(IRBuilder<> &Builder, Value *LHS, Value *RHS, unsigned Size, unsigned NumBits, bool IsSigned)
Definition: AMDGPUCodeGenPrepare.cpp:486
llvm::Function::iterator
BasicBlockListType::iterator iterator
Definition: Function.h:66
llvm::AMDGPUAS::CONSTANT_ADDRESS_32BIT
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
Definition: AMDGPU.h:379
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:39
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations", false, false) INITIALIZE_PASS_END(AMDGPUCodeGenPrepare
llvm::PoisonValue::get
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1732
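A closing hedged sketch of PoisonValue::get: poison is the conventional placeholder when a vector is rebuilt lane by lane; the helper name is an assumption.

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Illustrative helper: pack two scalars of the same type into a 2-element
// vector, starting from poison.
static Value *buildVec2(IRBuilder<> &Builder, Value *Lo, Value *Hi) {
  Value *Vec = PoisonValue::get(FixedVectorType::get(Lo->getType(), 2));
  Vec = Builder.CreateInsertElement(Vec, Lo, Builder.getInt32(0));
  return Builder.CreateInsertElement(Vec, Hi, Builder.getInt32(1));
}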