//===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI -----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "NVPTXTargetTransformInfo.h"
#include "NVPTXUtilities.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/Support/Debug.h"
using namespace llvm;

#define DEBUG_TYPE "NVPTXtti"

// Whether the given intrinsic reads threadIdx.x/y/z.
static bool readsThreadIndex(const IntrinsicInst *II) {
  switch (II->getIntrinsicID()) {
  default: return false;
  case Intrinsic::nvvm_read_ptx_sreg_tid_x:
  case Intrinsic::nvvm_read_ptx_sreg_tid_y:
  case Intrinsic::nvvm_read_ptx_sreg_tid_z:
    return true;
  }
}

static bool readsLaneId(const IntrinsicInst *II) {
  return II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_laneid;
}

// Whether the given intrinsic is an atomic instruction in PTX.
static bool isNVVMAtomic(const IntrinsicInst *II) {
  switch (II->getIntrinsicID()) {
  default: return false;
  case Intrinsic::nvvm_atomic_load_inc_32:
  case Intrinsic::nvvm_atomic_load_dec_32:

  case Intrinsic::nvvm_atomic_add_gen_f_cta:
  case Intrinsic::nvvm_atomic_add_gen_f_sys:
  case Intrinsic::nvvm_atomic_add_gen_i_cta:
  case Intrinsic::nvvm_atomic_add_gen_i_sys:
  case Intrinsic::nvvm_atomic_and_gen_i_cta:
  case Intrinsic::nvvm_atomic_and_gen_i_sys:
  case Intrinsic::nvvm_atomic_cas_gen_i_cta:
  case Intrinsic::nvvm_atomic_cas_gen_i_sys:
  case Intrinsic::nvvm_atomic_dec_gen_i_cta:
  case Intrinsic::nvvm_atomic_dec_gen_i_sys:
  case Intrinsic::nvvm_atomic_inc_gen_i_cta:
  case Intrinsic::nvvm_atomic_inc_gen_i_sys:
  case Intrinsic::nvvm_atomic_max_gen_i_cta:
  case Intrinsic::nvvm_atomic_max_gen_i_sys:
  case Intrinsic::nvvm_atomic_min_gen_i_cta:
  case Intrinsic::nvvm_atomic_min_gen_i_sys:
  case Intrinsic::nvvm_atomic_or_gen_i_cta:
  case Intrinsic::nvvm_atomic_or_gen_i_sys:
  case Intrinsic::nvvm_atomic_exch_gen_i_cta:
  case Intrinsic::nvvm_atomic_exch_gen_i_sys:
  case Intrinsic::nvvm_atomic_xor_gen_i_cta:
  case Intrinsic::nvvm_atomic_xor_gen_i_sys:
    return true;
  }
}

bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) {
  // Without inter-procedural analysis, we conservatively assume that arguments
  // to __device__ functions are divergent.
  if (const Argument *Arg = dyn_cast<Argument>(V))
    return !isKernelFunction(*Arg->getParent());

  if (const Instruction *I = dyn_cast<Instruction>(V)) {
    // Without pointer analysis, we conservatively assume values loaded from
    // generic or local address space are divergent.
    if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
      unsigned AS = LI->getPointerAddressSpace();
      return AS == ADDRESS_SPACE_GENERIC || AS == ADDRESS_SPACE_LOCAL;
    }
    // Atomic instructions may cause divergence. Atomic instructions are
    // executed sequentially across all threads in a warp. Therefore, an earlier
    // executed thread may see different memory inputs than a later executed
    // thread. For example, suppose *a = 0 initially.
    //
    //   atom.global.add.s32 d, [a], 1
    //
    // returns 0 for the first thread that enters the critical region, and 1 for
    // the second thread.
    if (I->isAtomic())
      return true;
    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
      // Instructions that read threadIdx are obviously divergent.
      if (readsThreadIndex(II) || readsLaneId(II))
        return true;
      // Handle the NVPTX atomic intrinsics that cannot be represented as an
      // atomic IR instruction.
      if (isNVVMAtomic(II))
        return true;
    }
    // Conservatively consider the return value of function calls as divergent.
    // We could analyze callees with bodies more precisely using
    // inter-procedural analysis.
    if (isa<CallInst>(I))
      return true;
  }

  return false;
}
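
// Illustrative note (added comment, not upstream code): under this definition,
// a load through a shared- or constant-space pointer is not itself reported as
// a source of divergence, e.g.
//   %v = load i32, i32 addrspace(3)* %p   ; shared memory
// whereas the same load through a generic or local pointer is conservatively
// reported divergent, as are atomics, thread-index reads, and call results.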

// Convert NVVM intrinsics to target-generic LLVM code where possible.
static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
  // Each NVVM intrinsic we can simplify can be replaced with one of:
  //
  //  * an LLVM intrinsic,
  //  * an LLVM cast operation,
  //  * an LLVM binary operation, or
  //  * ad-hoc LLVM IR for the particular operation.

  // Some transformations are only valid when the module's
  // flush-denormals-to-zero (ftz) setting is true/false, whereas other
  // transformations are valid regardless of the module's ftz setting.
  enum FtzRequirementTy {
    FTZ_Any,       // Any ftz setting is ok.
    FTZ_MustBeOn,  // Transformation is valid only if ftz is on.
    FTZ_MustBeOff, // Transformation is valid only if ftz is off.
  };
  // Classes of NVVM intrinsics that can't be replaced one-to-one with a
  // target-generic intrinsic, cast op, or binary op but that we can nonetheless
  // simplify.
  enum SpecialCase {
    SPC_Reciprocal,
  };

  // SimplifyAction is a poor-man's variant (plus an additional flag) that
  // represents how to replace an NVVM intrinsic with target-generic LLVM IR.
  struct SimplifyAction {
    // Invariant: At most one of these Optionals has a value.
    Optional<Intrinsic::ID> IID;
    Optional<Instruction::CastOps> CastOp;
    Optional<Instruction::BinaryOps> BinaryOp;
    Optional<SpecialCase> Special;

    FtzRequirementTy FtzRequirement = FTZ_Any;
    // Denormal handling is guarded by different attributes depending on the
    // type (denormal-fp-math vs denormal-fp-math-f32), take note of halfs.
    bool IsHalfTy = false;

    SimplifyAction() = default;

    SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq,
                   bool IsHalfTy = false)
        : IID(IID), FtzRequirement(FtzReq), IsHalfTy(IsHalfTy) {}

    // Cast operations don't have anything to do with FTZ, so we skip that
    // argument.
    SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {}

    SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq)
        : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {}

    SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq)
        : Special(Special), FtzRequirement(FtzReq) {}
  };

  // Try to generate a SimplifyAction describing how to replace our
  // IntrinsicInstr with target-generic LLVM IR.
  const SimplifyAction Action = [II]() -> SimplifyAction {
    switch (II->getIntrinsicID()) {
    // NVVM intrinsics that map directly to LLVM intrinsics.
    case Intrinsic::nvvm_ceil_d:
      return {Intrinsic::ceil, FTZ_Any};
    case Intrinsic::nvvm_ceil_f:
      return {Intrinsic::ceil, FTZ_MustBeOff};
    case Intrinsic::nvvm_ceil_ftz_f:
      return {Intrinsic::ceil, FTZ_MustBeOn};
    case Intrinsic::nvvm_fabs_d:
      return {Intrinsic::fabs, FTZ_Any};
    case Intrinsic::nvvm_fabs_f:
      return {Intrinsic::fabs, FTZ_MustBeOff};
    case Intrinsic::nvvm_fabs_ftz_f:
      return {Intrinsic::fabs, FTZ_MustBeOn};
    case Intrinsic::nvvm_floor_d:
      return {Intrinsic::floor, FTZ_Any};
    case Intrinsic::nvvm_floor_f:
      return {Intrinsic::floor, FTZ_MustBeOff};
    case Intrinsic::nvvm_floor_ftz_f:
      return {Intrinsic::floor, FTZ_MustBeOn};
    case Intrinsic::nvvm_fma_rn_d:
      return {Intrinsic::fma, FTZ_Any};
    case Intrinsic::nvvm_fma_rn_f:
      return {Intrinsic::fma, FTZ_MustBeOff};
    case Intrinsic::nvvm_fma_rn_ftz_f:
      return {Intrinsic::fma, FTZ_MustBeOn};
    case Intrinsic::nvvm_fma_rn_f16:
      return {Intrinsic::fma, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fma_rn_ftz_f16:
      return {Intrinsic::fma, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fma_rn_f16x2:
      return {Intrinsic::fma, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fma_rn_ftz_f16x2:
      return {Intrinsic::fma, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmax_d:
      return {Intrinsic::maxnum, FTZ_Any};
    case Intrinsic::nvvm_fmax_f:
      return {Intrinsic::maxnum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmax_ftz_f:
      return {Intrinsic::maxnum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmax_nan_f:
      return {Intrinsic::maximum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmax_ftz_nan_f:
      return {Intrinsic::maximum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmax_f16:
      return {Intrinsic::maxnum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_ftz_f16:
      return {Intrinsic::maxnum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmax_f16x2:
      return {Intrinsic::maxnum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_ftz_f16x2:
      return {Intrinsic::maxnum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmax_nan_f16:
      return {Intrinsic::maximum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_ftz_nan_f16:
      return {Intrinsic::maximum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmax_nan_f16x2:
      return {Intrinsic::maximum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmax_ftz_nan_f16x2:
      return {Intrinsic::maximum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmin_d:
      return {Intrinsic::minnum, FTZ_Any};
    case Intrinsic::nvvm_fmin_f:
      return {Intrinsic::minnum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmin_ftz_f:
      return {Intrinsic::minnum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmin_nan_f:
      return {Intrinsic::minimum, FTZ_MustBeOff};
    case Intrinsic::nvvm_fmin_ftz_nan_f:
      return {Intrinsic::minimum, FTZ_MustBeOn};
    case Intrinsic::nvvm_fmin_f16:
      return {Intrinsic::minnum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmin_ftz_f16:
      return {Intrinsic::minnum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmin_f16x2:
      return {Intrinsic::minnum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmin_ftz_f16x2:
      return {Intrinsic::minnum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmin_nan_f16:
      return {Intrinsic::minimum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmin_ftz_nan_f16:
      return {Intrinsic::minimum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_fmin_nan_f16x2:
      return {Intrinsic::minimum, FTZ_MustBeOff, true};
    case Intrinsic::nvvm_fmin_ftz_nan_f16x2:
      return {Intrinsic::minimum, FTZ_MustBeOn, true};
    case Intrinsic::nvvm_round_d:
      return {Intrinsic::round, FTZ_Any};
    case Intrinsic::nvvm_round_f:
      return {Intrinsic::round, FTZ_MustBeOff};
    case Intrinsic::nvvm_round_ftz_f:
      return {Intrinsic::round, FTZ_MustBeOn};
    case Intrinsic::nvvm_sqrt_rn_d:
      return {Intrinsic::sqrt, FTZ_Any};
    case Intrinsic::nvvm_sqrt_f:
      // nvvm_sqrt_f is a special case. For most intrinsics, foo_ftz_f is the
      // ftz version, and foo_f is the non-ftz version. But nvvm_sqrt_f adopts
      // the ftz-ness of the surrounding code. sqrt_rn_f and sqrt_rn_ftz_f are
      // the versions with explicit ftz-ness.
      return {Intrinsic::sqrt, FTZ_Any};
    case Intrinsic::nvvm_sqrt_rn_f:
      return {Intrinsic::sqrt, FTZ_MustBeOff};
    case Intrinsic::nvvm_sqrt_rn_ftz_f:
      return {Intrinsic::sqrt, FTZ_MustBeOn};
    case Intrinsic::nvvm_trunc_d:
      return {Intrinsic::trunc, FTZ_Any};
    case Intrinsic::nvvm_trunc_f:
      return {Intrinsic::trunc, FTZ_MustBeOff};
    case Intrinsic::nvvm_trunc_ftz_f:
      return {Intrinsic::trunc, FTZ_MustBeOn};
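
    // Illustrative example (comment added for exposition, not upstream code):
    // with the direct mapping above, a call such as
    //   %r = call float @llvm.nvvm.ceil.ftz.f(float %x)
    // becomes
    //   %r = call float @llvm.ceil.f32(float %x)
    // but only in functions whose f32 denormal mode actually flushes to zero
    // (FTZ_MustBeOn); otherwise the NVVM intrinsic is left untouched.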

    // NVVM intrinsics that map to LLVM cast operations.
    //
    // Note that llvm's target-generic conversion operators correspond to the rz
    // (round to zero) versions of the nvvm conversion intrinsics, even though
    // most everything else here uses the rn (round to nearest even) nvvm ops.
    case Intrinsic::nvvm_d2i_rz:
    case Intrinsic::nvvm_f2i_rz:
    case Intrinsic::nvvm_d2ll_rz:
    case Intrinsic::nvvm_f2ll_rz:
      return {Instruction::FPToSI};
    case Intrinsic::nvvm_d2ui_rz:
    case Intrinsic::nvvm_f2ui_rz:
    case Intrinsic::nvvm_d2ull_rz:
    case Intrinsic::nvvm_f2ull_rz:
      return {Instruction::FPToUI};
    case Intrinsic::nvvm_i2d_rz:
    case Intrinsic::nvvm_i2f_rz:
    case Intrinsic::nvvm_ll2d_rz:
    case Intrinsic::nvvm_ll2f_rz:
      return {Instruction::SIToFP};
    case Intrinsic::nvvm_ui2d_rz:
    case Intrinsic::nvvm_ui2f_rz:
    case Intrinsic::nvvm_ull2d_rz:
    case Intrinsic::nvvm_ull2f_rz:
      return {Instruction::UIToFP};
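
    // Illustrative example (added comment): the rz/cast correspondence means
    //   %i = call i32 @llvm.nvvm.f2i.rz(float %x)
    // can be rewritten as
    //   %i = fptosi float %x to i32
    // since LLVM's fptosi likewise rounds toward zero.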

    // NVVM intrinsics that map to LLVM binary ops.
    case Intrinsic::nvvm_add_rn_d:
      return {Instruction::FAdd, FTZ_Any};
    case Intrinsic::nvvm_add_rn_f:
      return {Instruction::FAdd, FTZ_MustBeOff};
    case Intrinsic::nvvm_add_rn_ftz_f:
      return {Instruction::FAdd, FTZ_MustBeOn};
    case Intrinsic::nvvm_mul_rn_d:
      return {Instruction::FMul, FTZ_Any};
    case Intrinsic::nvvm_mul_rn_f:
      return {Instruction::FMul, FTZ_MustBeOff};
    case Intrinsic::nvvm_mul_rn_ftz_f:
      return {Instruction::FMul, FTZ_MustBeOn};
    case Intrinsic::nvvm_div_rn_d:
      return {Instruction::FDiv, FTZ_Any};
    case Intrinsic::nvvm_div_rn_f:
      return {Instruction::FDiv, FTZ_MustBeOff};
    case Intrinsic::nvvm_div_rn_ftz_f:
      return {Instruction::FDiv, FTZ_MustBeOn};

    // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but
    // need special handling.
    //
    // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just
    // as well.
    case Intrinsic::nvvm_rcp_rn_d:
      return {SPC_Reciprocal, FTZ_Any};
    case Intrinsic::nvvm_rcp_rn_f:
      return {SPC_Reciprocal, FTZ_MustBeOff};
    case Intrinsic::nvvm_rcp_rn_ftz_f:
      return {SPC_Reciprocal, FTZ_MustBeOn};

    // We do not currently simplify intrinsics that give an approximate
    // answer. These include:
    //
    //   - nvvm_cos_approx_{f,ftz_f}
    //   - nvvm_ex2_approx_{d,f,ftz_f}
    //   - nvvm_lg2_approx_{d,f,ftz_f}
    //   - nvvm_sin_approx_{f,ftz_f}
    //   - nvvm_sqrt_approx_{f,ftz_f}
    //   - nvvm_rsqrt_approx_{d,f,ftz_f}
    //   - nvvm_div_approx_{ftz_d,ftz_f,f}
    //   - nvvm_rcp_approx_ftz_d
    //
    // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast"
    // means that fastmath is enabled in the intrinsic. Unfortunately only
    // binary operators (currently) have a fastmath bit in SelectionDAG, so
    // this information gets lost and we can't select on it.
    //
    // TODO: div and rcp are lowered to a binary op, so in theory we could
    // lower these to a "fast fdiv".

    default:
      return {};
    }
  }();

  // If Action.FtzRequirementTy is not satisfied by the module's ftz state, we
  // can bail out now. (Notice that in the case that IID is not an NVVM
  // intrinsic, we don't have to look up any module metadata, as
  // FtzRequirementTy will be FTZ_Any.)
  if (Action.FtzRequirement != FTZ_Any) {
    const char *AttrName =
        Action.IsHalfTy ? "denormal-fp-math" : "denormal-fp-math-f32";
    StringRef Attr =
        II->getFunction()->getFnAttribute(AttrName).getValueAsString();
    DenormalMode Mode = parseDenormalFPAttribute(Attr);
    bool FtzEnabled = Mode.Output != DenormalMode::IEEE;

    if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn))
      return nullptr;
  }
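
  // Illustrative note (added comment): with a function attribute such as
  // "denormal-fp-math-f32"="preserve-sign,preserve-sign", the parsed output
  // mode is not IEEE, so FtzEnabled is true and only FTZ_Any / FTZ_MustBeOn
  // actions survive the check above.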

  // Simplify to target-generic intrinsic.
  if (Action.IID) {
    SmallVector<Value *, 4> Args(II->args());
    // All the target-generic intrinsics currently of interest to us have one
    // type argument, equal to that of the nvvm intrinsic's argument.
    Type *Tys[] = {II->getArgOperand(0)->getType()};
    return CallInst::Create(
        Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args);
  }

  // Simplify to target-generic binary op.
  if (Action.BinaryOp)
    return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0),
                                  II->getArgOperand(1), II->getName());

  // Simplify to target-generic cast op.
  if (Action.CastOp)
    return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(),
                            II->getName());

  // All that's left are the special cases.
  if (!Action.Special)
    return nullptr;

  switch (*Action.Special) {
  case SPC_Reciprocal:
    // Simplify reciprocal.
    return BinaryOperator::Create(
        Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1),
        II->getArgOperand(0), II->getName());
  }
  llvm_unreachable("All SpecialCase enumerators should be handled in switch.");
}
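
// Illustrative end-to-end example (added comment): for the SPC_Reciprocal
// case above, a call like
//   %r = call double @llvm.nvvm.rcp.rn.d(double %x)
// is replaced with the equivalent IR idiom
//   %r = fdiv double 1.000000e+00, %x
// independent of the function's denormal mode, because rcp_rn_d is FTZ_Any.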

Optional<Instruction *>
NVPTXTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  if (Instruction *I = simplifyNvvmIntrinsic(&II, IC)) {
    return I;
  }
  return None;
}

InstructionCost NVPTXTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  switch (ISD) {
  default:
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
                                         Opd2Info, Opd1PropInfo, Opd2PropInfo);
  case ISD::ADD:
  case ISD::MUL:
  case ISD::XOR:
  case ISD::OR:
  case ISD::AND:
    // The machine code (SASS) simulates an i64 with two i32. Therefore, we
    // estimate that arithmetic operations on i64 are twice as expensive as
    // those on types that can fit into one machine register.
    if (LT.second.SimpleTy == MVT::i64)
      return 2 * LT.first;
    // Delegate other cases to the basic TTI.
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
                                         Opd2Info, Opd1PropInfo, Opd2PropInfo);
  }
}
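
// Illustrative cost example (added comment): for `add i64 %a, %b`, the type
// legalizes to MVT::i64 with LT.first == 1, so the function above returns a
// cost of 2, reflecting the two 32-bit SASS instructions needed to emulate a
// 64-bit integer op; an `add i32` instead falls through to the base TTI cost.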

void NVPTXTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP,
                                           OptimizationRemarkEmitter *ORE) {
  BaseT::getUnrollingPreferences(L, SE, UP, ORE);

  // Enable partial unrolling and runtime unrolling, but reduce the
  // threshold. This partially unrolls small loops which are often
  // unrolled by the PTX to SASS compiler and unrolling earlier can be
  // beneficial.
  UP.Partial = UP.Runtime = true;
  UP.PartialThreshold = UP.Threshold / 4;
}
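
// Illustrative note (added comment): e.g. if the base unrolling Threshold
// were 300, partial/runtime unrolling on NVPTX would use a threshold of 75.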

void NVPTXTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}