LLVM 12.0.0git
NVPTXTargetTransformInfo.cpp
//===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI -----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "NVPTXTargetTransformInfo.h"
#include "NVPTXUtilities.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/Support/Debug.h"
using namespace llvm;

#define DEBUG_TYPE "NVPTXtti"

// Whether the given intrinsic reads threadIdx.x/y/z.
static bool readsThreadIndex(const IntrinsicInst *II) {
  switch (II->getIntrinsicID()) {
  default: return false;
  case Intrinsic::nvvm_read_ptx_sreg_tid_x:
  case Intrinsic::nvvm_read_ptx_sreg_tid_y:
  case Intrinsic::nvvm_read_ptx_sreg_tid_z:
    return true;
  }
}

static bool readsLaneId(const IntrinsicInst *II) {
  return II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_laneid;
}

// Whether the given intrinsic is an atomic instruction in PTX.
static bool isNVVMAtomic(const IntrinsicInst *II) {
  switch (II->getIntrinsicID()) {
  default: return false;
  case Intrinsic::nvvm_atomic_load_inc_32:
  case Intrinsic::nvvm_atomic_load_dec_32:

  case Intrinsic::nvvm_atomic_add_gen_f_cta:
  case Intrinsic::nvvm_atomic_add_gen_f_sys:
  case Intrinsic::nvvm_atomic_add_gen_i_cta:
  case Intrinsic::nvvm_atomic_add_gen_i_sys:
  case Intrinsic::nvvm_atomic_and_gen_i_cta:
  case Intrinsic::nvvm_atomic_and_gen_i_sys:
  case Intrinsic::nvvm_atomic_cas_gen_i_cta:
  case Intrinsic::nvvm_atomic_cas_gen_i_sys:
  case Intrinsic::nvvm_atomic_dec_gen_i_cta:
  case Intrinsic::nvvm_atomic_dec_gen_i_sys:
  case Intrinsic::nvvm_atomic_inc_gen_i_cta:
  case Intrinsic::nvvm_atomic_inc_gen_i_sys:
  case Intrinsic::nvvm_atomic_max_gen_i_cta:
  case Intrinsic::nvvm_atomic_max_gen_i_sys:
  case Intrinsic::nvvm_atomic_min_gen_i_cta:
  case Intrinsic::nvvm_atomic_min_gen_i_sys:
  case Intrinsic::nvvm_atomic_or_gen_i_cta:
  case Intrinsic::nvvm_atomic_or_gen_i_sys:
  case Intrinsic::nvvm_atomic_exch_gen_i_cta:
  case Intrinsic::nvvm_atomic_exch_gen_i_sys:
  case Intrinsic::nvvm_atomic_xor_gen_i_cta:
  case Intrinsic::nvvm_atomic_xor_gen_i_sys:
    return true;
  }
}

bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) {
  // Without inter-procedural analysis, we conservatively assume that arguments
  // to __device__ functions are divergent.
  if (const Argument *Arg = dyn_cast<Argument>(V))
    return !isKernelFunction(*Arg->getParent());

  if (const Instruction *I = dyn_cast<Instruction>(V)) {
    // Without pointer analysis, we conservatively assume values loaded from
    // generic or local address space are divergent.
    if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
      unsigned AS = LI->getPointerAddressSpace();
      return AS == ADDRESS_SPACE_GENERIC || AS == ADDRESS_SPACE_LOCAL;
    }
    // Atomic instructions may cause divergence. Atomic instructions are
    // executed sequentially across all threads in a warp. Therefore, an
    // earlier executed thread may see different memory inputs than a later
    // executed thread. For example, suppose *a = 0 initially.
    //
    //   atom.global.add.s32 d, [a], 1
    //
    // returns 0 for the first thread that enters the critical region, and 1
    // for the second thread.
    if (I->isAtomic())
      return true;
    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
      // Instructions that read threadIdx are obviously divergent.
      if (readsThreadIndex(II) || readsLaneId(II))
        return true;
      // Handle the NVPTX atomic intrinsics that cannot be represented as an
      // atomic IR instruction.
      if (isNVVMAtomic(II))
        return true;
    }
    // Conservatively consider the return value of function calls as divergent.
    // We could analyze callees with bodies more precisely using
    // inter-procedural analysis.
    if (isa<CallInst>(I))
      return true;
  }

  return false;
}

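// A minimal usage sketch (not part of the original file; assumes a function
// `F` and a FunctionAnalysisManager `FAM` are in scope): IR passes normally
// reach this hook through the TargetTransformInfo wrapper rather than calling
// the NVPTXTTIImpl method directly.
//
//   const TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
//   for (Instruction &I : instructions(F))
//     if (TTI.isSourceOfDivergence(&I))
//       ; // divergence analyses treat I's result as thread-dependent
//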
int NVPTXTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  switch (ISD) {
  default:
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
                                         Opd2Info, Opd1PropInfo, Opd2PropInfo);
  case ISD::ADD:
  case ISD::MUL:
  case ISD::XOR:
  case ISD::OR:
  case ISD::AND:
    // The machine code (SASS) simulates an i64 with two i32. Therefore, we
    // estimate that arithmetic operations on i64 are twice as expensive as
    // those on types that can fit into one machine register.
    if (LT.second.SimpleTy == MVT::i64)
      return 2 * LT.first;
    // Delegate other cases to the basic TTI.
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
                                         Opd2Info, Opd1PropInfo, Opd2PropInfo);
  }
}

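// Worked example (a sketch, assuming the usual NVPTX type legalization): an
// i64 'add' legalizes to a single MVT::i64 with LT.first == 1, so the hook
// above returns 2 * 1 = 2, while an i32 'add' falls through to the base TTI
// cost. A <2 x i64> 'add' is split into two i64 operations (LT.first == 2),
// giving 2 * 2 = 4.
//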
void NVPTXTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::UnrollingPreferences &UP) {
  BaseT::getUnrollingPreferences(L, SE, UP);

  // Enable partial unrolling and runtime unrolling, but reduce the
  // threshold. This partially unrolls small loops which are often
  // unrolled by the PTX to SASS compiler and unrolling earlier can be
  // beneficial.
  UP.Partial = UP.Runtime = true;
  UP.PartialThreshold = UP.Threshold / 4;
}

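// For example, assuming the default unroll threshold of 300 (controlled by
// -unroll-threshold), PartialThreshold becomes 300 / 4 = 75, so only fairly
// small loop bodies are partially or runtime unrolled here before the PTX to
// SASS compiler sees them.
//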
void NVPTXTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}