Bug Summary

File: lib/Target/AMDGPU/SIISelLowering.cpp
Warning: line 8848, column 20
The result of the left shift is undefined due to shifting by '32', which is greater or equal to the width of type 'int'
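
The diagnostic above reports a classic shift-width bug: in C++, shifting a 32-bit operand by 32 or more is undefined behaviour, so a shift count that can reach 32 must be widened or special-cased. The snippet below is a minimal illustrative sketch of the pattern and a typical guard; the names maskNaive and maskSafe are hypothetical and are not the code the analyzer flagged at line 8848 (llvm::maskTrailingOnes from MathExtras.h is mentioned only as an existing helper with the same intent).

#include <cassert>
#include <cstdint>

// UB sketch: shifting a 32-bit value by 32 (or more) is undefined in C++,
// so a mask computed this way is only valid for Bits < 32.
uint32_t maskNaive(unsigned Bits) {
  return (1u << Bits) - 1;   // undefined when Bits >= 32
}

// Typical hedge: do the shift in 64 bits and special-case the full width,
// similar in spirit to llvm::maskTrailingOnes<uint32_t>(Bits).
uint32_t maskSafe(unsigned Bits) {
  assert(Bits <= 32 && "mask width out of range");
  return Bits == 32 ? ~0u : uint32_t((uint64_t(1) << Bits) - 1);
}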

Annotated Source Code

clang -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name SIISelLowering.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -mrelocation-model pic -pic-level 2 -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debugger-tuning=gdb -momit-leaf-frame-pointer -ffunction-sections -fdata-sections -resource-dir /usr/lib/llvm-8/lib/clang/8.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-8~svn345461/build-llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-8~svn345461/build-llvm/include -I /build/llvm-toolchain-snapshot-8~svn345461/include -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/x86_64-linux-gnu/c++/6.3.0 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/6.3.0/../../../../include/c++/6.3.0/backward -internal-isystem /usr/include/clang/8.0.0/include/ -internal-isystem /usr/local/include -internal-isystem /usr/lib/llvm-8/lib/clang/8.0.0/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-comment -std=c++11 -fdeprecated-macro -fdebug-compilation-dir /build/llvm-toolchain-snapshot-8~svn345461/build-llvm/lib/Target/AMDGPU -ferror-limit 19 -fmessage-length 0 -fvisibility-inlines-hidden -fobjc-runtime=gcc -fdiagnostics-show-option -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -o /tmp/scan-build-2018-10-27-211344-32123-1 -x c++ /build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp -faddrsig

/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp

1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10/// \file
11/// Custom DAG lowering for SI
12//
13//===----------------------------------------------------------------------===//
14
15#ifdef _MSC_VER
16// Provide M_PI.
17#define _USE_MATH_DEFINES
18#endif
19
20#include "SIISelLowering.h"
21#include "AMDGPU.h"
22#include "AMDGPUIntrinsicInfo.h"
23#include "AMDGPUSubtarget.h"
24#include "AMDGPUTargetMachine.h"
25#include "SIDefines.h"
26#include "SIInstrInfo.h"
27#include "SIMachineFunctionInfo.h"
28#include "SIRegisterInfo.h"
29#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
30#include "Utils/AMDGPUBaseInfo.h"
31#include "llvm/ADT/APFloat.h"
32#include "llvm/ADT/APInt.h"
33#include "llvm/ADT/ArrayRef.h"
34#include "llvm/ADT/BitVector.h"
35#include "llvm/ADT/SmallVector.h"
36#include "llvm/ADT/Statistic.h"
37#include "llvm/ADT/StringRef.h"
38#include "llvm/ADT/StringSwitch.h"
39#include "llvm/ADT/Twine.h"
40#include "llvm/CodeGen/Analysis.h"
41#include "llvm/CodeGen/CallingConvLower.h"
42#include "llvm/CodeGen/DAGCombine.h"
43#include "llvm/CodeGen/ISDOpcodes.h"
44#include "llvm/CodeGen/MachineBasicBlock.h"
45#include "llvm/CodeGen/MachineFrameInfo.h"
46#include "llvm/CodeGen/MachineFunction.h"
47#include "llvm/CodeGen/MachineInstr.h"
48#include "llvm/CodeGen/MachineInstrBuilder.h"
49#include "llvm/CodeGen/MachineMemOperand.h"
50#include "llvm/CodeGen/MachineModuleInfo.h"
51#include "llvm/CodeGen/MachineOperand.h"
52#include "llvm/CodeGen/MachineRegisterInfo.h"
53#include "llvm/CodeGen/SelectionDAG.h"
54#include "llvm/CodeGen/SelectionDAGNodes.h"
55#include "llvm/CodeGen/TargetCallingConv.h"
56#include "llvm/CodeGen/TargetRegisterInfo.h"
57#include "llvm/CodeGen/ValueTypes.h"
58#include "llvm/IR/Constants.h"
59#include "llvm/IR/DataLayout.h"
60#include "llvm/IR/DebugLoc.h"
61#include "llvm/IR/DerivedTypes.h"
62#include "llvm/IR/DiagnosticInfo.h"
63#include "llvm/IR/Function.h"
64#include "llvm/IR/GlobalValue.h"
65#include "llvm/IR/InstrTypes.h"
66#include "llvm/IR/Instruction.h"
67#include "llvm/IR/Instructions.h"
68#include "llvm/IR/IntrinsicInst.h"
69#include "llvm/IR/Type.h"
70#include "llvm/Support/Casting.h"
71#include "llvm/Support/CodeGen.h"
72#include "llvm/Support/CommandLine.h"
73#include "llvm/Support/Compiler.h"
74#include "llvm/Support/ErrorHandling.h"
75#include "llvm/Support/KnownBits.h"
76#include "llvm/Support/MachineValueType.h"
77#include "llvm/Support/MathExtras.h"
78#include "llvm/Target/TargetOptions.h"
79#include <cassert>
80#include <cmath>
81#include <cstdint>
82#include <iterator>
83#include <tuple>
84#include <utility>
85#include <vector>
86
87using namespace llvm;
88
89#define DEBUG_TYPE "si-lower"
90
91STATISTIC(NumTailCalls, "Number of tail calls");
92
93static cl::opt<bool> EnableVGPRIndexMode(
94 "amdgpu-vgpr-index-mode",
95 cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
96 cl::init(false));
97
98static cl::opt<unsigned> AssumeFrameIndexHighZeroBits(
99 "amdgpu-frame-index-zero-bits",
100 cl::desc("High bits of frame index assumed to be zero"),
101 cl::init(5),
102 cl::ReallyHidden);
103
104static unsigned findFirstFreeSGPR(CCState &CCInfo) {
105 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
106 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
107 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
108 return AMDGPU::SGPR0 + Reg;
109 }
110 }
111 llvm_unreachable("Cannot allocate sgpr");
112}
113
114SITargetLowering::SITargetLowering(const TargetMachine &TM,
115 const GCNSubtarget &STI)
116 : AMDGPUTargetLowering(TM, STI),
117 Subtarget(&STI) {
118 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
119 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
120
121 addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
122 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
123
124 addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
125 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
126 addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
127
128 addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
129 addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
130
131 addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
132 addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
133
134 addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
135 addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
136
137 addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
138 addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
139
140 if (Subtarget->has16BitInsts()) {
141 addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
142 addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
143
144 // Unless there are also VOP3P operations, not operations are really legal.
145 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
146 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
147 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
148 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
149 }
150
151 computeRegisterProperties(Subtarget->getRegisterInfo());
152
153 // We need to custom lower vector stores from local memory
154 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
155 setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
156 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
157 setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
158 setOperationAction(ISD::LOAD, MVT::i1, Custom);
159 setOperationAction(ISD::LOAD, MVT::v32i32, Custom);
160
161 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
162 setOperationAction(ISD::STORE, MVT::v4i32, Custom);
163 setOperationAction(ISD::STORE, MVT::v8i32, Custom);
164 setOperationAction(ISD::STORE, MVT::v16i32, Custom);
165 setOperationAction(ISD::STORE, MVT::i1, Custom);
166 setOperationAction(ISD::STORE, MVT::v32i32, Custom);
167
168 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
169 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
170 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
171 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
172 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
173 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
174 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
175 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
176 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
177 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
178
179 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
180 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
181
182 setOperationAction(ISD::SELECT, MVT::i1, Promote);
183 setOperationAction(ISD::SELECT, MVT::i64, Custom);
184 setOperationAction(ISD::SELECT, MVT::f64, Promote);
185 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
186
187 setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
188 setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
189 setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
190 setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
191 setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
192
193 setOperationAction(ISD::SETCC, MVT::i1, Promote);
194 setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
195 setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
196 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
197
198 setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
199 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
200
201 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
202 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
203 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
204 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
205 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
206 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
207 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
208
209 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
210 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
211 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
212 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
213 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom);
214 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
215 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
216
217 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
218 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
219 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
220
221 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
222 setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
223 setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
224 setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
225
226 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
227 setOperationAction(ISD::BR_CC, MVT::i1, Expand);
228 setOperationAction(ISD::BR_CC, MVT::i32, Expand);
229 setOperationAction(ISD::BR_CC, MVT::i64, Expand);
230 setOperationAction(ISD::BR_CC, MVT::f32, Expand);
231 setOperationAction(ISD::BR_CC, MVT::f64, Expand);
232
233 setOperationAction(ISD::UADDO, MVT::i32, Legal);
234 setOperationAction(ISD::USUBO, MVT::i32, Legal);
235
236 setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
237 setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);
238
239 setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
240 setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
241 setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
242
243#if 0
244 setOperationAction(ISD::ADDCARRY, MVT::i64, Legal);
245 setOperationAction(ISD::SUBCARRY, MVT::i64, Legal);
246#endif
247
248 // We only support LOAD/STORE and vector manipulation ops for vectors
249 // with > 4 elements.
250 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
251 MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v32i32 }) {
252 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
253 switch (Op) {
254 case ISD::LOAD:
255 case ISD::STORE:
256 case ISD::BUILD_VECTOR:
257 case ISD::BITCAST:
258 case ISD::EXTRACT_VECTOR_ELT:
259 case ISD::INSERT_VECTOR_ELT:
260 case ISD::INSERT_SUBVECTOR:
261 case ISD::EXTRACT_SUBVECTOR:
262 case ISD::SCALAR_TO_VECTOR:
263 break;
264 case ISD::CONCAT_VECTORS:
265 setOperationAction(Op, VT, Custom);
266 break;
267 default:
268 setOperationAction(Op, VT, Expand);
269 break;
270 }
271 }
272 }
273
274 setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
275
276 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
277 // is expanded to avoid having two separate loops in case the index is a VGPR.
278
279 // Most operations are naturally 32-bit vector operations. We only support
280 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
281 for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
282 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
283 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
284
285 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
286 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
287
288 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
289 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
290
291 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
292 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
293 }
294
295 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
296 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
297 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
298 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
299
300 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom);
301 setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
302
303 // Avoid stack access for these.
304 // TODO: Generalize to more vector types.
305 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
306 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
307 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
308 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
309
310 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
311 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
312 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom);
313 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
314 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom);
315
316 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom);
317 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
318 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom);
319
320 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom);
321 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom);
322 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
323 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
324
325 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
326 // and output demarshalling
327 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
328 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
329
330 // We can't return success/failure, only the old value,
331 // let LLVM add the comparison
332 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
333 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
334
335 if (Subtarget->hasFlatAddressSpace()) {
336 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
337 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
338 }
339
340 setOperationAction(ISD::BSWAP, MVT::i32, Legal);
341 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
342
343 // On SI this is s_memtime and s_memrealtime on VI.
344 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
345 setOperationAction(ISD::TRAP, MVT::Other, Custom);
346 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);
347
348 if (Subtarget->has16BitInsts()) {
349 setOperationAction(ISD::FLOG, MVT::f16, Custom);
350 setOperationAction(ISD::FEXP, MVT::f16, Custom);
351 setOperationAction(ISD::FLOG10, MVT::f16, Custom);
352 }
353
354 // v_mad_f32 does not support denormals according to some sources.
355 if (!Subtarget->hasFP32Denormals())
356 setOperationAction(ISD::FMAD, MVT::f32, Legal);
357
358 if (!Subtarget->hasBFI()) {
359 // fcopysign can be done in a single instruction with BFI.
360 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
361 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
362 }
363
364 if (!Subtarget->hasBCNT(32))
365 setOperationAction(ISD::CTPOP, MVT::i32, Expand);
366
367 if (!Subtarget->hasBCNT(64))
368 setOperationAction(ISD::CTPOP, MVT::i64, Expand);
369
370 if (Subtarget->hasFFBH())
371 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
372
373 if (Subtarget->hasFFBL())
374 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
375
376 // We only really have 32-bit BFE instructions (and 16-bit on VI).
377 //
378 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
379 // effort to match them now. We want this to be false for i64 cases when the
380 // extraction isn't restricted to the upper or lower half. Ideally we would
381 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
382 // span the midpoint are probably relatively rare, so don't worry about them
383 // for now.
384 if (Subtarget->hasBFE())
385 setHasExtractBitsInsn(true);
386
387 setOperationAction(ISD::FMINNUM, MVT::f32, Custom);
388 setOperationAction(ISD::FMAXNUM, MVT::f32, Custom);
389 setOperationAction(ISD::FMINNUM, MVT::f64, Custom);
390 setOperationAction(ISD::FMAXNUM, MVT::f64, Custom);
391
392
393 // These are really only legal for ieee_mode functions. We should be avoiding
394 // them for functions that don't have ieee_mode enabled, so just say they are
395 // legal.
396 setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
397 setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
398 setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
399 setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
400
401
402 if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
403 setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
404 setOperationAction(ISD::FCEIL, MVT::f64, Legal);
405 setOperationAction(ISD::FRINT, MVT::f64, Legal);
406 } else {
407 setOperationAction(ISD::FCEIL, MVT::f64, Custom);
408 setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
409 setOperationAction(ISD::FRINT, MVT::f64, Custom);
410 setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
411 }
412
413 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
414
415 setOperationAction(ISD::FSIN, MVT::f32, Custom);
416 setOperationAction(ISD::FCOS, MVT::f32, Custom);
417 setOperationAction(ISD::FDIV, MVT::f32, Custom);
418 setOperationAction(ISD::FDIV, MVT::f64, Custom);
419
420 if (Subtarget->has16BitInsts()) {
421 setOperationAction(ISD::Constant, MVT::i16, Legal);
422
423 setOperationAction(ISD::SMIN, MVT::i16, Legal);
424 setOperationAction(ISD::SMAX, MVT::i16, Legal);
425
426 setOperationAction(ISD::UMIN, MVT::i16, Legal);
427 setOperationAction(ISD::UMAX, MVT::i16, Legal);
428
429 setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
430 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
431
432 setOperationAction(ISD::ROTR, MVT::i16, Promote);
433 setOperationAction(ISD::ROTL, MVT::i16, Promote);
434
435 setOperationAction(ISD::SDIV, MVT::i16, Promote);
436 setOperationAction(ISD::UDIV, MVT::i16, Promote);
437 setOperationAction(ISD::SREM, MVT::i16, Promote);
438 setOperationAction(ISD::UREM, MVT::i16, Promote);
439
440 setOperationAction(ISD::BSWAP, MVT::i16, Promote);
441 setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);
442
443 setOperationAction(ISD::CTTZ, MVT::i16, Promote);
444 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
445 setOperationAction(ISD::CTLZ, MVT::i16, Promote);
446 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
447 setOperationAction(ISD::CTPOP, MVT::i16, Promote);
448
449 setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
450
451 setOperationAction(ISD::BR_CC, MVT::i16, Expand);
452
453 setOperationAction(ISD::LOAD, MVT::i16, Custom);
454
455 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
456
457 setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
458 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
459 setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
460 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
461
462 setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
463 setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
464 setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
465 setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
466
467 // F16 - Constant Actions.
468 setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
469
470 // F16 - Load/Store Actions.
471 setOperationAction(ISD::LOAD, MVT::f16, Promote);
472 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
473 setOperationAction(ISD::STORE, MVT::f16, Promote);
474 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
475
476 // F16 - VOP1 Actions.
477 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
478 setOperationAction(ISD::FCOS, MVT::f16, Promote);
479 setOperationAction(ISD::FSIN, MVT::f16, Promote);
480 setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
481 setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
482 setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
483 setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
484 setOperationAction(ISD::FROUND, MVT::f16, Custom);
485
486 // F16 - VOP2 Actions.
487 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
488 setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
489
490 setOperationAction(ISD::FDIV, MVT::f16, Custom);
491
492 // F16 - VOP3 Actions.
493 setOperationAction(ISD::FMA, MVT::f16, Legal);
494 if (!Subtarget->hasFP16Denormals())
495 setOperationAction(ISD::FMAD, MVT::f16, Legal);
496
497 for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
498 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
499 switch (Op) {
500 case ISD::LOAD:
501 case ISD::STORE:
502 case ISD::BUILD_VECTOR:
503 case ISD::BITCAST:
504 case ISD::EXTRACT_VECTOR_ELT:
505 case ISD::INSERT_VECTOR_ELT:
506 case ISD::INSERT_SUBVECTOR:
507 case ISD::EXTRACT_SUBVECTOR:
508 case ISD::SCALAR_TO_VECTOR:
509 break;
510 case ISD::CONCAT_VECTORS:
511 setOperationAction(Op, VT, Custom);
512 break;
513 default:
514 setOperationAction(Op, VT, Expand);
515 break;
516 }
517 }
518 }
519
520 // XXX - Do these do anything? Vector constants turn into build_vector.
521 setOperationAction(ISD::Constant, MVT::v2i16, Legal);
522 setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);
523
524 setOperationAction(ISD::UNDEF, MVT::v2i16, Legal);
525 setOperationAction(ISD::UNDEF, MVT::v2f16, Legal);
526
527 setOperationAction(ISD::STORE, MVT::v2i16, Promote);
528 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
529 setOperationAction(ISD::STORE, MVT::v2f16, Promote);
530 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
531
532 setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
533 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
534 setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
535 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
536
537 setOperationAction(ISD::AND, MVT::v2i16, Promote);
538 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
539 setOperationAction(ISD::OR, MVT::v2i16, Promote);
540 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
541 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
542 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
543
544 setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
545 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
546 setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
547 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
548
549 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
550 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
551 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
552 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
553
554 setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
555 setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
556 setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
557 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
558
559 setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand);
560 setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand);
561 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand);
562
563 if (!Subtarget->hasVOP3PInsts()) {
564 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
565 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
566 }
567
568 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
569 // This isn't really legal, but this avoids the legalizer unrolling it (and
570 // allows matching fneg (fabs x) patterns)
571 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
572
573 setOperationAction(ISD::FMAXNUM, MVT::f16, Custom);
574 setOperationAction(ISD::FMINNUM, MVT::f16, Custom);
575 setOperationAction(ISD::FMAXNUM_IEEE, MVT::f16, Legal);
576 setOperationAction(ISD::FMINNUM_IEEE, MVT::f16, Legal);
577
578 setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom);
579 setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom);
580
581 setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand);
582 setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand);
583 }
584
585 if (Subtarget->hasVOP3PInsts()) {
586 setOperationAction(ISD::ADD, MVT::v2i16, Legal);
587 setOperationAction(ISD::SUB, MVT::v2i16, Legal);
588 setOperationAction(ISD::MUL, MVT::v2i16, Legal);
589 setOperationAction(ISD::SHL, MVT::v2i16, Legal);
590 setOperationAction(ISD::SRL, MVT::v2i16, Legal);
591 setOperationAction(ISD::SRA, MVT::v2i16, Legal);
592 setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
593 setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
594 setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
595 setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
596
597 setOperationAction(ISD::FADD, MVT::v2f16, Legal);
598 setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
599 setOperationAction(ISD::FMA, MVT::v2f16, Legal);
600
601 setOperationAction(ISD::FMINNUM_IEEE, MVT::v2f16, Legal);
602 setOperationAction(ISD::FMAXNUM_IEEE, MVT::v2f16, Legal);
603
604 setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);
605
606 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
607 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
608
609 setOperationAction(ISD::SHL, MVT::v4i16, Custom);
610 setOperationAction(ISD::SRA, MVT::v4i16, Custom);
611 setOperationAction(ISD::SRL, MVT::v4i16, Custom);
612 setOperationAction(ISD::ADD, MVT::v4i16, Custom);
613 setOperationAction(ISD::SUB, MVT::v4i16, Custom);
614 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
615
616 setOperationAction(ISD::SMIN, MVT::v4i16, Custom);
617 setOperationAction(ISD::SMAX, MVT::v4i16, Custom);
618 setOperationAction(ISD::UMIN, MVT::v4i16, Custom);
619 setOperationAction(ISD::UMAX, MVT::v4i16, Custom);
620
621 setOperationAction(ISD::FADD, MVT::v4f16, Custom);
622 setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
623
624 setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);
625 setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);
626
627 setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
628 setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
629 setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom);
630
631 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
632 setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
633 setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
634 }
635
636 setOperationAction(ISD::FNEG, MVT::v4f16, Custom);
637 setOperationAction(ISD::FABS, MVT::v4f16, Custom);
638
639 if (Subtarget->has16BitInsts()) {
640 setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
641 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
642 setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
643 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
644 } else {
645 // Legalization hack.
646 setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
647 setOperationAction(ISD::SELECT, MVT::v2f16, Custom);
648
649 setOperationAction(ISD::FNEG, MVT::v2f16, Custom);
650 setOperationAction(ISD::FABS, MVT::v2f16, Custom);
651 }
652
653 for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
654 setOperationAction(ISD::SELECT, VT, Custom);
655 }
656
657 setTargetDAGCombine(ISD::ADD);
658 setTargetDAGCombine(ISD::ADDCARRY);
659 setTargetDAGCombine(ISD::SUB);
660 setTargetDAGCombine(ISD::SUBCARRY);
661 setTargetDAGCombine(ISD::FADD);
662 setTargetDAGCombine(ISD::FSUB);
663 setTargetDAGCombine(ISD::FMINNUM);
664 setTargetDAGCombine(ISD::FMAXNUM);
665 setTargetDAGCombine(ISD::FMINNUM_IEEE);
666 setTargetDAGCombine(ISD::FMAXNUM_IEEE);
667 setTargetDAGCombine(ISD::FMA);
668 setTargetDAGCombine(ISD::SMIN);
669 setTargetDAGCombine(ISD::SMAX);
670 setTargetDAGCombine(ISD::UMIN);
671 setTargetDAGCombine(ISD::UMAX);
672 setTargetDAGCombine(ISD::SETCC);
673 setTargetDAGCombine(ISD::AND);
674 setTargetDAGCombine(ISD::OR);
675 setTargetDAGCombine(ISD::XOR);
676 setTargetDAGCombine(ISD::SINT_TO_FP);
677 setTargetDAGCombine(ISD::UINT_TO_FP);
678 setTargetDAGCombine(ISD::FCANONICALIZE);
679 setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
680 setTargetDAGCombine(ISD::ZERO_EXTEND);
681 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
682 setTargetDAGCombine(ISD::BUILD_VECTOR);
683
684 // All memory operations. Some folding on the pointer operand is done to help
685 // matching the constant offsets in the addressing modes.
686 setTargetDAGCombine(ISD::LOAD);
687 setTargetDAGCombine(ISD::STORE);
688 setTargetDAGCombine(ISD::ATOMIC_LOAD);
689 setTargetDAGCombine(ISD::ATOMIC_STORE);
690 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
691 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
692 setTargetDAGCombine(ISD::ATOMIC_SWAP);
693 setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
694 setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
695 setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
696 setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
697 setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
698 setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
699 setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
700 setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
701 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
702 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
703
704 setSchedulingPreference(Sched::RegPressure);
705
706 // SI at least has hardware support for floating point exceptions, but no way
707 // of using or handling them is implemented. They are also optional in OpenCL
708 // (Section 7.3)
709 setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
710}
711
712const GCNSubtarget *SITargetLowering::getSubtarget() const {
713 return Subtarget;
714}
715
716//===----------------------------------------------------------------------===//
717// TargetLowering queries
718//===----------------------------------------------------------------------===//
719
720// v_mad_mix* support a conversion from f16 to f32.
721//
722// There is only one special case when denormals are enabled we don't currently,
723// where this is OK to use.
724bool SITargetLowering::isFPExtFoldable(unsigned Opcode,
725 EVT DestVT, EVT SrcVT) const {
726 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
727 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
728 DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
729 SrcVT.getScalarType() == MVT::f16;
730}
731
732bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
733 // SI has some legal vector types, but no legal vector operations. Say no
734 // shuffles are legal in order to prefer scalarizing some vector operations.
735 return false;
736}
737
738MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
739 CallingConv::ID CC,
740 EVT VT) const {
741 // TODO: Consider splitting all arguments into 32-bit pieces.
742 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
743 EVT ScalarVT = VT.getScalarType();
744 unsigned Size = ScalarVT.getSizeInBits();
745 if (Size == 32)
746 return ScalarVT.getSimpleVT();
747
748 if (Size == 64)
749 return MVT::i32;
750
751 if (Size == 16 && Subtarget->has16BitInsts())
752 return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
753 }
754
755 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
756}
757
758unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
759 CallingConv::ID CC,
760 EVT VT) const {
761 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
762 unsigned NumElts = VT.getVectorNumElements();
763 EVT ScalarVT = VT.getScalarType();
764 unsigned Size = ScalarVT.getSizeInBits();
765
766 if (Size == 32)
767 return NumElts;
768
769 if (Size == 64)
770 return 2 * NumElts;
771
772 if (Size == 16 && Subtarget->has16BitInsts())
773 return (VT.getVectorNumElements() + 1) / 2;
774 }
775
776 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
777}
778
779unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
780 LLVMContext &Context, CallingConv::ID CC,
781 EVT VT, EVT &IntermediateVT,
782 unsigned &NumIntermediates, MVT &RegisterVT) const {
783 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
784 unsigned NumElts = VT.getVectorNumElements();
785 EVT ScalarVT = VT.getScalarType();
786 unsigned Size = ScalarVT.getSizeInBits();
787 if (Size == 32) {
788 RegisterVT = ScalarVT.getSimpleVT();
789 IntermediateVT = RegisterVT;
790 NumIntermediates = NumElts;
791 return NumIntermediates;
792 }
793
794 if (Size == 64) {
795 RegisterVT = MVT::i32;
796 IntermediateVT = RegisterVT;
797 NumIntermediates = 2 * NumElts;
798 return NumIntermediates;
799 }
800
801 // FIXME: We should fix the ABI to be the same on targets without 16-bit
802 // support, but unless we can properly handle 3-vectors, it will be still be
803 // inconsistent.
804 if (Size == 16 && Subtarget->has16BitInsts()) {
805 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
806 IntermediateVT = RegisterVT;
807 NumIntermediates = (NumElts + 1) / 2;
808 return NumIntermediates;
809 }
810 }
811
812 return TargetLowering::getVectorTypeBreakdownForCallingConv(
813 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
814}
815
816bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
817 const CallInst &CI,
818 MachineFunction &MF,
819 unsigned IntrID) const {
820 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
821 AMDGPU::lookupRsrcIntrinsic(IntrID)) {
822 AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
823 (Intrinsic::ID)IntrID);
824 if (Attr.hasFnAttribute(Attribute::ReadNone))
825 return false;
826
827 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
828
829 if (RsrcIntr->IsImage) {
830 Info.ptrVal = MFI->getImagePSV(
831 *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
832 CI.getArgOperand(RsrcIntr->RsrcArg));
833 Info.align = 0;
834 } else {
835 Info.ptrVal = MFI->getBufferPSV(
836 *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
837 CI.getArgOperand(RsrcIntr->RsrcArg));
838 }
839
840 Info.flags = MachineMemOperand::MODereferenceable;
841 if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
842 Info.opc = ISD::INTRINSIC_W_CHAIN;
843 Info.memVT = MVT::getVT(CI.getType());
844 Info.flags |= MachineMemOperand::MOLoad;
845 } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
846 Info.opc = ISD::INTRINSIC_VOID;
847 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
848 Info.flags |= MachineMemOperand::MOStore;
849 } else {
850 // Atomic
851 Info.opc = ISD::INTRINSIC_W_CHAIN;
852 Info.memVT = MVT::getVT(CI.getType());
853 Info.flags = MachineMemOperand::MOLoad |
854 MachineMemOperand::MOStore |
855 MachineMemOperand::MODereferenceable;
856
857 // XXX - Should this be volatile without known ordering?
858 Info.flags |= MachineMemOperand::MOVolatile;
859 }
860 return true;
861 }
862
863 switch (IntrID) {
864 case Intrinsic::amdgcn_atomic_inc:
865 case Intrinsic::amdgcn_atomic_dec:
866 case Intrinsic::amdgcn_ds_fadd:
867 case Intrinsic::amdgcn_ds_fmin:
868 case Intrinsic::amdgcn_ds_fmax: {
869 Info.opc = ISD::INTRINSIC_W_CHAIN;
870 Info.memVT = MVT::getVT(CI.getType());
871 Info.ptrVal = CI.getOperand(0);
872 Info.align = 0;
873 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
874
875 const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
876 if (!Vol || !Vol->isZero())
877 Info.flags |= MachineMemOperand::MOVolatile;
878
879 return true;
880 }
881
882 default:
883 return false;
884 }
885}
886
887bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
888 SmallVectorImpl<Value*> &Ops,
889 Type *&AccessTy) const {
890 switch (II->getIntrinsicID()) {
891 case Intrinsic::amdgcn_atomic_inc:
892 case Intrinsic::amdgcn_atomic_dec:
893 case Intrinsic::amdgcn_ds_fadd:
894 case Intrinsic::amdgcn_ds_fmin:
895 case Intrinsic::amdgcn_ds_fmax: {
896 Value *Ptr = II->getArgOperand(0);
897 AccessTy = II->getType();
898 Ops.push_back(Ptr);
899 return true;
900 }
901 default:
902 return false;
903 }
904}
905
906bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
907 if (!Subtarget->hasFlatInstOffsets()) {
908 // Flat instructions do not have offsets, and only have the register
909 // address.
910 return AM.BaseOffs == 0 && AM.Scale == 0;
911 }
912
913 // GFX9 added a 13-bit signed offset. When using regular flat instructions,
914 // the sign bit is ignored and is treated as a 12-bit unsigned offset.
915
916 // Just r + i
917 return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
918}
919
920bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
921 if (Subtarget->hasFlatGlobalInsts())
922 return isInt<13>(AM.BaseOffs) && AM.Scale == 0;
923
924 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
925 // Assume the we will use FLAT for all global memory accesses
926 // on VI.
927 // FIXME: This assumption is currently wrong. On VI we still use
928 // MUBUF instructions for the r + i addressing mode. As currently
929 // implemented, the MUBUF instructions only work on buffer < 4GB.
930 // It may be possible to support > 4GB buffers with MUBUF instructions,
931 // by setting the stride value in the resource descriptor which would
932 // increase the size limit to (stride * 4GB). However, this is risky,
933 // because it has never been validated.
934 return isLegalFlatAddressingMode(AM);
935 }
936
937 return isLegalMUBUFAddressingMode(AM);
938}
939
940bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
941 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
942 // additionally can do r + r + i with addr64. 32-bit has more addressing
943 // mode options. Depending on the resource constant, it can also do
944 // (i64 r0) + (i32 r1) * (i14 i).
945 //
946 // Private arrays end up using a scratch buffer most of the time, so also
947 // assume those use MUBUF instructions. Scratch loads / stores are currently
948 // implemented as mubuf instructions with offen bit set, so slightly
949 // different than the normal addr64.
950 if (!isUInt<12>(AM.BaseOffs))
951 return false;
952
953 // FIXME: Since we can split immediate into soffset and immediate offset,
954 // would it make sense to allow any immediate?
955
956 switch (AM.Scale) {
957 case 0: // r + i or just i, depending on HasBaseReg.
958 return true;
959 case 1:
960 return true; // We have r + r or r + i.
961 case 2:
962 if (AM.HasBaseReg) {
963 // Reject 2 * r + r.
964 return false;
965 }
966
967 // Allow 2 * r as r + r
968 // Or 2 * r + i is allowed as r + r + i.
969 return true;
970 default: // Don't allow n * r
971 return false;
972 }
973}
974
975bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
976 const AddrMode &AM, Type *Ty,
977 unsigned AS, Instruction *I) const {
978 // No global is ever allowed as a base.
979 if (AM.BaseGV)
980 return false;
981
982 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
983 return isLegalGlobalAddressingMode(AM);
984
985 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
986 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
987 // If the offset isn't a multiple of 4, it probably isn't going to be
988 // correctly aligned.
989 // FIXME: Can we get the real alignment here?
990 if (AM.BaseOffs % 4 != 0)
991 return isLegalMUBUFAddressingMode(AM);
992
993 // There are no SMRD extloads, so if we have to do a small type access we
994 // will use a MUBUF load.
995 // FIXME?: We also need to do this if unaligned, but we don't know the
996 // alignment here.
997 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
998 return isLegalGlobalAddressingMode(AM);
999
1000 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1001 // SMRD instructions have an 8-bit, dword offset on SI.
1002 if (!isUInt<8>(AM.BaseOffs / 4))
1003 return false;
1004 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1005 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1006 // in 8-bits, it can use a smaller encoding.
1007 if (!isUInt<32>(AM.BaseOffs / 4))
1008 return false;
1009 } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
1010 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1011 if (!isUInt<20>(AM.BaseOffs))
1012 return false;
1013 } else
1014 llvm_unreachable("unhandled generation");
1015
1016 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1017 return true;
1018
1019 if (AM.Scale == 1 && AM.HasBaseReg)
1020 return true;
1021
1022 return false;
1023
1024 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1025 return isLegalMUBUFAddressingMode(AM);
1026 } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1027 AS == AMDGPUAS::REGION_ADDRESS) {
1028 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1029 // field.
1030 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1031 // an 8-bit dword offset but we don't know the alignment here.
1032 if (!isUInt<16>(AM.BaseOffs))
1033 return false;
1034
1035 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1036 return true;
1037
1038 if (AM.Scale == 1 && AM.HasBaseReg)
1039 return true;
1040
1041 return false;
1042 } else if (AS == AMDGPUAS::FLAT_ADDRESS ||
1043 AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
1044 // For an unknown address space, this usually means that this is for some
1045 // reason being used for pure arithmetic, and not based on some addressing
1046 // computation. We don't have instructions that compute pointers with any
1047 // addressing modes, so treat them as having no offset like flat
1048 // instructions.
1049 return isLegalFlatAddressingMode(AM);
1050 } else {
1051 llvm_unreachable("unhandled address space");
1052 }
1053}
1054
1055bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1056 const SelectionDAG &DAG) const {
1057 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
1058 return (MemVT.getSizeInBits() <= 4 * 32);
1059 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1060 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1061 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1062 } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
1063 return (MemVT.getSizeInBits() <= 2 * 32);
1064 }
1065 return true;
1066}
1067
1068bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1069 unsigned AddrSpace,
1070 unsigned Align,
1071 bool *IsFast) const {
1072 if (IsFast)
1073 *IsFast = false;
1074
1075 // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
1076 // which isn't a simple VT.
1077 // Until MVT is extended to handle this, simply check for the size and
1078 // rely on the condition below: allow accesses if the size is a multiple of 4.
1079 if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
1080 VT.getStoreSize() > 16)) {
1081 return false;
1082 }
1083
1084 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1085 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1086 // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
1087 // aligned, 8 byte access in a single operation using ds_read2/write2_b32
1088 // with adjacent offsets.
1089 bool AlignedBy4 = (Align % 4 == 0);
1090 if (IsFast)
1091 *IsFast = AlignedBy4;
1092
1093 return AlignedBy4;
1094 }
1095
1096 // FIXME: We have to be conservative here and assume that flat operations
1097 // will access scratch. If we had access to the IR function, then we
1098 // could determine if any private memory was used in the function.
1099 if (!Subtarget->hasUnalignedScratchAccess() &&
1100 (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1101 AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
1102 bool AlignedBy4 = Align >= 4;
1103 if (IsFast)
1104 *IsFast = AlignedBy4;
1105
1106 return AlignedBy4;
1107 }
1108
1109 if (Subtarget->hasUnalignedBufferAccess()) {
1110 // If we have an uniform constant load, it still requires using a slow
1111 // buffer instruction if unaligned.
1112 if (IsFast) {
1113 *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
1114 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
1115 (Align % 4 == 0) : true;
1116 }
1117
1118 return true;
1119 }
1120
1121 // Smaller than dword value must be aligned.
1122 if (VT.bitsLT(MVT::i32))
1123 return false;
1124
1125 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1126 // byte-address are ignored, thus forcing Dword alignment.
1127 // This applies to private, global, and constant memory.
1128 if (IsFast)
1129 *IsFast = true;
1130
1131 return VT.bitsGT(MVT::i32) && Align % 4 == 0;
1132}
1133
1134EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
1135 unsigned SrcAlign, bool IsMemset,
1136 bool ZeroMemset,
1137 bool MemcpyStrSrc,
1138 MachineFunction &MF) const {
1139 // FIXME: Should account for address space here.
1140
1141 // The default fallback uses the private pointer size as a guess for a type to
1142 // use. Make sure we switch these to 64-bit accesses.
1143
1144 if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
1145 return MVT::v4i32;
1146
1147 if (Size >= 8 && DstAlign >= 4)
1148 return MVT::v2i32;
1149
1150 // Use the default.
1151 return MVT::Other;
1152}
1153
1154static bool isFlatGlobalAddrSpace(unsigned AS) {
1155 return AS == AMDGPUAS::GLOBAL_ADDRESS ||
1156 AS == AMDGPUAS::FLAT_ADDRESS ||
1157 AS == AMDGPUAS::CONSTANT_ADDRESS;
1158}
1159
1160bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
1161 unsigned DestAS) const {
1162 return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
1163}
1164
1165bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
1166 const MemSDNode *MemNode = cast<MemSDNode>(N);
1167 const Value *Ptr = MemNode->getMemOperand()->getValue();
1168 const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
1169 return I && I->getMetadata("amdgpu.noclobber");
1170}
1171
1172bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
1173 unsigned DestAS) const {
1174 // Flat -> private/local is a simple truncate.
1175 // Flat -> global is no-op
1176 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1177 return true;
1178
1179 return isNoopAddrSpaceCast(SrcAS, DestAS);
1180}
1181
1182bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
1183 const MemSDNode *MemNode = cast<MemSDNode>(N);
1184
1185 return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
1186}
1187
1188TargetLoweringBase::LegalizeTypeAction
1189SITargetLowering::getPreferredVectorAction(EVT VT) const {
1190 if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
1191 return TypeSplitVector;
1192
1193 return TargetLoweringBase::getPreferredVectorAction(VT);
1194}
1195
1196bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
1197 Type *Ty) const {
1198 // FIXME: Could be smarter if called for vector constants.
1199 return true;
1200}
1201
1202bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
1203 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1204 switch (Op) {
1205 case ISD::LOAD:
1206 case ISD::STORE:
1207
1208 // These operations are done with 32-bit instructions anyway.
1209 case ISD::AND:
1210 case ISD::OR:
1211 case ISD::XOR:
1212 case ISD::SELECT:
1213 // TODO: Extensions?
1214 return true;
1215 default:
1216 return false;
1217 }
1218 }
1219
1220 // SimplifySetCC uses this function to determine whether or not it should
1221 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1222 if (VT == MVT::i1 && Op == ISD::SETCC)
1223 return false;
1224
1225 return TargetLowering::isTypeDesirableForOp(Op, VT);
1226}
1227
1228SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1229 const SDLoc &SL,
1230 SDValue Chain,
1231 uint64_t Offset) const {
1232 const DataLayout &DL = DAG.getDataLayout();
1233 MachineFunction &MF = DAG.getMachineFunction();
1234 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1235
1236 const ArgDescriptor *InputPtrReg;
1237 const TargetRegisterClass *RC;
1238
1239 std::tie(InputPtrReg, RC)
1240 = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1241
1242 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
1243 MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
1244 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1245 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1246
1247 return DAG.getObjectPtrOffset(SL, BasePtr, Offset);
1248}
1249
1250SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1251 const SDLoc &SL) const {
1252 uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(),
1253 FIRST_IMPLICIT);
1254 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1255}
1256
1257SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1258 const SDLoc &SL, SDValue Val,
1259 bool Signed,
1260 const ISD::InputArg *Arg) const {
1261 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
1262 VT.bitsLT(MemVT)) {
1263 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
1264 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
1265 }
1266
1267 if (MemVT.isFloatingPoint())
1268 Val = getFPExtOrFPTrunc(DAG, Val, SL, VT);
1269 else if (Signed)
1270 Val = DAG.getSExtOrTrunc(Val, SL, VT);
1271 else
1272 Val = DAG.getZExtOrTrunc(Val, SL, VT);
1273
1274 return Val;
1275}
1276
1277SDValue SITargetLowering::lowerKernargMemParameter(
1278 SelectionDAG &DAG, EVT VT, EVT MemVT,
1279 const SDLoc &SL, SDValue Chain,
1280 uint64_t Offset, unsigned Align, bool Signed,
1281 const ISD::InputArg *Arg) const {
1282 Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
1283 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
1284 MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
1285
1286 // Try to avoid using an extload by loading earlier than the argument address,
1287 // and extracting the relevant bits. The load should hopefully be merged with
1288 // the previous argument.
1289 if (MemVT.getStoreSize() < 4 && Align < 4) {
1290 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
1291 int64_t AlignDownOffset = alignDown(Offset, 4);
1292 int64_t OffsetDiff = Offset - AlignDownOffset;
1293
1294 EVT IntVT = MemVT.changeTypeToInteger();
1295
1296 // TODO: If we passed in the base kernel offset we could have a better
1297 // alignment than 4, but we don't really need it.
1298 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
1299 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4,
1300 MachineMemOperand::MODereferenceable |
1301 MachineMemOperand::MOInvariant);
1302
1303 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
1304 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
1305
1306 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
1307 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
1308 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
1309
1310
1311 return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
1312 }
1313
1314 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
1315 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
1316 MachineMemOperand::MODereferenceable |
1317 MachineMemOperand::MOInvariant);
1318
1319 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
1320 return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
1321}
1322
1323SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
1324 const SDLoc &SL, SDValue Chain,
1325 const ISD::InputArg &Arg) const {
1326 MachineFunction &MF = DAG.getMachineFunction();
1327 MachineFrameInfo &MFI = MF.getFrameInfo();
1328
1329 if (Arg.Flags.isByVal()) {
1330 unsigned Size = Arg.Flags.getByValSize();
1331 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
1332 return DAG.getFrameIndex(FrameIdx, MVT::i32);
1333 }
1334
1335 unsigned ArgOffset = VA.getLocMemOffset();
1336 unsigned ArgSize = VA.getValVT().getStoreSize();
1337
1338 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
1339
1340 // Create load nodes to retrieve arguments from the stack.
1341 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
1342 SDValue ArgValue;
1343
1344 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
1345 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
1346 MVT MemVT = VA.getValVT();
1347
1348 switch (VA.getLocInfo()) {
1349 default:
1350 break;
1351 case CCValAssign::BCvt:
1352 MemVT = VA.getLocVT();
1353 break;
1354 case CCValAssign::SExt:
1355 ExtType = ISD::SEXTLOAD;
1356 break;
1357 case CCValAssign::ZExt:
1358 ExtType = ISD::ZEXTLOAD;
1359 break;
1360 case CCValAssign::AExt:
1361 ExtType = ISD::EXTLOAD;
1362 break;
1363 }
1364
1365 ArgValue = DAG.getExtLoad(
1366 ExtType, SL, VA.getLocVT(), Chain, FIN,
1367 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
1368 MemVT);
1369 return ArgValue;
1370}
1371
1372SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
1373 const SIMachineFunctionInfo &MFI,
1374 EVT VT,
1375 AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
1376 const ArgDescriptor *Reg;
1377 const TargetRegisterClass *RC;
1378
1379 std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
1380 return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
1381}
1382
1383static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
1384 CallingConv::ID CallConv,
1385 ArrayRef<ISD::InputArg> Ins,
1386 BitVector &Skipped,
1387 FunctionType *FType,
1388 SIMachineFunctionInfo *Info) {
1389 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
1390 const ISD::InputArg *Arg = &Ins[I];
1391
1392 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
1393 "vector type argument should have been split");
1395 // First check if it's a PS input addr.
1396 if (CallConv == CallingConv::AMDGPU_PS &&
1397 !Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) {
1398
1399 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
1400
1401 // Inconveniently only the first part of the split is marked as isSplit,
1402 // so skip to the end. We only want to increment PSInputNum once for the
1403 // entire split argument.
1404 if (Arg->Flags.isSplit()) {
1405 while (!Arg->Flags.isSplitEnd()) {
1406 assert(!Arg->VT.isVector() &&
1407 "unexpected vector split in ps argument type");
1408 if (!SkipArg)
1409 Splits.push_back(*Arg);
1410 Arg = &Ins[++I];
1411 }
1412 }
1413
1414 if (SkipArg) {
1415 // We can safely skip PS inputs.
1416 Skipped.set(Arg->getOrigArgIndex());
1417 ++PSInputNum;
1418 continue;
1419 }
1420
1421 Info->markPSInputAllocated(PSInputNum);
1422 if (Arg->Used)
1423 Info->markPSInputEnabled(PSInputNum);
1424
1425 ++PSInputNum;
1426 }
1427
1428 Splits.push_back(*Arg);
1429 }
1430}
1431
1432// Allocate special inputs passed in VGPRs.
1433static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
1434 MachineFunction &MF,
1435 const SIRegisterInfo &TRI,
1436 SIMachineFunctionInfo &Info) {
1437 if (Info.hasWorkItemIDX()) {
1438 unsigned Reg = AMDGPU::VGPR0;
1439 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1440
1441 CCInfo.AllocateReg(Reg);
1442 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
1443 }
1444
1445 if (Info.hasWorkItemIDY()) {
1446 unsigned Reg = AMDGPU::VGPR1;
1447 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1448
1449 CCInfo.AllocateReg(Reg);
1450 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
1451 }
1452
1453 if (Info.hasWorkItemIDZ()) {
1454 unsigned Reg = AMDGPU::VGPR2;
1455 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1456
1457 CCInfo.AllocateReg(Reg);
1458 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
1459 }
1460}
1461
1462 // Try to allocate a VGPR at the end of the argument list, or, if no argument
1463 // VGPRs are left, allocate a stack slot.
1464static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
1465 ArrayRef<MCPhysReg> ArgVGPRs
1466 = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
1467 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
1468 if (RegIdx == ArgVGPRs.size()) {
1469 // Spill to stack required.
1470 int64_t Offset = CCInfo.AllocateStack(4, 4);
1471
1472 return ArgDescriptor::createStack(Offset);
1473 }
1474
1475 unsigned Reg = ArgVGPRs[RegIdx];
1476 Reg = CCInfo.AllocateReg(Reg);
1477 assert(Reg != AMDGPU::NoRegister);
1478
1479 MachineFunction &MF = CCInfo.getMachineFunction();
1480 MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1481 return ArgDescriptor::createRegister(Reg);
1482}
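// A minimal sketch (not part of this file) of how a caller can branch on the
// two forms produced above: either a live-in VGPR or a 4-byte stack slot.
// The helper name is hypothetical; isRegister()/getRegister() appear later in
// this file, and getStackOffset() is assumed to be the matching accessor.
static void sketchDescribeWorkItemArg(const ArgDescriptor &Arg) {
  if (Arg.isRegister())
    (void)Arg.getRegister();    // one of the 32 candidate VGPRs, already a live-in
  else
    (void)Arg.getStackOffset(); // byte offset reserved via CCInfo.AllocateStack(4, 4)
}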
1483
1484static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
1485 const TargetRegisterClass *RC,
1486 unsigned NumArgRegs) {
1487 ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
1488 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
1489 if (RegIdx == ArgSGPRs.size())
1490 report_fatal_error("ran out of SGPRs for arguments");
1491
1492 unsigned Reg = ArgSGPRs[RegIdx];
1493 Reg = CCInfo.AllocateReg(Reg);
1494 assert(Reg != AMDGPU::NoRegister);
1495
1496 MachineFunction &MF = CCInfo.getMachineFunction();
1497 MF.addLiveIn(Reg, RC);
1498 return ArgDescriptor::createRegister(Reg);
1499}
1500
1501static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
1502 return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
1503}
1504
1505static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
1506 return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
1507}
1508
1509static void allocateSpecialInputVGPRs(CCState &CCInfo,
1510 MachineFunction &MF,
1511 const SIRegisterInfo &TRI,
1512 SIMachineFunctionInfo &Info) {
1513 if (Info.hasWorkItemIDX())
1514 Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
1515
1516 if (Info.hasWorkItemIDY())
1517 Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
1518
1519 if (Info.hasWorkItemIDZ())
1520 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
1521}
1522
1523static void allocateSpecialInputSGPRs(CCState &CCInfo,
1524 MachineFunction &MF,
1525 const SIRegisterInfo &TRI,
1526 SIMachineFunctionInfo &Info) {
1527 auto &ArgInfo = Info.getArgInfo();
1528
1529 // TODO: Unify handling with private memory pointers.
1530
1531 if (Info.hasDispatchPtr())
1532 ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);
1533
1534 if (Info.hasQueuePtr())
1535 ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
1536
1537 if (Info.hasKernargSegmentPtr())
1538 ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
1539
1540 if (Info.hasDispatchID())
1541 ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
1542
1543 // flat_scratch_init is not applicable for non-kernel functions.
1544
1545 if (Info.hasWorkGroupIDX())
1546 ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);
1547
1548 if (Info.hasWorkGroupIDY())
1549 ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);
1550
1551 if (Info.hasWorkGroupIDZ())
1552 ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
1553
1554 if (Info.hasImplicitArgPtr())
1555 ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
1556}
1557
1558// Allocate special inputs passed in user SGPRs.
1559static void allocateHSAUserSGPRs(CCState &CCInfo,
1560 MachineFunction &MF,
1561 const SIRegisterInfo &TRI,
1562 SIMachineFunctionInfo &Info) {
1563 if (Info.hasImplicitBufferPtr()) {
1564 unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
1565 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
1566 CCInfo.AllocateReg(ImplicitBufferPtrReg);
1567 }
1568
1569 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
1570 if (Info.hasPrivateSegmentBuffer()) {
1571 unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
1572 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
1573 CCInfo.AllocateReg(PrivateSegmentBufferReg);
1574 }
1575
1576 if (Info.hasDispatchPtr()) {
1577 unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
1578 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
1579 CCInfo.AllocateReg(DispatchPtrReg);
1580 }
1581
1582 if (Info.hasQueuePtr()) {
1583 unsigned QueuePtrReg = Info.addQueuePtr(TRI);
1584 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
1585 CCInfo.AllocateReg(QueuePtrReg);
1586 }
1587
1588 if (Info.hasKernargSegmentPtr()) {
1589 unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI);
1590 MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
1591 CCInfo.AllocateReg(InputPtrReg);
1592 }
1593
1594 if (Info.hasDispatchID()) {
1595 unsigned DispatchIDReg = Info.addDispatchID(TRI);
1596 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
1597 CCInfo.AllocateReg(DispatchIDReg);
1598 }
1599
1600 if (Info.hasFlatScratchInit()) {
1601 unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
1602 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
1603 CCInfo.AllocateReg(FlatScratchInitReg);
1604 }
1605
1606 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
1607 // these from the dispatch pointer.
1608}
1609
1610// Allocate special input registers that are initialized per-wave.
1611static void allocateSystemSGPRs(CCState &CCInfo,
1612 MachineFunction &MF,
1613 SIMachineFunctionInfo &Info,
1614 CallingConv::ID CallConv,
1615 bool IsShader) {
1616 if (Info.hasWorkGroupIDX()) {
1617 unsigned Reg = Info.addWorkGroupIDX();
1618 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1619 CCInfo.AllocateReg(Reg);
1620 }
1621
1622 if (Info.hasWorkGroupIDY()) {
1623 unsigned Reg = Info.addWorkGroupIDY();
1624 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1625 CCInfo.AllocateReg(Reg);
1626 }
1627
1628 if (Info.hasWorkGroupIDZ()) {
1629 unsigned Reg = Info.addWorkGroupIDZ();
1630 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1631 CCInfo.AllocateReg(Reg);
1632 }
1633
1634 if (Info.hasWorkGroupInfo()) {
1635 unsigned Reg = Info.addWorkGroupInfo();
1636 MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1637 CCInfo.AllocateReg(Reg);
1638 }
1639
1640 if (Info.hasPrivateSegmentWaveByteOffset()) {
1641 // Scratch wave offset passed in system SGPR.
1642 unsigned PrivateSegmentWaveByteOffsetReg;
1643
1644 if (IsShader) {
1645 PrivateSegmentWaveByteOffsetReg =
1646 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
1647
1648 // This is true if the scratch wave byte offset doesn't have a fixed
1649 // location.
1650 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
1651 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
1652 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
1653 }
1654 } else
1655 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
1656
1657 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
1658 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
1659 }
1660}
1661
1662static void reservePrivateMemoryRegs(const TargetMachine &TM,
1663 MachineFunction &MF,
1664 const SIRegisterInfo &TRI,
1665 SIMachineFunctionInfo &Info) {
1666 // Now that we've figured out where the scratch register inputs are, see if
1667 // we should reserve the arguments and use them directly.
1668 MachineFrameInfo &MFI = MF.getFrameInfo();
1669 bool HasStackObjects = MFI.hasStackObjects();
1670
1671 // Record that we know we have non-spill stack objects so we don't need to
1672 // check all stack objects later.
1673 if (HasStackObjects)
1674 Info.setHasNonSpillStackObjects(true);
1675
1676 // Everything live out of a block is spilled with fast regalloc, so it's
1677 // almost certain that spilling will be required.
1678 if (TM.getOptLevel() == CodeGenOpt::None)
1679 HasStackObjects = true;
1680
1681 // For now assume stack access is needed in any callee functions, so we need
1682 // the scratch registers to pass in.
1683 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
1684
1685 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1686 if (ST.isAmdHsaOrMesa(MF.getFunction())) {
1687 if (RequiresStackAccess) {
1688 // If we have stack objects, we unquestionably need the private buffer
1689 // resource. For the Code Object V2 ABI, this will be the first 4 user
1690 // SGPR inputs. We can reserve those and use them directly.
1691
1692 unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
1693 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
1694 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
1695
1696 if (MFI.hasCalls()) {
1697 // If we have calls, we need to keep the frame register in a register
1698 // that won't be clobbered by a call, so ensure it is copied somewhere.
1699
1700 // This is not a problem for the scratch wave offset, because the same
1701 // registers are reserved in all functions.
1702
1703 // FIXME: Nothing is really ensuring this is a call preserved register,
1704 // it's just selected from the end so it happens to be.
1705 unsigned ReservedOffsetReg
1706 = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1707 Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1708 } else {
1709 unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
1710 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
1711 Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
1712 }
1713 } else {
1714 unsigned ReservedBufferReg
1715 = TRI.reservedPrivateSegmentBufferReg(MF);
1716 unsigned ReservedOffsetReg
1717 = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1718
1719 // We tentatively reserve the last registers (skipping the last two
1720 // which may contain VCC). After register allocation, we'll replace
1721 // these with the ones immediately after those which were really
1722 // allocated. In the prologue, copies will be inserted from the argument
1723 // to these reserved registers.
1724 Info.setScratchRSrcReg(ReservedBufferReg);
1725 Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1726 }
1727 } else {
1728 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
1729
1730 // Without HSA, relocations are used for the scratch pointer and the
1731 // buffer resource setup is always inserted in the prologue. Scratch wave
1732 // offset is still in an input SGPR.
1733 Info.setScratchRSrcReg(ReservedBufferReg);
1734
1735 if (HasStackObjects && !MFI.hasCalls()) {
1736 unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
1737 AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
1738 Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
1739 } else {
1740 unsigned ReservedOffsetReg
1741 = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1742 Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1743 }
1744 }
1745}
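// A minimal sketch (not part of this file; the helper name is hypothetical)
// condensing the stack-access decision made above: stack objects are assumed
// whenever fast regalloc will spill (-O0), and any call is treated as needing
// scratch.
static bool sketchRequiresStackAccess(const MachineFrameInfo &MFI, bool OptNone) {
  bool HasStackObjects = MFI.hasStackObjects() || OptNone;
  return HasStackObjects || MFI.hasCalls();
}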
1746
1747bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
1748 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1749 return !Info->isEntryFunction();
1750}
1751
1752void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
1753
1754}
1755
1756void SITargetLowering::insertCopiesSplitCSR(
1757 MachineBasicBlock *Entry,
1758 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
1759 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
1760
1761 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
1762 if (!IStart)
1763 return;
1764
1765 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1766 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
1767 MachineBasicBlock::iterator MBBI = Entry->begin();
1768 for (const MCPhysReg *I = IStart; *I; ++I) {
1769 const TargetRegisterClass *RC = nullptr;
1770 if (AMDGPU::SReg_64RegClass.contains(*I))
1771 RC = &AMDGPU::SGPR_64RegClass;
1772 else if (AMDGPU::SReg_32RegClass.contains(*I))
1773 RC = &AMDGPU::SGPR_32RegClass;
1774 else
1775 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
1776
1777 unsigned NewVR = MRI->createVirtualRegister(RC);
1778 // Create copy from CSR to a virtual register.
1779 Entry->addLiveIn(*I);
1780 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
1781 .addReg(*I);
1782
1783 // Insert the copy-back instructions right before the terminator.
1784 for (auto *Exit : Exits)
1785 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
1786 TII->get(TargetOpcode::COPY), *I)
1787 .addReg(NewVR);
1788 }
1789}
1790
1791SDValue SITargetLowering::LowerFormalArguments(
1792 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
1793 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1794 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1795 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
1796
1797 MachineFunction &MF = DAG.getMachineFunction();
1798 const Function &Fn = MF.getFunction();
1799 FunctionType *FType = MF.getFunction().getFunctionType();
1800 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1801 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1802
1803 if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
1804 DiagnosticInfoUnsupported NoGraphicsHSA(
1805 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
1806 DAG.getContext()->diagnose(NoGraphicsHSA);
1807 return DAG.getEntryNode();
1808 }
1809
1810 // Create stack objects that are used for emitting debugger prologue if
1811 // "amdgpu-debugger-emit-prologue" attribute was specified.
1812 if (ST.debuggerEmitPrologue())
1813 createDebuggerPrologueStackObjects(MF);
1814
1815 SmallVector<ISD::InputArg, 16> Splits;
1816 SmallVector<CCValAssign, 16> ArgLocs;
1817 BitVector Skipped(Ins.size());
1818 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1819 *DAG.getContext());
1820
1821 bool IsShader = AMDGPU::isShader(CallConv);
1822 bool IsKernel = AMDGPU::isKernel(CallConv);
1823 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
1824
1825 if (!IsEntryFunc) {
1826 // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
1827 // this when allocating argument fixed offsets.
1828 CCInfo.AllocateStack(4, 4);
1829 }
1830
1831 if (IsShader) {
1832 processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
1833
1834 // At least one interpolation mode must be enabled or else the GPU will
1835 // hang.
1836 //
1837 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
1838 // set PSInputAddr, the user wants to enable some bits after the compilation
1839 // based on run-time states. Since we can't know what the final PSInputEna
1840 // will look like, we shouldn't do anything here, and the user should take
1841 // responsibility for the correct programming.
1842 //
1843 // Otherwise, the following restrictions apply:
1844 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
1845 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
1846 // enabled too.
1847 if (CallConv == CallingConv::AMDGPU_PS) {
1848 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
1849 ((Info->getPSInputAddr() & 0xF) == 0 &&
1850 Info->isPSInputAllocated(11))) {
1851 CCInfo.AllocateReg(AMDGPU::VGPR0);
1852 CCInfo.AllocateReg(AMDGPU::VGPR1);
1853 Info->markPSInputAllocated(0);
1854 Info->markPSInputEnabled(0);
1855 }
1856 if (Subtarget->isAmdPalOS()) {
1857 // For isAmdPalOS, the user does not enable some bits after compilation
1858 // based on run-time states; the register values being generated here are
1859 // the final ones set in hardware. Therefore we need to apply the
1860 // workaround to PSInputAddr and PSInputEnable together. (The case where
1861 // a bit is set in PSInputAddr but not PSInputEnable is where the
1862 // frontend set up an input arg for a particular interpolation mode, but
1863 // nothing uses that input arg. Really we should have an earlier pass
1864 // that removes such an arg.)
1865 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
1866 if ((PsInputBits & 0x7F) == 0 ||
1867 ((PsInputBits & 0xF) == 0 &&
1868 (PsInputBits >> 11 & 1)))
1869 Info->markPSInputEnabled(
1870 countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
1871 }
1872 }
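// A minimal worked example (not part of this file) of the hang check above,
// assuming a pixel shader that requested only POS_W_FLOAT (bit 11):
//   PSInputAddr        = 0x800
//   PSInputAddr & 0x7F = 0        -> no PERSP_* (0xF) or LINEAR_* (0x70) inputs
// so VGPR0/VGPR1 are allocated and PS input 0 is force-enabled to keep the GPU
// from hanging.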
1873
1874 assert(!Info->hasDispatchPtr() &&
1875 !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
1876 !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
1877 !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
1878 !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
1879 !Info->hasWorkItemIDZ());
1880 } else if (IsKernel) {
1881 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
1882 } else {
1883 Splits.append(Ins.begin(), Ins.end());
1884 }
1885
1886 if (IsEntryFunc) {
1887 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
1888 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
1889 }
1890
1891 if (IsKernel) {
1892 analyzeFormalArgumentsCompute(CCInfo, Ins);
1893 } else {
1894 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
1895 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
1896 }
1897
1898 SmallVector<SDValue, 16> Chains;
1899
1900 // FIXME: This is the minimum kernel argument alignment. We should improve
1901 // this to the maximum alignment of the arguments.
1902 //
1903 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
1904 // kern arg offset.
1905 const unsigned KernelArgBaseAlign = 16;
1906
1907 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
1908 const ISD::InputArg &Arg = Ins[i];
1909 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
1910 InVals.push_back(DAG.getUNDEF(Arg.VT));
1911 continue;
1912 }
1913
1914 CCValAssign &VA = ArgLocs[ArgIdx++];
1915 MVT VT = VA.getLocVT();
1916
1917 if (IsEntryFunc && VA.isMemLoc()) {
1918 VT = Ins[i].VT;
1919 EVT MemVT = VA.getLocVT();
1920
1921 const uint64_t Offset = VA.getLocMemOffset();
1922 unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
1923
1924 SDValue Arg = lowerKernargMemParameter(
1925 DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
1926 Chains.push_back(Arg.getValue(1));
1927
1928 auto *ParamTy =
1929 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
1930 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
1931 ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
1932 // On SI local pointers are just offsets into LDS, so they are always
1933 // less than 16 bits. On CI and newer they could potentially be
1934 // real pointers, so we can't guarantee their size.
1935 Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
1936 DAG.getValueType(MVT::i16));
1937 }
1938
1939 InVals.push_back(Arg);
1940 continue;
1941 } else if (!IsEntryFunc && VA.isMemLoc()) {
1942 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
1943 InVals.push_back(Val);
1944 if (!Arg.Flags.isByVal())
1945 Chains.push_back(Val.getValue(1));
1946 continue;
1947 }
1948
1949 assert(VA.isRegLoc() && "Parameter must be in a register!");
1950
1951 unsigned Reg = VA.getLocReg();
1952 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
1953 EVT ValVT = VA.getValVT();
1954
1955 Reg = MF.addLiveIn(Reg, RC);
1956 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1957
1958 if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) {
1959 // The return object should be reasonably addressable.
1960
1961 // FIXME: This helps when the return is a real sret. If it is a
1962 // automatically inserted sret (i.e. CanLowerReturn returns false), an
1963 // extra copy is inserted in SelectionDAGBuilder which obscures this.
1964 unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits;
1965 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
1966 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
1967 }
1968
1969 // If this is an 8 or 16-bit value, it is really passed promoted
1970 // to 32 bits. Insert an assert[sz]ext to capture this, then
1971 // truncate to the right size.
1972 switch (VA.getLocInfo()) {
1973 case CCValAssign::Full:
1974 break;
1975 case CCValAssign::BCvt:
1976 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
1977 break;
1978 case CCValAssign::SExt:
1979 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
1980 DAG.getValueType(ValVT));
1981 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
1982 break;
1983 case CCValAssign::ZExt:
1984 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
1985 DAG.getValueType(ValVT));
1986 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
1987 break;
1988 case CCValAssign::AExt:
1989 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
1990 break;
1991 default:
1992 llvm_unreachable("Unknown loc info!");
1993 }
1994
1995 InVals.push_back(Val);
1996 }
1997
1998 if (!IsEntryFunc) {
1999 // Special inputs come after user arguments.
2000 allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
2001 }
2002
2003 // Start adding system SGPRs.
2004 if (IsEntryFunc) {
2005 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
2006 } else {
2007 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2008 CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
2009 CCInfo.AllocateReg(Info->getFrameOffsetReg());
2010 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2011 }
2012
2013 auto &ArgUsageInfo =
2014 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
2015 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
2016
2017 unsigned StackArgSize = CCInfo.getNextStackOffset();
2018 Info->setBytesInStackArgArea(StackArgSize);
2019
2020 return Chains.empty() ? Chain :
2021 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
2022}
2023
2024// TODO: If return values can't fit in registers, we should return as many as
2025// possible in registers before passing on stack.
2026bool SITargetLowering::CanLowerReturn(
2027 CallingConv::ID CallConv,
2028 MachineFunction &MF, bool IsVarArg,
2029 const SmallVectorImpl<ISD::OutputArg> &Outs,
2030 LLVMContext &Context) const {
2031 // Replacing returns with sret/stack usage doesn't make sense for shaders.
2032 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
2033 // for shaders. Vector types should be explicitly handled by CC.
2034 if (AMDGPU::isEntryFunctionCC(CallConv))
2035 return true;
2036
2037 SmallVector<CCValAssign, 16> RVLocs;
2038 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
2039 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
2040}
2041
2042SDValue
2043SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2044 bool isVarArg,
2045 const SmallVectorImpl<ISD::OutputArg> &Outs,
2046 const SmallVectorImpl<SDValue> &OutVals,
2047 const SDLoc &DL, SelectionDAG &DAG) const {
2048 MachineFunction &MF = DAG.getMachineFunction();
2049 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2050
2051 if (AMDGPU::isKernel(CallConv)) {
2052 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
2053 OutVals, DL, DAG);
2054 }
2055
2056 bool IsShader = AMDGPU::isShader(CallConv);
2057
2058 Info->setIfReturnsVoid(Outs.empty());
2059 bool IsWaveEnd = Info->returnsVoid() && IsShader;
2060
2061 // CCValAssign - represent the assignment of the return value to a location.
2062 SmallVector<CCValAssign, 48> RVLocs;
2063 SmallVector<ISD::OutputArg, 48> Splits;
2064
2065 // CCState - Info about the registers and stack slots.
2066 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2067 *DAG.getContext());
2068
2069 // Analyze outgoing return values.
2070 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2071
2072 SDValue Flag;
2073 SmallVector<SDValue, 48> RetOps;
2074 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2075
2076 // Add return address for callable functions.
2077 if (!Info->isEntryFunction()) {
2078 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2079 SDValue ReturnAddrReg = CreateLiveInRegister(
2080 DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2081
2082 // FIXME: Should be able to use a vreg here, but need a way to prevent it
2084 // from being allocated to a CSR.
2084
2085 SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2086 MVT::i64);
2087
2088 Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
2089 Flag = Chain.getValue(1);
2090
2091 RetOps.push_back(PhysReturnAddrReg);
2092 }
2093
2094 // Copy the result values into the output registers.
2095 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
2096 ++I, ++RealRVLocIdx) {
2097 CCValAssign &VA = RVLocs[I];
2098 assert(VA.isRegLoc() && "Can only return in registers!");
2099 // TODO: Partially return in registers if return values don't fit.
2100 SDValue Arg = OutVals[RealRVLocIdx];
2101
2102 // Copied from other backends.
2103 switch (VA.getLocInfo()) {
2104 case CCValAssign::Full:
2105 break;
2106 case CCValAssign::BCvt:
2107 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2108 break;
2109 case CCValAssign::SExt:
2110 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2111 break;
2112 case CCValAssign::ZExt:
2113 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2114 break;
2115 case CCValAssign::AExt:
2116 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2117 break;
2118 default:
2119 llvm_unreachable("Unknown loc info!");
2120 }
2121
2122 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
2123 Flag = Chain.getValue(1);
2124 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2125 }
2126
2127 // FIXME: Does sret work properly?
2128 if (!Info->isEntryFunction()) {
2129 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2130 const MCPhysReg *I =
2131 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2132 if (I) {
2133 for (; *I; ++I) {
2134 if (AMDGPU::SReg_64RegClass.contains(*I))
2135 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2136 else if (AMDGPU::SReg_32RegClass.contains(*I))
2137 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
2138 else
2139 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2140 }
2141 }
2142 }
2143
2144 // Update chain and glue.
2145 RetOps[0] = Chain;
2146 if (Flag.getNode())
2147 RetOps.push_back(Flag);
2148
2149 unsigned Opc = AMDGPUISD::ENDPGM;
2150 if (!IsWaveEnd)
2151 Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
2152 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
2153}
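// A minimal sketch (not part of this file; the helper name is hypothetical) of
// the opcode selection at the end of the non-kernel return path above:
static unsigned sketchReturnOpcode(bool IsShader, bool ReturnsVoid) {
  if (IsShader && ReturnsVoid)
    return AMDGPUISD::ENDPGM;      // wave end
  return IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
}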
2154
2155SDValue SITargetLowering::LowerCallResult(
2156 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
2157 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2158 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
2159 SDValue ThisVal) const {
2160 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
2161
2162 // Assign locations to each value returned by this call.
2163 SmallVector<CCValAssign, 16> RVLocs;
2164 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
2165 *DAG.getContext());
2166 CCInfo.AnalyzeCallResult(Ins, RetCC);
2167
2168 // Copy all of the result registers out of their specified physreg.
2169 for (unsigned i = 0; i != RVLocs.size(); ++i) {
2170 CCValAssign VA = RVLocs[i];
2171 SDValue Val;
2172
2173 if (VA.isRegLoc()) {
2174 Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
2175 Chain = Val.getValue(1);
2176 InFlag = Val.getValue(2);
2177 } else if (VA.isMemLoc()) {
2178 report_fatal_error("TODO: return values in memory");
2179 } else
2180 llvm_unreachable("unknown argument location type");
2181
2182 switch (VA.getLocInfo()) {
2183 case CCValAssign::Full:
2184 break;
2185 case CCValAssign::BCvt:
2186 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
2187 break;
2188 case CCValAssign::ZExt:
2189 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
2190 DAG.getValueType(VA.getValVT()));
2191 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2192 break;
2193 case CCValAssign::SExt:
2194 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
2195 DAG.getValueType(VA.getValVT()));
2196 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2197 break;
2198 case CCValAssign::AExt:
2199 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2200 break;
2201 default:
2202 llvm_unreachable("Unknown loc info!");
2203 }
2204
2205 InVals.push_back(Val);
2206 }
2207
2208 return Chain;
2209}
2210
2211// Add code to pass special inputs required depending on used features separate
2212// from the explicit user arguments present in the IR.
2213void SITargetLowering::passSpecialInputs(
2214 CallLoweringInfo &CLI,
2215 CCState &CCInfo,
2216 const SIMachineFunctionInfo &Info,
2217 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
2218 SmallVectorImpl<SDValue> &MemOpChains,
2219 SDValue Chain) const {
2220 // If we don't have a call site, this was a call inserted by
2221 // legalization. These can never use special inputs.
2222 if (!CLI.CS)
2223 return;
2224
2225 const Function *CalleeFunc = CLI.CS.getCalledFunction();
2226 assert(CalleeFunc);
2227
2228 SelectionDAG &DAG = CLI.DAG;
2229 const SDLoc &DL = CLI.DL;
2230
2231 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2232
2233 auto &ArgUsageInfo =
2234 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
2235 const AMDGPUFunctionArgInfo &CalleeArgInfo
2236 = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
2237
2238 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
2239
2240 // TODO: Unify with private memory register handling. This is complicated by
2241 // the fact that at least in kernels, the input argument is not necessarily
2242 // in the same location as the input.
2243 AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
2244 AMDGPUFunctionArgInfo::DISPATCH_PTR,
2245 AMDGPUFunctionArgInfo::QUEUE_PTR,
2246 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
2247 AMDGPUFunctionArgInfo::DISPATCH_ID,
2248 AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
2249 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
2250 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
2251 AMDGPUFunctionArgInfo::WORKITEM_ID_X,
2252 AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
2253 AMDGPUFunctionArgInfo::WORKITEM_ID_Z,
2254 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
2255 };
2256
2257 for (auto InputID : InputRegs) {
2258 const ArgDescriptor *OutgoingArg;
2259 const TargetRegisterClass *ArgRC;
2260
2261 std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
2262 if (!OutgoingArg)
2263 continue;
2264
2265 const ArgDescriptor *IncomingArg;
2266 const TargetRegisterClass *IncomingArgRC;
2267 std::tie(IncomingArg, IncomingArgRC)
2268 = CallerArgInfo.getPreloadedValue(InputID);
2269 assert(IncomingArgRC == ArgRC);
2270
2271 // All special arguments are ints for now.
2272 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
2273 SDValue InputReg;
2274
2275 if (IncomingArg) {
2276 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
2277 } else {
2278 // The implicit arg ptr is special because it doesn't have a corresponding
2279 // input for kernels, and is computed from the kernarg segment pointer.
2280 assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2281 InputReg = getImplicitArgPtr(DAG, DL);
2282 }
2283
2284 if (OutgoingArg->isRegister()) {
2285 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2286 } else {
2287 unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4);
2288 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2289 SpecialArgOffset);
2290 MemOpChains.push_back(ArgStore);
2291 }
2292 }
2293}
2294
2295static bool canGuaranteeTCO(CallingConv::ID CC) {
2296 return CC == CallingConv::Fast;
2297}
2298
2299/// Return true if we might ever do TCO for calls with this calling convention.
2300static bool mayTailCallThisCC(CallingConv::ID CC) {
2301 switch (CC) {
2302 case CallingConv::C:
2303 return true;
2304 default:
2305 return canGuaranteeTCO(CC);
2306 }
2307}
2308
2309bool SITargetLowering::isEligibleForTailCallOptimization(
2310 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
2311 const SmallVectorImpl<ISD::OutputArg> &Outs,
2312 const SmallVectorImpl<SDValue> &OutVals,
2313 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
2314 if (!mayTailCallThisCC(CalleeCC))
2315 return false;
2316
2317 MachineFunction &MF = DAG.getMachineFunction();
2318 const Function &CallerF = MF.getFunction();
2319 CallingConv::ID CallerCC = CallerF.getCallingConv();
2320 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2321 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2322
2323 // Kernels aren't callable and don't have a live-in return address, so it
2324 // doesn't make sense to do a tail call with entry functions.
2325 if (!CallerPreserved)
2326 return false;
2327
2328 bool CCMatch = CallerCC == CalleeCC;
2329
2330 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
2331 if (canGuaranteeTCO(CalleeCC) && CCMatch)
2332 return true;
2333 return false;
2334 }
2335
2336 // TODO: Can we handle var args?
2337 if (IsVarArg)
2338 return false;
2339
2340 for (const Argument &Arg : CallerF.args()) {
2341 if (Arg.hasByValAttr())
2342 return false;
2343 }
2344
2345 LLVMContext &Ctx = *DAG.getContext();
2346
2347 // Check that the call results are passed in the same way.
2348 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
2349 CCAssignFnForCall(CalleeCC, IsVarArg),
2350 CCAssignFnForCall(CallerCC, IsVarArg)))
2351 return false;
2352
2353 // The callee has to preserve all registers the caller needs to preserve.
2354 if (!CCMatch) {
2355 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2356 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2357 return false;
2358 }
2359
2360 // Nothing more to check if the callee is taking no arguments.
2361 if (Outs.empty())
2362 return true;
2363
2364 SmallVector<CCValAssign, 16> ArgLocs;
2365 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
2366
2367 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
2368
2369 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
2370 // If the stack arguments for this call do not fit into our own save area then
2371 // the call cannot be made tail.
2372 // TODO: Is this really necessary?
2373 if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
2374 return false;
2375
2376 const MachineRegisterInfo &MRI = MF.getRegInfo();
2377 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
2378}
2379
2380bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2381 if (!CI->isTailCall())
2382 return false;
2383
2384 const Function *ParentFn = CI->getParent()->getParent();
2385 if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
2386 return false;
2387
2388 auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
2389 return (Attr.getValueAsString() != "true");
2390}
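// A minimal sketch (not part of this file; the helper name is hypothetical) of
// the attribute check above: any value other than the literal string "true"
// leaves tail-call emission enabled for the parent function.
static bool sketchTailCallsAllowed(const Function &ParentFn) {
  return ParentFn.getFnAttribute("disable-tail-calls").getValueAsString() != "true";
}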
2391
2392// The wave scratch offset register is used as the global base pointer.
2393SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
2394 SmallVectorImpl<SDValue> &InVals) const {
2395 SelectionDAG &DAG = CLI.DAG;
2396 const SDLoc &DL = CLI.DL;
2397 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
2398 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
2399 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
2400 SDValue Chain = CLI.Chain;
2401 SDValue Callee = CLI.Callee;
2402 bool &IsTailCall = CLI.IsTailCall;
2403 CallingConv::ID CallConv = CLI.CallConv;
2404 bool IsVarArg = CLI.IsVarArg;
2405 bool IsSibCall = false;
2406 bool IsThisReturn = false;
2407 MachineFunction &MF = DAG.getMachineFunction();
2408
2409 if (IsVarArg) {
2410 return lowerUnhandledCall(CLI, InVals,
2411 "unsupported call to variadic function ");
2412 }
2413
2414 if (!CLI.CS.getInstruction())
2415 report_fatal_error("unsupported libcall legalization");
2416
2417 if (!CLI.CS.getCalledFunction()) {
2418 return lowerUnhandledCall(CLI, InVals,
2419 "unsupported indirect call to function ");
2420 }
2421
2422 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
2423 return lowerUnhandledCall(CLI, InVals,
2424 "unsupported required tail call to function ");
2425 }
2426
2427 if (AMDGPU::isShader(MF.getFunction().getCallingConv())) {
2428 // Note the issue is with the CC of the calling function, not of the call
2429 // itself.
2430 return lowerUnhandledCall(CLI, InVals,
2431 "unsupported call from graphics shader of function ");
2432 }
2433
2434 // The first 4 bytes are reserved for the callee's emergency stack slot.
2435 if (IsTailCall) {
2436 IsTailCall = isEligibleForTailCallOptimization(
2437 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
2438 if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
2439 report_fatal_error("failed to perform tail call elimination on a call "
2440 "site marked musttail");
2441 }
2442
2443 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
2444
2445 // A sibling call is one where we're under the usual C ABI and not planning
2446 // to change that but can still do a tail call:
2447 if (!TailCallOpt && IsTailCall)
2448 IsSibCall = true;
2449
2450 if (IsTailCall)
2451 ++NumTailCalls;
2452 }
2453
2454 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2455
2456 // Analyze operands of the call, assigning locations to each operand.
2457 SmallVector<CCValAssign, 16> ArgLocs;
2458 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
2459 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
2460
2461 // The first 4 bytes are reserved for the callee's emergency stack slot.
2462 CCInfo.AllocateStack(4, 4);
2463
2464 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
2465
2466 // Get a count of how many bytes are to be pushed on the stack.
2467 unsigned NumBytes = CCInfo.getNextStackOffset();
2468
2469 if (IsSibCall) {
2470 // Since we're not changing the ABI to make this a tail call, the memory
2471 // operands are already available in the caller's incoming argument space.
2472 NumBytes = 0;
2473 }
2474
2475 // FPDiff is the byte offset of the call's argument area from the callee's.
2476 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2477 // by this amount for a tail call. In a sibling call it must be 0 because the
2478 // caller will deallocate the entire stack and the callee still expects its
2479 // arguments to begin at SP+0. Completely unused for non-tail calls.
2480 int32_t FPDiff = 0;
2481 MachineFrameInfo &MFI = MF.getFrameInfo();
2482 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2483
2484 SDValue CallerSavedFP;
2485
2486 // Adjust the stack pointer for the new arguments...
2487 // These operations are automatically eliminated by the prolog/epilog pass
2488 if (!IsSibCall) {
2489 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
2490
2491 unsigned OffsetReg = Info->getScratchWaveOffsetReg();
2492
2493 // In the HSA case, this should be an identity copy.
2494 SDValue ScratchRSrcReg
2495 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
2496 RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
2497
2498 // TODO: Don't hardcode these registers and get from the callee function.
2499 SDValue ScratchWaveOffsetReg
2500 = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
2501 RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
2502
2503 if (!Info->isEntryFunction()) {
2504 // Avoid clobbering this function's FP value. In the current convention
2505 // the callee will overwrite this, so save/restore around the call site.
2506 CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
2507 Info->getFrameOffsetReg(), MVT::i32);
2508 }
2509 }
2510
2511 SmallVector<SDValue, 8> MemOpChains;
2512 MVT PtrVT = MVT::i32;
2513
2514 // Walk the register/memloc assignments, inserting copies/loads.
2515 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
2516 ++i, ++realArgIdx) {
2517 CCValAssign &VA = ArgLocs[i];
2518 SDValue Arg = OutVals[realArgIdx];
2519
2520 // Promote the value if needed.
2521 switch (VA.getLocInfo()) {
2522 case CCValAssign::Full:
2523 break;
2524 case CCValAssign::BCvt:
2525 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2526 break;
2527 case CCValAssign::ZExt:
2528 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2529 break;
2530 case CCValAssign::SExt:
2531 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2532 break;
2533 case CCValAssign::AExt:
2534 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2535 break;
2536 case CCValAssign::FPExt:
2537 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
2538 break;
2539 default:
2540 llvm_unreachable("Unknown loc info!");
2541 }
2542
2543 if (VA.isRegLoc()) {
2544 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2545 } else {
2546 assert(VA.isMemLoc());
2547
2548 SDValue DstAddr;
2549 MachinePointerInfo DstInfo;
2550
2551 unsigned LocMemOffset = VA.getLocMemOffset();
2552 int32_t Offset = LocMemOffset;
2553
2554 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
2555 unsigned Align = 0;
2556
2557 if (IsTailCall) {
2558 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2559 unsigned OpSize = Flags.isByVal() ?
2560 Flags.getByValSize() : VA.getValVT().getStoreSize();
2561
2562 // FIXME: We can have better than the minimum byval required alignment.
2563 Align = Flags.isByVal() ? Flags.getByValAlign() :
2564 MinAlign(Subtarget->getStackAlignment(), Offset);
2565
2566 Offset = Offset + FPDiff;
2567 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
2568
2569 DstAddr = DAG.getFrameIndex(FI, PtrVT);
2570 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
2571
2572 // Make sure any stack arguments overlapping with where we're storing
2573 // are loaded before this eventual operation. Otherwise they'll be
2574 // clobbered.
2575
2576 // FIXME: Why is this really necessary? This seems to just result in a
2577 // lot of code to copy the stack and write them back to the same
2578 // locations, which are supposed to be immutable?
2579 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
2580 } else {
2581 DstAddr = PtrOff;
2582 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
2583 Align = MinAlign(Subtarget->getStackAlignment(), LocMemOffset);
2584 }
2585
2586 if (Outs[i].Flags.isByVal()) {
2587 SDValue SizeNode =
2588 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
2589 SDValue Cpy = DAG.getMemcpy(
2590 Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
2591 /*isVol = */ false, /*AlwaysInline = */ true,
2592 /*isTailCall = */ false, DstInfo,
2593 MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy(
2594 *DAG.getContext(), AMDGPUAS::PRIVATE_ADDRESS))));
2595
2596 MemOpChains.push_back(Cpy);
2597 } else {
2598 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Align);
2599 MemOpChains.push_back(Store);
2600 }
2601 }
2602 }
2603
2604 // Copy special input registers after user input arguments.
2605 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
2606
2607 if (!MemOpChains.empty())
2608 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
2609
2610 // Build a sequence of copy-to-reg nodes chained together with token chain
2611 // and flag operands which copy the outgoing args into the appropriate regs.
2612 SDValue InFlag;
2613 for (auto &RegToPass : RegsToPass) {
2614 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
2615 RegToPass.second, InFlag);
2616 InFlag = Chain.getValue(1);
2617 }
2618
2619
2620 SDValue PhysReturnAddrReg;
2621 if (IsTailCall) {
2622 // Since the return is being combined with the call, we need to pass on the
2623 // return address.
2624
2625 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2626 SDValue ReturnAddrReg = CreateLiveInRegister(
2627 DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2628
2629 PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2630 MVT::i64);
2631 Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
2632 InFlag = Chain.getValue(1);
2633 }
2634
2635 // We don't usually want to end the call-sequence here because we would tidy
2636 // the frame up *after* the call. However, in the ABI-changing tail-call case
2637 // we've carefully laid out the parameters so that when sp is reset they'll be
2638 // in the correct location.
2639 if (IsTailCall && !IsSibCall) {
2640 Chain = DAG.getCALLSEQ_END(Chain,
2641 DAG.getTargetConstant(NumBytes, DL, MVT::i32),
2642 DAG.getTargetConstant(0, DL, MVT::i32),
2643 InFlag, DL);
2644 InFlag = Chain.getValue(1);
2645 }
2646
2647 std::vector<SDValue> Ops;
2648 Ops.push_back(Chain);
2649 Ops.push_back(Callee);
2650
2651 if (IsTailCall) {
2652 // Each tail call may have to adjust the stack by a different amount, so
2653 // this information must travel along with the operation for eventual
2654 // consumption by emitEpilogue.
2655 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
2656
2657 Ops.push_back(PhysReturnAddrReg);
2658 }
2659
2660 // Add argument registers to the end of the list so that they are known live
2661 // into the call.
2662 for (auto &RegToPass : RegsToPass) {
2663 Ops.push_back(DAG.getRegister(RegToPass.first,
2664 RegToPass.second.getValueType()));
2665 }
2666
2667 // Add a register mask operand representing the call-preserved registers.
2668
2669 auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
2670 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
2671 assert(Mask && "Missing call preserved mask for calling convention");
2672 Ops.push_back(DAG.getRegisterMask(Mask));
2673
2674 if (InFlag.getNode())
2675 Ops.push_back(InFlag);
2676
2677 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2678
2679 // If we're doing a tail call, use a TC_RETURN here rather than an
2680 // actual call instruction.
2681 if (IsTailCall) {
2682 MFI.setHasTailCall();
2683 return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
2684 }
2685
2686 // Returns a chain and a flag for retval copy to use.
2687 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
2688 Chain = Call.getValue(0);
2689 InFlag = Call.getValue(1);
2690
2691 if (CallerSavedFP) {
2692 SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
2693 Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
2694 InFlag = Chain.getValue(1);
2695 }
2696
2697 uint64_t CalleePopBytes = NumBytes;
2698 Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
2699 DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
2700 InFlag, DL);
2701 if (!Ins.empty())
2702 InFlag = Chain.getValue(1);
2703
2704 // Handle result values, copying them out of physregs into vregs that we
2705 // return.
2706 return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
2707 InVals, IsThisReturn,
2708 IsThisReturn ? OutVals[0] : SDValue());
2709}
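
// To summarize the node built above (an informal sketch, not additional
// lowering): the final call node's operand list is laid out as
//
//   { Chain, Callee,
//     [ FPDiff, PhysReturnAddrReg ]   // tail calls only
//     R0, R1, ...                     // outgoing argument registers
//     RegMask,                        // call-preserved register mask
//     [ InFlag ] }                    // glue, if present
//
// and is emitted either as AMDGPUISD::TC_RETURN (tail call) or as an
// AMDGPUISD::CALL followed by CALLSEQ_END and the result-copy handling.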
2710
2711unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
2712 SelectionDAG &DAG) const {
2713 unsigned Reg = StringSwitch<unsigned>(RegName)
2714 .Case("m0", AMDGPU::M0)
2715 .Case("exec", AMDGPU::EXEC)
2716 .Case("exec_lo", AMDGPU::EXEC_LO)
2717 .Case("exec_hi", AMDGPU::EXEC_HI)
2718 .Case("flat_scratch", AMDGPU::FLAT_SCR)
2719 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
2720 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
2721 .Default(AMDGPU::NoRegister);
2722
2723 if (Reg == AMDGPU::NoRegister) {
2724 report_fatal_error(Twine("invalid register name \""
2725 + StringRef(RegName) + "\"."));
2726
2727 }
2728
2729 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
2730 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
2731 report_fatal_error(Twine("invalid register \""
2732 + StringRef(RegName) + "\" for subtarget."));
2733 }
2734
2735 switch (Reg) {
2736 case AMDGPU::M0:
2737 case AMDGPU::EXEC_LO:
2738 case AMDGPU::EXEC_HI:
2739 case AMDGPU::FLAT_SCR_LO:
2740 case AMDGPU::FLAT_SCR_HI:
2741 if (VT.getSizeInBits() == 32)
2742 return Reg;
2743 break;
2744 case AMDGPU::EXEC:
2745 case AMDGPU::FLAT_SCR:
2746 if (VT.getSizeInBits() == 64)
2747 return Reg;
2748 break;
2749 default:
2750 llvm_unreachable("missing register type checking")::llvm::llvm_unreachable_internal("missing register type checking"
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 2750)
;
2751 }
2752
2753 report_fatal_error(Twine("invalid type for register \""
2754 + StringRef(RegName) + "\"."));
2755}
2756
2757// If kill is not the last instruction, split the block so kill is always a
2758// proper terminator.
2759MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
2760 MachineBasicBlock *BB) const {
2761 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
2762
2763 MachineBasicBlock::iterator SplitPoint(&MI);
2764 ++SplitPoint;
2765
2766 if (SplitPoint == BB->end()) {
2767 // Don't bother with a new block.
2768 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
2769 return BB;
2770 }
2771
2772 MachineFunction *MF = BB->getParent();
2773 MachineBasicBlock *SplitBB
2774 = MF->CreateMachineBasicBlock(BB->getBasicBlock());
2775
2776 MF->insert(++MachineFunction::iterator(BB), SplitBB);
2777 SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
2778
2779 SplitBB->transferSuccessorsAndUpdatePHIs(BB);
2780 BB->addSuccessor(SplitBB);
2781
2782 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
2783 return SplitBB;
2784}
2785
2786// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
2787// wavefront. If the value is uniform and just happens to be in a VGPR, this
2788// will only do one iteration. In the worst case, this will loop 64 times.
2789//
2790// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
2791static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
2792 const SIInstrInfo *TII,
2793 MachineRegisterInfo &MRI,
2794 MachineBasicBlock &OrigBB,
2795 MachineBasicBlock &LoopBB,
2796 const DebugLoc &DL,
2797 const MachineOperand &IdxReg,
2798 unsigned InitReg,
2799 unsigned ResultReg,
2800 unsigned PhiReg,
2801 unsigned InitSaveExecReg,
2802 int Offset,
2803 bool UseGPRIdxMode,
2804 bool IsIndirectSrc) {
2805 MachineBasicBlock::iterator I = LoopBB.begin();
2806
2807 unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2808 unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2809 unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2810 unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2811
2812 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
2813 .addReg(InitReg)
2814 .addMBB(&OrigBB)
2815 .addReg(ResultReg)
2816 .addMBB(&LoopBB);
2817
2818 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
2819 .addReg(InitSaveExecReg)
2820 .addMBB(&OrigBB)
2821 .addReg(NewExec)
2822 .addMBB(&LoopBB);
2823
2824 // Read the next variant <- also loop target.
2825 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
2826 .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
2827
2828 // Compare the just read M0 value to all possible Idx values.
2829 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
2830 .addReg(CurrentIdxReg)
2831 .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
2832
2833 // Update EXEC, save the original EXEC value to VCC.
2834 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
2835 .addReg(CondReg, RegState::Kill);
2836
2837 MRI.setSimpleHint(NewExec, CondReg);
2838
2839 if (UseGPRIdxMode) {
2840 unsigned IdxReg;
2841 if (Offset == 0) {
2842 IdxReg = CurrentIdxReg;
2843 } else {
2844 IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2845 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
2846 .addReg(CurrentIdxReg, RegState::Kill)
2847 .addImm(Offset);
2848 }
2849 unsigned IdxMode = IsIndirectSrc ?
2850 VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
2851 MachineInstr *SetOn =
2852 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2853 .addReg(IdxReg, RegState::Kill)
2854 .addImm(IdxMode);
2855 SetOn->getOperand(3).setIsUndef();
2856 } else {
2857 // Move index from VCC into M0
2858 if (Offset == 0) {
2859 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2860 .addReg(CurrentIdxReg, RegState::Kill);
2861 } else {
2862 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
2863 .addReg(CurrentIdxReg, RegState::Kill)
2864 .addImm(Offset);
2865 }
2866 }
2867
2868 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
2869 MachineInstr *InsertPt =
2870 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
2871 .addReg(AMDGPU::EXEC)
2872 .addReg(NewExec);
2873
2874 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
2875 // s_cbranch_scc0?
2876
2877 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
2878 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
2879 .addMBB(&LoopBB);
2880
2881 return InsertPt->getIterator();
2882}
2883
2884// This has slightly sub-optimal regalloc when the source vector is killed by
2885// the read. The register allocator does not understand that the kill is
2886 // per-workitem, so the source is kept alive for the whole loop and we end up not
2887 // reusing a subregister from it, using 1 more VGPR than necessary. This extra
2888 // register was saved when this was expanded after register allocation.
2889static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
2890 MachineBasicBlock &MBB,
2891 MachineInstr &MI,
2892 unsigned InitResultReg,
2893 unsigned PhiReg,
2894 int Offset,
2895 bool UseGPRIdxMode,
2896 bool IsIndirectSrc) {
2897 MachineFunction *MF = MBB.getParent();
2898 MachineRegisterInfo &MRI = MF->getRegInfo();
2899 const DebugLoc &DL = MI.getDebugLoc();
2900 MachineBasicBlock::iterator I(&MI);
2901
2902 unsigned DstReg = MI.getOperand(0).getReg();
2903 unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
2904 unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
2905
2906 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
2907
2908 // Save the EXEC mask
2909 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
2910 .addReg(AMDGPU::EXEC);
2911
2912 // To insert the loop we need to split the block. Move everything after this
2913 // point to a new block, and insert a new empty block between the two.
2914 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
2915 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
2916 MachineFunction::iterator MBBI(MBB);
2917 ++MBBI;
2918
2919 MF->insert(MBBI, LoopBB);
2920 MF->insert(MBBI, RemainderBB);
2921
2922 LoopBB->addSuccessor(LoopBB);
2923 LoopBB->addSuccessor(RemainderBB);
2924
2925 // Move the rest of the block into a new block.
2926 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
2927 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
2928
2929 MBB.addSuccessor(LoopBB);
2930
2931 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
2932
2933 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
2934 InitResultReg, DstReg, PhiReg, TmpExec,
2935 Offset, UseGPRIdxMode, IsIndirectSrc);
2936
2937 MachineBasicBlock::iterator First = RemainderBB->begin();
2938 BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
2939 .addReg(SaveExec);
2940
2941 return InsPt;
2942}
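
// A rough sketch of the control flow produced by loadM0FromVGPR together with
// emitLoadM0FromVGPRLoop (block and register names here are purely
// illustrative):
//
//   entry:     SaveExec = s_mov_b64 exec        ; save the full mask
//   loop:      cur  = v_readfirstlane_b32 idx   ; pick one candidate index
//              cond = v_cmp_eq_u32 cur, idx     ; lanes using this index
//              s_and_saveexec_b64 ...           ; restrict exec to them
//              m0   = cur (+ offset)            ; or s_set_gpr_idx_on
//              <indexed access inserted at the returned insert point>
//              s_xor_b64 exec, exec, ...        ; retire the handled lanes
//              s_cbranch_execnz loop            ; more unique values left?
//   remainder: exec = s_mov_b64 SaveExec        ; restore the full mask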
2943
2944// Returns subreg index, offset
2945static std::pair<unsigned, int>
2946computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
2947 const TargetRegisterClass *SuperRC,
2948 unsigned VecReg,
2949 int Offset) {
2950 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
2951
2952 // Skip out of bounds offsets, or else we would end up using an undefined
2953 // register.
2954 if (Offset >= NumElts || Offset < 0)
2955 return std::make_pair(AMDGPU::sub0, Offset);
2956
2957 return std::make_pair(AMDGPU::sub0 + Offset, 0);
2958}
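
// Worked example (assuming consecutive subregister indices, as the addition
// above already does): for a 128-bit super-register class, NumElts is
// 128 / 32 = 4, so Offset 2 is folded into the static subregister
// (AMDGPU::sub0 + 2, 0), while the out-of-bounds Offset 5 is left as
// (AMDGPU::sub0, 5) for the dynamic-index path to deal with.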
2959
2960// Return true if the index is an SGPR and was set.
2961static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
2962 MachineRegisterInfo &MRI,
2963 MachineInstr &MI,
2964 int Offset,
2965 bool UseGPRIdxMode,
2966 bool IsIndirectSrc) {
2967 MachineBasicBlock *MBB = MI.getParent();
2968 const DebugLoc &DL = MI.getDebugLoc();
2969 MachineBasicBlock::iterator I(&MI);
2970
2971 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
2972 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
2973
2974 assert(Idx->getReg() != AMDGPU::NoRegister);
2975
2976 if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
2977 return false;
2978
2979 if (UseGPRIdxMode) {
2980 unsigned IdxMode = IsIndirectSrc ?
2981 VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
2982 if (Offset == 0) {
2983 MachineInstr *SetOn =
2984 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2985 .add(*Idx)
2986 .addImm(IdxMode);
2987
2988 SetOn->getOperand(3).setIsUndef();
2989 } else {
2990 unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
2991 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
2992 .add(*Idx)
2993 .addImm(Offset);
2994 MachineInstr *SetOn =
2995 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2996 .addReg(Tmp, RegState::Kill)
2997 .addImm(IdxMode);
2998
2999 SetOn->getOperand(3).setIsUndef();
3000 }
3001
3002 return true;
3003 }
3004
3005 if (Offset == 0) {
3006 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3007 .add(*Idx);
3008 } else {
3009 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
3010 .add(*Idx)
3011 .addImm(Offset);
3012 }
3013
3014 return true;
3015}
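
// In the plain (non gpr-idx-mode) case this reduces to one of two scalar ops:
//   s_mov_b32 m0, idx             ; when Offset == 0
//   s_add_i32 m0, idx, Offset     ; when Offset != 0
// while the gpr-idx-mode path feeds the same index to s_set_gpr_idx_on.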
3016
3017// Control flow needs to be inserted if indexing with a VGPR.
3018static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
3019 MachineBasicBlock &MBB,
3020 const GCNSubtarget &ST) {
3021 const SIInstrInfo *TII = ST.getInstrInfo();
3022 const SIRegisterInfo &TRI = TII->getRegisterInfo();
3023 MachineFunction *MF = MBB.getParent();
3024 MachineRegisterInfo &MRI = MF->getRegInfo();
3025
3026 unsigned Dst = MI.getOperand(0).getReg();
3027 unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
3028 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3029
3030 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
3031
3032 unsigned SubReg;
3033 std::tie(SubReg, Offset)
3034 = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
3035
3036 bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
3037
3038 if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
3039 MachineBasicBlock::iterator I(&MI);
3040 const DebugLoc &DL = MI.getDebugLoc();
3041
3042 if (UseGPRIdxMode) {
3043 // TODO: Look at the uses to avoid the copy. This may require rescheduling
3044 // to avoid interfering with other uses, so probably requires a new
3045 // optimization pass.
3046 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
3047 .addReg(SrcReg, RegState::Undef, SubReg)
3048 .addReg(SrcReg, RegState::Implicit)
3049 .addReg(AMDGPU::M0, RegState::Implicit);
3050 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3051 } else {
3052 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3053 .addReg(SrcReg, RegState::Undef, SubReg)
3054 .addReg(SrcReg, RegState::Implicit);
3055 }
3056
3057 MI.eraseFromParent();
3058
3059 return &MBB;
3060 }
3061
3062 const DebugLoc &DL = MI.getDebugLoc();
3063 MachineBasicBlock::iterator I(&MI);
3064
3065 unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3066 unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3067
3068 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
3069
3070 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg,
3071 Offset, UseGPRIdxMode, true);
3072 MachineBasicBlock *LoopBB = InsPt->getParent();
3073
3074 if (UseGPRIdxMode) {
3075 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
3076 .addReg(SrcReg, RegState::Undef, SubReg)
3077 .addReg(SrcReg, RegState::Implicit)
3078 .addReg(AMDGPU::M0, RegState::Implicit);
3079 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3080 } else {
3081 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3082 .addReg(SrcReg, RegState::Undef, SubReg)
3083 .addReg(SrcReg, RegState::Implicit);
3084 }
3085
3086 MI.eraseFromParent();
3087
3088 return LoopBB;
3089}
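
// For a uniform (SGPR) index without gpr-idx-mode the whole extract therefore
// becomes, roughly:
//   s_mov_b32 m0, idx
//   v_movrels_b32_e32 dst, src:subreg   ; reads m0 and the full src implicitly
// whereas a divergent (VGPR) index falls back to the waterfall loop above.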
3090
3091static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
3092 const TargetRegisterClass *VecRC) {
3093 switch (TRI.getRegSizeInBits(*VecRC)) {
3094 case 32: // 4 bytes
3095 return AMDGPU::V_MOVRELD_B32_V1;
3096 case 64: // 8 bytes
3097 return AMDGPU::V_MOVRELD_B32_V2;
3098 case 128: // 16 bytes
3099 return AMDGPU::V_MOVRELD_B32_V4;
3100 case 256: // 32 bytes
3101 return AMDGPU::V_MOVRELD_B32_V8;
3102 case 512: // 64 bytes
3103 return AMDGPU::V_MOVRELD_B32_V16;
3104 default:
3105 llvm_unreachable("unsupported size for MOVRELD pseudos")::llvm::llvm_unreachable_internal("unsupported size for MOVRELD pseudos"
, "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 3105)
;
3106 }
3107}
3108
3109static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
3110 MachineBasicBlock &MBB,
3111 const GCNSubtarget &ST) {
3112 const SIInstrInfo *TII = ST.getInstrInfo();
3113 const SIRegisterInfo &TRI = TII->getRegisterInfo();
3114 MachineFunction *MF = MBB.getParent();
3115 MachineRegisterInfo &MRI = MF->getRegInfo();
3116
3117 unsigned Dst = MI.getOperand(0).getReg();
3118 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
3119 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3120 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
3121 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3122 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
3123
3124 // This can be an immediate, but will be folded later.
3125 assert(Val->getReg());
3126
3127 unsigned SubReg;
3128 std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
3129 SrcVec->getReg(),
3130 Offset);
3131 bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
3132
3133 if (Idx->getReg() == AMDGPU::NoRegister) {
3134 MachineBasicBlock::iterator I(&MI);
3135 const DebugLoc &DL = MI.getDebugLoc();
3136
3137 assert(Offset == 0);
3138
3139 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
3140 .add(*SrcVec)
3141 .add(*Val)
3142 .addImm(SubReg);
3143
3144 MI.eraseFromParent();
3145 return &MBB;
3146 }
3147
3148 if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
3149 MachineBasicBlock::iterator I(&MI);
3150 const DebugLoc &DL = MI.getDebugLoc();
3151
3152 if (UseGPRIdxMode) {
3153 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
3154 .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
3155 .add(*Val)
3156 .addReg(Dst, RegState::ImplicitDefine)
3157 .addReg(SrcVec->getReg(), RegState::Implicit)
3158 .addReg(AMDGPU::M0, RegState::Implicit);
3159
3160 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3161 } else {
3162 const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
3163
3164 BuildMI(MBB, I, DL, MovRelDesc)
3165 .addReg(Dst, RegState::Define)
3166 .addReg(SrcVec->getReg())
3167 .add(*Val)
3168 .addImm(SubReg - AMDGPU::sub0);
3169 }
3170
3171 MI.eraseFromParent();
3172 return &MBB;
3173 }
3174
3175 if (Val->isReg())
3176 MRI.clearKillFlags(Val->getReg());
3177
3178 const DebugLoc &DL = MI.getDebugLoc();
3179
3180 unsigned PhiReg = MRI.createVirtualRegister(VecRC);
3181
3182 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
3183 Offset, UseGPRIdxMode, false);
3184 MachineBasicBlock *LoopBB = InsPt->getParent();
3185
3186 if (UseGPRIdxMode) {
3187 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
3188 .addReg(PhiReg, RegState::Undef, SubReg) // vdst
3189 .add(*Val) // src0
3190 .addReg(Dst, RegState::ImplicitDefine)
3191 .addReg(PhiReg, RegState::Implicit)
3192 .addReg(AMDGPU::M0, RegState::Implicit);
3193 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3194 } else {
3195 const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
3196
3197 BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
3198 .addReg(Dst, RegState::Define)
3199 .addReg(PhiReg)
3200 .add(*Val)
3201 .addImm(SubReg - AMDGPU::sub0);
3202 }
3203
3204 MI.eraseFromParent();
3205
3206 return LoopBB;
3207}
3208
3209MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
3210 MachineInstr &MI, MachineBasicBlock *BB) const {
3211
3212 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3213 MachineFunction *MF = BB->getParent();
3214 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
3215
3216 if (TII->isMIMG(MI)) {
3217 if (MI.memoperands_empty() && MI.mayLoadOrStore()) {
3218 report_fatal_error("missing mem operand from MIMG instruction");
3219 }
3220 // Add a memoperand for mimg instructions so that they aren't assumed to
3221 // be ordered memory instructions.
3222
3223 return BB;
3224 }
3225
3226 switch (MI.getOpcode()) {
3227 case AMDGPU::S_ADD_U64_PSEUDO:
3228 case AMDGPU::S_SUB_U64_PSEUDO: {
3229 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3230 const DebugLoc &DL = MI.getDebugLoc();
3231
3232 MachineOperand &Dest = MI.getOperand(0);
3233 MachineOperand &Src0 = MI.getOperand(1);
3234 MachineOperand &Src1 = MI.getOperand(2);
3235
3236 unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3237 unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3238
3239 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
3240 Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
3241 &AMDGPU::SReg_32_XM0RegClass);
3242 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
3243 Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
3244 &AMDGPU::SReg_32_XM0RegClass);
3245
3246 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
3247 Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
3248 &AMDGPU::SReg_32_XM0RegClass);
3249 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
3250 Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
3251 &AMDGPU::SReg_32_XM0RegClass);
3252
3253 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
3254
3255 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
3256 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
3257 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
3258 .add(Src0Sub0)
3259 .add(Src1Sub0);
3260 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
3261 .add(Src0Sub1)
3262 .add(Src1Sub1);
3263 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
3264 .addReg(DestSub0)
3265 .addImm(AMDGPU::sub0)
3266 .addReg(DestSub1)
3267 .addImm(AMDGPU::sub1);
3268 MI.eraseFromParent();
3269 return BB;
3270 }
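
// A worked example of this expansion (register names are illustrative):
//   %dst:sreg_64 = S_ADD_U64_PSEUDO %a, %b
// becomes
//   %lo  = S_ADD_U32  %a.sub0, %b.sub0   ; sets SCC to the carry out
//   %hi  = S_ADDC_U32 %a.sub1, %b.sub1   ; consumes SCC as the carry in
//   %dst = REG_SEQUENCE %lo, sub0, %hi, sub1
// with the subtract pseudo using S_SUB_U32 / S_SUBB_U32 instead.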
3271 case AMDGPU::SI_INIT_M0: {
3272 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
3273 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3274 .add(MI.getOperand(0));
3275 MI.eraseFromParent();
3276 return BB;
3277 }
3278 case AMDGPU::SI_INIT_EXEC:
3279 // This should be before all vector instructions.
3280 BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
3281 AMDGPU::EXEC)
3282 .addImm(MI.getOperand(0).getImm());
3283 MI.eraseFromParent();
3284 return BB;
3285
3286 case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
3287 // Extract the thread count from an SGPR input and set EXEC accordingly.
3288 // Since BFM can't shift by 64, handle that case with CMP + CMOV.
3289 //
3290 // S_BFE_U32 count, input, {shift, 7}
3291 // S_BFM_B64 exec, count, 0
3292 // S_CMP_EQ_U32 count, 64
3293 // S_CMOV_B64 exec, -1
3294 MachineInstr *FirstMI = &*BB->begin();
3295 MachineRegisterInfo &MRI = MF->getRegInfo();
3296 unsigned InputReg = MI.getOperand(0).getReg();
3297 unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3298 bool Found = false;
3299
3300 // Move the COPY of the input reg to the beginning, so that we can use it.
3301 for (auto I = BB->begin(); I != &MI; I++) {
3302 if (I->getOpcode() != TargetOpcode::COPY ||
3303 I->getOperand(0).getReg() != InputReg)
3304 continue;
3305
3306 if (I == FirstMI) {
3307 FirstMI = &*++BB->begin();
3308 } else {
3309 I->removeFromParent();
3310 BB->insert(FirstMI, &*I);
3311 }
3312 Found = true;
3313 break;
3314 }
3315 assert(Found);
3316 (void)Found;
3317
3318 // This should be before all vector instructions.
3319 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
3320 .addReg(InputReg)
3321 .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
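// (0x70000, i.e. 7 << 16, supplies the 7-bit field width in the S_BFE_U32
// source-1 immediate, with the masked shift in the low bits; this is the
// "{shift, 7}" operand from the comment above.)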
3322 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
3323 AMDGPU::EXEC)
3324 .addReg(CountReg)
3325 .addImm(0);
3326 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
3327 .addReg(CountReg, RegState::Kill)
3328 .addImm(64);
3329 BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
3330 AMDGPU::EXEC)
3331 .addImm(-1);
3332 MI.eraseFromParent();
3333 return BB;
3334 }
3335
3336 case AMDGPU::GET_GROUPSTATICSIZE: {
3337 DebugLoc DL = MI.getDebugLoc();
3338 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
3339 .add(MI.getOperand(0))
3340 .addImm(MFI->getLDSSize());
3341 MI.eraseFromParent();
3342 return BB;
3343 }
3344 case AMDGPU::SI_INDIRECT_SRC_V1:
3345 case AMDGPU::SI_INDIRECT_SRC_V2:
3346 case AMDGPU::SI_INDIRECT_SRC_V4:
3347 case AMDGPU::SI_INDIRECT_SRC_V8:
3348 case AMDGPU::SI_INDIRECT_SRC_V16:
3349 return emitIndirectSrc(MI, *BB, *getSubtarget());
3350 case AMDGPU::SI_INDIRECT_DST_V1:
3351 case AMDGPU::SI_INDIRECT_DST_V2:
3352 case AMDGPU::SI_INDIRECT_DST_V4:
3353 case AMDGPU::SI_INDIRECT_DST_V8:
3354 case AMDGPU::SI_INDIRECT_DST_V16:
3355 return emitIndirectDst(MI, *BB, *getSubtarget());
3356 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
3357 case AMDGPU::SI_KILL_I1_PSEUDO:
3358 return splitKillBlock(MI, BB);
3359 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
3360 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3361
3362 unsigned Dst = MI.getOperand(0).getReg();
3363 unsigned Src0 = MI.getOperand(1).getReg();
3364 unsigned Src1 = MI.getOperand(2).getReg();
3365 const DebugLoc &DL = MI.getDebugLoc();
3366 unsigned SrcCond = MI.getOperand(3).getReg();
3367
3368 unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3369 unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3370 unsigned SrcCondCopy = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
3371
3372 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
3373 .addReg(SrcCond);
3374 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
3375 .addReg(Src0, 0, AMDGPU::sub0)
3376 .addReg(Src1, 0, AMDGPU::sub0)
3377 .addReg(SrcCondCopy);
3378 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
3379 .addReg(Src0, 0, AMDGPU::sub1)
3380 .addReg(Src1, 0, AMDGPU::sub1)
3381 .addReg(SrcCondCopy);
3382
3383 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
3384 .addReg(DstLo)
3385 .addImm(AMDGPU::sub0)
3386 .addReg(DstHi)
3387 .addImm(AMDGPU::sub1);
3388 MI.eraseFromParent();
3389 return BB;
3390 }
3391 case AMDGPU::SI_BR_UNDEF: {
3392 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3393 const DebugLoc &DL = MI.getDebugLoc();
3394 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
3395 .add(MI.getOperand(0));
3396 Br->getOperand(1).setIsUndef(true); // read undef SCC
3397 MI.eraseFromParent();
3398 return BB;
3399 }
3400 case AMDGPU::ADJCALLSTACKUP:
3401 case AMDGPU::ADJCALLSTACKDOWN: {
3402 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3403 MachineInstrBuilder MIB(*MF, &MI);
3404
3405 // Add an implicit use of the frame offset reg to prevent the restore copy
3406 // inserted after the call from being reordered after stack operations in
3407 // the caller's frame.
3408 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
3409 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit)
3410 .addReg(Info->getFrameOffsetReg(), RegState::Implicit);
3411 return BB;
3412 }
3413 case AMDGPU::SI_CALL_ISEL:
3414 case AMDGPU::SI_TCRETURN_ISEL: {
3415 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3416 const DebugLoc &DL = MI.getDebugLoc();
3417 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
3418
3419 MachineRegisterInfo &MRI = MF->getRegInfo();
3420 unsigned GlobalAddrReg = MI.getOperand(0).getReg();
3421 MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg);
3422 assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET);
3423
3424 const GlobalValue *G = PCRel->getOperand(1).getGlobal();
3425
3426 MachineInstrBuilder MIB;
3427 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
3428 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
3429 .add(MI.getOperand(0))
3430 .addGlobalAddress(G);
3431 } else {
3432 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN))
3433 .add(MI.getOperand(0))
3434 .addGlobalAddress(G);
3435
3436 // There is an additional imm operand for tcreturn, but it should be in the
3437 // right place already.
3438 }
3439
3440 for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
3441 MIB.add(MI.getOperand(I));
3442
3443 MIB.cloneMemRefs(MI);
3444 MI.eraseFromParent();
3445 return BB;
3446 }
3447 default:
3448 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
3449 }
3450}
3451
3452bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const {
3453 return isTypeLegal(VT.getScalarType());
3454}
3455
3456bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
3457 // This currently forces unfolding various combinations of fsub into fma with
3458 // free fneg'd operands. As long as we have fast FMA (controlled by
3459 // isFMAFasterThanFMulAndFAdd), we should perform these.
3460
3461 // When fma is quarter rate, for f64 where add / sub are at best half rate,
3462 // most of these combines appear to be cycle neutral but save on instruction
3463 // count / code size.
3464 return true;
3465}
3466
3467EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
3468 EVT VT) const {
3469 if (!VT.isVector()) {
3470 return MVT::i1;
3471 }
3472 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
3473}
3474
3475MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
3476 // TODO: Should i16 be used always if legal? For now it would force VALU
3477 // shifts.
3478 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
3479}
3480
3481 // Answering this is somewhat tricky and depends on the specific device, as
3482 // different devices have different rates for fma and for f64 operations.
3483//
3484// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
3485// regardless of which device (although the number of cycles differs between
3486// devices), so it is always profitable for f64.
3487//
3488// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
3489// only on full rate devices. Normally, we should prefer selecting v_mad_f32
3490// which we can always do even without fused FP ops since it returns the same
3491// result as the separate operations and since it is always full
3492// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
3493// however does not support denormals, so we do report fma as faster if we have
3494// a fast fma device and require denormals.
3495//
3496bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
3497 VT = VT.getScalarType();
3498
3499 switch (VT.getSimpleVT().SimpleTy) {
3500 case MVT::f32: {
3501 // This is as fast on some subtargets. However, we always have full rate f32
3502 // mad available which returns the same result as the separate operations
3503 // which we should prefer over fma. We can't use this if we want to support
3504 // denormals, so only report this in these cases.
3505 if (Subtarget->hasFP32Denormals())
3506 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
3507
3508 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
3509 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
3510 }
3511 case MVT::f64:
3512 return true;
3513 case MVT::f16:
3514 return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
3515 default:
3516 break;
3517 }
3518
3519 return false;
3520}
3521
3522//===----------------------------------------------------------------------===//
3523// Custom DAG Lowering Operations
3524//===----------------------------------------------------------------------===//
3525
3526// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3527// wider vector type is legal.
3528SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
3529 SelectionDAG &DAG) const {
3530 unsigned Opc = Op.getOpcode();
3531 EVT VT = Op.getValueType();
3532 assert(VT == MVT::v4f16);
3533
3534 SDValue Lo, Hi;
3535 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
3536
3537 SDLoc SL(Op);
3538 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
3539 Op->getFlags());
3540 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
3541 Op->getFlags());
3542
3543 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
3544}
3545
3546// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3547// wider vector type is legal.
3548SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
3549 SelectionDAG &DAG) const {
3550 unsigned Opc = Op.getOpcode();
3551 EVT VT = Op.getValueType();
3552 assert(VT == MVT::v4i16 || VT == MVT::v4f16);
3553
3554 SDValue Lo0, Hi0;
3555 std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
3556 SDValue Lo1, Hi1;
3557 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
3558
3559 SDLoc SL(Op);
3560
3561 SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
3562 Op->getFlags());
3563 SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
3564 Op->getFlags());
3565
3566 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
3567}
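
// E.g. a v4f16 fadd handled here is rewritten as two v2f16 fadds on the low
// and high halves and then reassembled with CONCAT_VECTORS, instead of being
// fully scalarized into four f16 operations by the generic legalizer.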
3568
3569SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
3570 switch (Op.getOpcode()) {
3571 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
3572 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
3573 case ISD::LOAD: {
3574 SDValue Result = LowerLOAD(Op, DAG);
3575 assert((!Result.getNode() ||
3576 Result.getNode()->getNumValues() == 2) &&
3577 "Load should return a value and a chain");
3578 return Result;
3579 }
3580
3581 case ISD::FSIN:
3582 case ISD::FCOS:
3583 return LowerTrig(Op, DAG);
3584 case ISD::SELECT: return LowerSELECT(Op, DAG);
3585 case ISD::FDIV: return LowerFDIV(Op, DAG);
3586 case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
3587 case ISD::STORE: return LowerSTORE(Op, DAG);
3588 case ISD::GlobalAddress: {
3589 MachineFunction &MF = DAG.getMachineFunction();
3590 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3591 return LowerGlobalAddress(MFI, Op, DAG);
3592 }
3593 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
3594 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
3595 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
3596 case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
3597 case ISD::INSERT_VECTOR_ELT:
3598 return lowerINSERT_VECTOR_ELT(Op, DAG);
3599 case ISD::EXTRACT_VECTOR_ELT:
3600 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
3601 case ISD::BUILD_VECTOR:
3602 return lowerBUILD_VECTOR(Op, DAG);
3603 case ISD::FP_ROUND:
3604 return lowerFP_ROUND(Op, DAG);
3605 case ISD::TRAP:
3606 return lowerTRAP(Op, DAG);
3607 case ISD::DEBUGTRAP:
3608 return lowerDEBUGTRAP(Op, DAG);
3609 case ISD::FABS:
3610 case ISD::FNEG:
3611 case ISD::FCANONICALIZE:
3612 return splitUnaryVectorOp(Op, DAG);
3613 case ISD::FMINNUM:
3614 case ISD::FMAXNUM:
3615 return lowerFMINNUM_FMAXNUM(Op, DAG);
3616 case ISD::SHL:
3617 case ISD::SRA:
3618 case ISD::SRL:
3619 case ISD::ADD:
3620 case ISD::SUB:
3621 case ISD::MUL:
3622 case ISD::SMIN:
3623 case ISD::SMAX:
3624 case ISD::UMIN:
3625 case ISD::UMAX:
3626 case ISD::FADD:
3627 case ISD::FMUL:
3628 case ISD::FMINNUM_IEEE:
3629 case ISD::FMAXNUM_IEEE:
3630 return splitBinaryVectorOp(Op, DAG);
3631 }
3632 return SDValue();
3633}
3634
3635static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
3636 const SDLoc &DL,
3637 SelectionDAG &DAG, bool Unpacked) {
3638 if (!LoadVT.isVector())
3639 return Result;
3640
3641 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
3642 // Truncate to v2i16/v4i16.
3643 EVT IntLoadVT = LoadVT.changeTypeToInteger();
3644
3645 // Work around the legalizer not scalarizing the truncate after vector op
3646 // legalization by not creating an intermediate vector trunc.
3647 SmallVector<SDValue, 4> Elts;
3648 DAG.ExtractVectorElements(Result, Elts);
3649 for (SDValue &Elt : Elts)
3650 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
3651
3652 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
3653
3654 // Bitcast to original type (v2f16/v4f16).
3655 return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
3656 }
3657
3658 // Cast back to the original packed type.
3659 return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
3660}
3661
3662SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
3663 MemSDNode *M,
3664 SelectionDAG &DAG,
3665 ArrayRef<SDValue> Ops,
3666 bool IsIntrinsic) const {
3667 SDLoc DL(M);
3668
3669 bool Unpacked = Subtarget->hasUnpackedD16VMem();
3670 EVT LoadVT = M->getValueType(0);
3671
3672 EVT EquivLoadVT = LoadVT;
3673 if (Unpacked && LoadVT.isVector()) {
3674 EquivLoadVT = LoadVT.isVector() ?
3675 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3676 LoadVT.getVectorNumElements()) : LoadVT;
3677 }
3678
3679 // Change from v4f16/v2f16 to EquivLoadVT.
3680 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
3681
3682 SDValue Load
3683 = DAG.getMemIntrinsicNode(
3684 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
3685 VTList, Ops, M->getMemoryVT(),
3686 M->getMemOperand());
3687 if (!Unpacked) // Just adjusted the opcode.
3688 return Load;
3689
3690 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
3691
3692 return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
3693}
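
// Example of the unpacked-D16 path above: a d16 memory intrinsic producing
// v4f16 on a subtarget with unpacked D16 accesses is issued with a v4i32
// result (one 16-bit value per dword); each element is then truncated to i16,
// the pieces are rebuilt into v4i16, and the result is bitcast back to v4f16
// before being merged with the original chain.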
3694
3695static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
3696 SDNode *N, SelectionDAG &DAG) {
3697 EVT VT = N->getValueType(0);
3698 const auto *CD = dyn_cast<ConstantSDNode>(N->getOperand(3));
3699 if (!CD)
3700 return DAG.getUNDEF(VT);
3701
3702 int CondCode = CD->getSExtValue();
3703 if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
3704 CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
3705 return DAG.getUNDEF(VT);
3706
3707 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
3708
3709
3710 SDValue LHS = N->getOperand(1);
3711 SDValue RHS = N->getOperand(2);
3712
3713 SDLoc DL(N);
3714
3715 EVT CmpVT = LHS.getValueType();
3716 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
3717 unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
3718 ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3719 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
3720 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
3721 }
3722
3723 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
3724
3725 return DAG.getNode(AMDGPUISD::SETCC, DL, VT, LHS, RHS,
3726 DAG.getCondCode(CCOpcode));
3727}
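
// For instance (using the usual LLVM predicate numbering, where ICMP_EQ is
// the first integer predicate, 32): llvm.amdgcn.icmp(%a, %b, 32) selects
// ICmpInst::ICMP_EQ, which maps to ISD::SETEQ and is emitted as
// AMDGPUISD::SETCC %a, %b, seteq; an out-of-range predicate operand simply
// yields undef.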
3728
3729static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
3730 SDNode *N, SelectionDAG &DAG) {
3731 EVT VT = N->getValueType(0);
3732 const auto *CD = dyn_cast<ConstantSDNode>(N->getOperand(3));
3733 if (!CD)
3734 return DAG.getUNDEF(VT);
3735
3736 int CondCode = CD->getSExtValue();
3737 if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
3738 CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE) {
3739 return DAG.getUNDEF(VT);
3740 }
3741
3742 SDValue Src0 = N->getOperand(1);
3743 SDValue Src1 = N->getOperand(2);
3744 EVT CmpVT = Src0.getValueType();
3745 SDLoc SL(N);
3746
3747 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
3748 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
3749 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
3750 }
3751
3752 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
3753 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
3754 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src0,
3755 Src1, DAG.getCondCode(CCOpcode));
3756}
3757
3758void SITargetLowering::ReplaceNodeResults(SDNode *N,
3759 SmallVectorImpl<SDValue> &Results,
3760 SelectionDAG &DAG) const {
3761 switch (N->getOpcode()) {
3762 case ISD::INSERT_VECTOR_ELT: {
3763 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
3764 Results.push_back(Res);
3765 return;
3766 }
3767 case ISD::EXTRACT_VECTOR_ELT: {
3768 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
3769 Results.push_back(Res);
3770 return;
3771 }
3772 case ISD::INTRINSIC_WO_CHAIN: {
3773 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3774 switch (IID) {
3775 case Intrinsic::amdgcn_cvt_pkrtz: {
3776 SDValue Src0 = N->getOperand(1);
3777 SDValue Src1 = N->getOperand(2);
3778 SDLoc SL(N);
3779 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
3780 Src0, Src1);
3781 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
3782 return;
3783 }
3784 case Intrinsic::amdgcn_cvt_pknorm_i16:
3785 case Intrinsic::amdgcn_cvt_pknorm_u16:
3786 case Intrinsic::amdgcn_cvt_pk_i16:
3787 case Intrinsic::amdgcn_cvt_pk_u16: {
3788 SDValue Src0 = N->getOperand(1);
3789 SDValue Src1 = N->getOperand(2);
3790 SDLoc SL(N);
3791 unsigned Opcode;
3792
3793 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
3794 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
3795 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
3796 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
3797 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
3798 Opcode = AMDGPUISD::CVT_PK_I16_I32;
3799 else
3800 Opcode = AMDGPUISD::CVT_PK_U16_U32;
3801
3802 EVT VT = N->getValueType(0);
3803 if (isTypeLegal(VT))
3804 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
3805 else {
3806 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
3807 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
3808 }
3809 return;
3810 }
3811 }
3812 break;
3813 }
3814 case ISD::INTRINSIC_W_CHAIN: {
3815 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
3816 Results.push_back(Res);
3817 Results.push_back(Res.getValue(1));
3818 return;
3819 }
3820
3821 break;
3822 }
3823 case ISD::SELECT: {
3824 SDLoc SL(N);
3825 EVT VT = N->getValueType(0);
3826 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3827 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
3828 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
3829
3830 EVT SelectVT = NewVT;
3831 if (NewVT.bitsLT(MVT::i32)) {
3832 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
3833 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
3834 SelectVT = MVT::i32;
3835 }
3836
3837 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
3838 N->getOperand(0), LHS, RHS);
3839
3840 if (NewVT != SelectVT)
3841 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
3842 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
3843 return;
3844 }
3845 case ISD::FNEG: {
3846 if (N->getValueType(0) != MVT::v2f16)
3847 break;
3848
3849 SDLoc SL(N);
3850 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
3851
3852 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
3853 BC,
3854 DAG.getConstant(0x80008000, SL, MVT::i32));
3855 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
3856 return;
3857 }
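// (XORing with 0x80008000 flips the sign bit of each f16 lane, so e.g.
// <1.0, -2.0> becomes <-1.0, 2.0> in a single 32-bit integer op.)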
3858 case ISD::FABS: {
3859 if (N->getValueType(0) != MVT::v2f16)
3860 break;
3861
3862 SDLoc SL(N);
3863 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
3864
3865 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
3866 BC,
3867 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
3868 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
3869 return;
3870 }
3871 default:
3872 break;
3873 }
3874}
3875
3876/// Helper function for LowerBRCOND
3877static SDNode *findUser(SDValue Value, unsigned Opcode) {
3878
3879 SDNode *Parent = Value.getNode();
3880 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
3881 I != E; ++I) {
3882
3883 if (I.getUse().get() != Value)
3884 continue;
3885
3886 if (I->getOpcode() == Opcode)
3887 return *I;
3888 }
3889 return nullptr;
3890}
3891
3892unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
3893 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
3894 switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
3895 case Intrinsic::amdgcn_if:
3896 return AMDGPUISD::IF;
3897 case Intrinsic::amdgcn_else:
3898 return AMDGPUISD::ELSE;
3899 case Intrinsic::amdgcn_loop:
3900 return AMDGPUISD::LOOP;
3901 case Intrinsic::amdgcn_end_cf:
3902 llvm_unreachable("should not occur")::llvm::llvm_unreachable_internal("should not occur", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 3902)
;
3903 default:
3904 return 0;
3905 }
3906 }
3907
3908 // break, if_break, else_break are all only used as inputs to loop, not
3909 // directly as branch conditions.
3910 return 0;
3911}
3912
3913void SITargetLowering::createDebuggerPrologueStackObjects(
3914 MachineFunction &MF) const {
3915 // Create stack objects that are used for emitting debugger prologue.
3916 //
3917 // Debugger prologue writes work group IDs and work item IDs to scratch memory
3918 // at a fixed location in the following format:
3919 // offset 0: work group ID x
3920 // offset 4: work group ID y
3921 // offset 8: work group ID z
3922 // offset 16: work item ID x
3923 // offset 20: work item ID y
3924 // offset 24: work item ID z
3925 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3926 int ObjectIdx = 0;
3927
3928 // For each dimension:
3929 for (unsigned i = 0; i < 3; ++i) {
3930 // Create fixed stack object for work group ID.
3931 ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4, true);
3932 Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
3933 // Create fixed stack object for work item ID.
3934 ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4 + 16, true);
3935 Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
3936 }
3937}
3938
3939bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
3940 const Triple &TT = getTargetMachine().getTargetTriple();
3941 return (GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3942 GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
3943 AMDGPU::shouldEmitConstantsToTextSection(TT);
3944}
3945
3946bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
3947 return (GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
3948 GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3949 GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
3950 !shouldEmitFixup(GV) &&
3951 !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
3952}
3953
3954bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
3955 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
3956}
3957
3958/// This transforms the control flow intrinsics to get the branch destination as
3959 /// the last parameter; it also switches the branch target with BR if the need arises.
3960SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
3961 SelectionDAG &DAG) const {
3962 SDLoc DL(BRCOND);
3963
3964 SDNode *Intr = BRCOND.getOperand(1).getNode();
3965 SDValue Target = BRCOND.getOperand(2);
3966 SDNode *BR = nullptr;
3967 SDNode *SetCC = nullptr;
3968
3969 if (Intr->getOpcode() == ISD::SETCC) {
3970 // As long as we negate the condition everything is fine
3971 SetCC = Intr;
3972 Intr = SetCC->getOperand(0).getNode();
3973
3974 } else {
3975 // Get the target from BR if we don't negate the condition
3976 BR = findUser(BRCOND, ISD::BR);
3977 Target = BR->getOperand(1);
3978 }
3979
3980 // FIXME: This changes the types of the intrinsics instead of introducing new
3981 // nodes with the correct types.
3982 // e.g. llvm.amdgcn.loop
3983
3984 // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
3985 // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
3986
3987 unsigned CFNode = isCFIntrinsic(Intr);
3988 if (CFNode == 0) {
3989 // This is a uniform branch so we don't need to legalize.
3990 return BRCOND;
3991 }
3992
3993 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
3994 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
3995
3996 assert(!SetCC ||
3997 (SetCC->getConstantOperandVal(1) == 1 &&
3998 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
3999 ISD::SETNE));
4000
4001 // operands of the new intrinsic call
4002 SmallVector<SDValue, 4> Ops;
4003 if (HaveChain)
4004 Ops.push_back(BRCOND.getOperand(0));
4005
4006 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
4007 Ops.push_back(Target);
4008
4009 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
4010
4011 // build the new intrinsic call
4012 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
4013
4014 if (!HaveChain) {
4015 SDValue Ops[] = {
4016 SDValue(Result, 0),
4017 BRCOND.getOperand(0)
4018 };
4019
4020 Result = DAG.getMergeValues(Ops, DL).getNode();
4021 }
4022
4023 if (BR) {
4024 // Give the branch instruction our target
4025 SDValue Ops[] = {
4026 BR->getOperand(0),
4027 BRCOND.getOperand(2)
4028 };
4029 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
4030 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
4031 BR = NewBR.getNode();
4032 }
4033
4034 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
4035
4036 // Copy the intrinsic results to registers
4037 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
4038 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
4039 if (!CopyToReg)
4040 continue;
4041
4042 Chain = DAG.getCopyToReg(
4043 Chain, DL,
4044 CopyToReg->getOperand(1),
4045 SDValue(Result, i - 1),
4046 SDValue());
4047
4048 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
4049 }
4050
4051 // Remove the old intrinsic from the chain
4052 DAG.ReplaceAllUsesOfValueWith(
4053 SDValue(Intr, Intr->getNumValues() - 1),
4054 Intr->getOperand(0));
4055
4056 return Chain;
4057}
4058
4059SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
4060 SDValue Op,
4061 const SDLoc &DL,
4062 EVT VT) const {
4063 return Op.getValueType().bitsLE(VT) ?
4064 DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
4065 DAG.getNode(ISD::FTRUNC, DL, VT, Op);
4066}
4067
4068SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
4069 assert(Op.getValueType() == MVT::f16 &&
4070 "Do not know how to custom lower FP_ROUND for non-f16 type");
4071
4072 SDValue Src = Op.getOperand(0);
4073 EVT SrcVT = Src.getValueType();
4074 if (SrcVT != MVT::f64)
4075 return Op;
4076
4077 SDLoc DL(Op);
4078
4079 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
4080 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
4081 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
4082}
4083
4084SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
4085 SelectionDAG &DAG) const {
4086 EVT VT = Op.getValueType();
4087 bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
4088
4089 // FIXME: Assert during selection that this is only selected for
4090 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
4091 // mode functions, but this happens to be OK since it's only done in cases
4092 // where there is known no sNaN.
4093 if (IsIEEEMode)
4094 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
4095
4096 if (VT == MVT::v4f16)
4097 return splitBinaryVectorOp(Op, DAG);
4098 return Op;
4099}
4100
4101SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
4102 SDLoc SL(Op);
4103 SDValue Chain = Op.getOperand(0);
4104
4105 if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4106 !Subtarget->isTrapHandlerEnabled())
4107 return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
4108
4109 MachineFunction &MF = DAG.getMachineFunction();
4110 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4111 unsigned UserSGPR = Info->getQueuePtrUserSGPR();
4112 assert(UserSGPR != AMDGPU::NoRegister);
4113 SDValue QueuePtr = CreateLiveInRegister(
4114 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
4115 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
4116 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
4117 QueuePtr, SDValue());
4118 SDValue Ops[] = {
4119 ToReg,
4120 DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16),
4121 SGPR01,
4122 ToReg.getValue(1)
4123 };
4124 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
4125}
4126
4127SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
4128 SDLoc SL(Op);
4129 SDValue Chain = Op.getOperand(0);
4130 MachineFunction &MF = DAG.getMachineFunction();
4131
4132 if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4133 !Subtarget->isTrapHandlerEnabled()) {
4134 DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
4135 "debugtrap handler not supported",
4136 Op.getDebugLoc(),
4137 DS_Warning);
4138 LLVMContext &Ctx = MF.getFunction().getContext();
4139 Ctx.diagnose(NoTrap);
4140 return Chain;
4141 }
4142
4143 SDValue Ops[] = {
4144 Chain,
4145 DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16)
4146 };
4147 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
4148}
4149
4150SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
4151 SelectionDAG &DAG) const {
4152 // FIXME: Use inline constants (src_{shared, private}_base) instead.
4153 if (Subtarget->hasApertureRegs()) {
4154 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
4155 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
4156 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
4157 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
4158 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
4159 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
4160 unsigned Encoding =
4161 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
4162 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
4163 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
4164
4165 SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
4166 SDValue ApertureReg = SDValue(
4167 DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
4168 SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
4169 return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
4170 }
4171
4172 MachineFunction &MF = DAG.getMachineFunction();
4173 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4174 unsigned UserSGPR = Info->getQueuePtrUserSGPR();
4175  assert(UserSGPR != AMDGPU::NoRegister);
4176
4177 SDValue QueuePtr = CreateLiveInRegister(
4178 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
4179
4180 // Offset into amd_queue_t for group_segment_aperture_base_hi /
4181 // private_segment_aperture_base_hi.
4182 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
4183
4184 SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset);
4185
4186 // TODO: Use custom target PseudoSourceValue.
4187 // TODO: We should use the value from the IR intrinsic call, but it might not
4188 // be available and how do we get it?
4189 Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
4190 AMDGPUAS::CONSTANT_ADDRESS));
4191
4192 MachinePointerInfo PtrInfo(V, StructOffset);
4193 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
4194 MinAlign(64, StructOffset),
4195 MachineMemOperand::MODereferenceable |
4196 MachineMemOperand::MOInvariant);
4197}
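The S_GETREG_B32 immediate built above packs a hardware register id, a bit offset and a width-minus-one field into one value, and the field read back is then shifted into position to form the 32-bit aperture base. A minimal standalone sketch of that arithmetic, with the shift amounts taken as parameters rather than the target's actual Hwreg constants:

#include <cstdint>

// Pack an s_getreg_b32 immediate from a register id, a bit offset and a
// field width minus one. The shift amounts mirror AMDGPU::Hwreg::ID_SHIFT_,
// OFFSET_SHIFT_ and WIDTH_M1_SHIFT_ but are passed in here; only the
// packing scheme itself is illustrated.
constexpr unsigned packGetregImm(unsigned Id, unsigned Offset, unsigned WidthM1,
                                 unsigned IdShift, unsigned OffsetShift,
                                 unsigned WidthM1Shift) {
  return (Id << IdShift) | (Offset << OffsetShift) | (WidthM1 << WidthM1Shift);
}

// The aperture base occupies the upper bits of the full address, so the
// field read back by s_getreg is shifted left by the field width
// (WidthM1 + 1), matching the trailing ISD::SHL above.
constexpr uint32_t apertureFromField(uint32_t Field, unsigned WidthM1) {
  return Field << (WidthM1 + 1);
}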
4198
4199SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
4200 SelectionDAG &DAG) const {
4201 SDLoc SL(Op);
4202 const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
4203
4204 SDValue Src = ASC->getOperand(0);
4205 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
4206
4207 const AMDGPUTargetMachine &TM =
4208 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
4209
4210 // flat -> local/private
4211 if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
4212 unsigned DestAS = ASC->getDestAddressSpace();
4213
4214 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
4215 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
4216 unsigned NullVal = TM.getNullPointerValue(DestAS);
4217 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
4218 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
4219 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
4220
4221 return DAG.getNode(ISD::SELECT, SL, MVT::i32,
4222 NonNull, Ptr, SegmentNullPtr);
4223 }
4224 }
4225
4226 // local/private -> flat
4227 if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
4228 unsigned SrcAS = ASC->getSrcAddressSpace();
4229
4230 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
4231 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
4232 unsigned NullVal = TM.getNullPointerValue(SrcAS);
4233 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
4234
4235 SDValue NonNull
4236 = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
4237
4238 SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
4239 SDValue CvtPtr
4240 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
4241
4242 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
4243 DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
4244 FlatNullPtr);
4245 }
4246 }
4247
4248 // global <-> flat are no-ops and never emitted.
4249
4250 const MachineFunction &MF = DAG.getMachineFunction();
4251 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
4252 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
4253 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
4254
4255 return DAG.getUNDEF(ASC->getValueType(0));
4256}
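In scalar terms, the two cast directions above are a null check plus either a truncate or an aperture-based widen. A minimal sketch under the same assumptions the lowering makes (flat null is 0, segment null comes from the target machine; the names here are illustrative):

#include <cstdint>

// flat -> local/private: a non-null flat pointer keeps its low 32 bits,
// the flat null pointer maps to the segment's null value.
uint32_t flatToSegment(uint64_t Flat, uint32_t SegmentNull) {
  return Flat != 0 ? static_cast<uint32_t>(Flat) : SegmentNull;
}

// local/private -> flat: a non-null segment pointer is widened by placing
// the 32-bit aperture base in the high half (the BUILD_VECTOR + BITCAST
// pair in the DAG code); the segment null maps to the flat null (0).
uint64_t segmentToFlat(uint32_t Seg, uint32_t SegmentNull, uint32_t ApertureHi) {
  if (Seg == SegmentNull)
    return 0;
  return (static_cast<uint64_t>(ApertureHi) << 32) | Seg;
}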
4257
4258SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
4259 SelectionDAG &DAG) const {
4260 SDValue Vec = Op.getOperand(0);
4261 SDValue InsVal = Op.getOperand(1);
4262 SDValue Idx = Op.getOperand(2);
4263 EVT VecVT = Vec.getValueType();
4264 EVT EltVT = VecVT.getVectorElementType();
4265 unsigned VecSize = VecVT.getSizeInBits();
4266 unsigned EltSize = EltVT.getSizeInBits();
4267
4268
4269  assert(VecSize <= 64);
4270
4271 unsigned NumElts = VecVT.getVectorNumElements();
4272 SDLoc SL(Op);
4273 auto KIdx = dyn_cast<ConstantSDNode>(Idx);
4274
4275 if (NumElts == 4 && EltSize == 16 && KIdx) {
4276 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
4277
4278 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
4279 DAG.getConstant(0, SL, MVT::i32));
4280 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
4281 DAG.getConstant(1, SL, MVT::i32));
4282
4283 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
4284 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
4285
4286 unsigned Idx = KIdx->getZExtValue();
4287 bool InsertLo = Idx < 2;
4288 SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
4289 InsertLo ? LoVec : HiVec,
4290 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
4291 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
4292
4293 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
4294
4295 SDValue Concat = InsertLo ?
4296 DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
4297 DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
4298
4299 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
4300 }
4301
4302 if (isa<ConstantSDNode>(Idx))
4303 return SDValue();
4304
4305 MVT IntVT = MVT::getIntegerVT(VecSize);
4306
4307 // Avoid stack access for dynamic indexing.
4308 SDValue Val = InsVal;
4309 if (InsVal.getValueType() == MVT::f16)
4310 Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal);
4311
4312 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
4313 SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val);
4314
4315  assert(isPowerOf2_32(EltSize));
4316 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
4317
4318 // Convert vector index to bit-index.
4319 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
4320
4321 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
4322 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
4323 DAG.getConstant(0xffff, SL, IntVT),
4324 ScaledIdx);
4325
4326 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
4327 SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
4328 DAG.getNOT(SL, BFM, IntVT), BCVec);
4329
4330 SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
4331 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
4332}
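The BFM/BFI sequence for the dynamic case targets the usual bitfield-insert identity, applied to the vector viewed as one integer. A standalone scalar sketch of that identity for 16-bit elements (an illustration of the identity, not the DAG code itself):

#include <cstdint>

// Insert a 16-bit element at a dynamic index into a vector held in a
// 64-bit integer: build a field mask at the scaled bit index, keep the
// other bits of the old value, and OR in the new element.
uint64_t insertElt16(uint64_t VecBits, uint16_t Elt, unsigned Idx) {
  unsigned BitIdx = Idx << 4;                  // vector index -> bit index
  uint64_t Mask = uint64_t(0xffff) << BitIdx;  // v_bfm-style field mask
  uint64_t New  = uint64_t(Elt) << BitIdx;     // element moved into place
  return (VecBits & ~Mask) | (New & Mask);     // v_bfi-style merge
}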
4333
4334SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
4335 SelectionDAG &DAG) const {
4336 SDLoc SL(Op);
4337
4338 EVT ResultVT = Op.getValueType();
4339 SDValue Vec = Op.getOperand(0);
4340 SDValue Idx = Op.getOperand(1);
4341 EVT VecVT = Vec.getValueType();
4342 unsigned VecSize = VecVT.getSizeInBits();
4343 EVT EltVT = VecVT.getVectorElementType();
4344  assert(VecSize <= 64);
4345
4346 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
4347
4348 // Make sure we do any optimizations that will make it easier to fold
4349 // source modifiers before obscuring it with bit operations.
4350
4351 // XXX - Why doesn't this get called when vector_shuffle is expanded?
4352 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
4353 return Combined;
4354
4355 unsigned EltSize = EltVT.getSizeInBits();
4356  assert(isPowerOf2_32(EltSize));
4357
4358 MVT IntVT = MVT::getIntegerVT(VecSize);
4359 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
4360
4361 // Convert vector index to bit-index (* EltSize)
4362 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
4363
4364 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
4365 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
4366
4367 if (ResultVT == MVT::f16) {
4368 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
4369 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
4370 }
4371
4372 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
4373}
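The dynamic extract is the mirror image: shift the whole vector right by the scaled index, then truncate to the element width. A minimal scalar sketch for 16-bit elements:

#include <cstdint>

// Extract the 16-bit element at a dynamic index from a vector held in a
// 64-bit integer: convert the vector index to a bit index (SHL by
// Log2(EltSize) above), shift right, and truncate.
uint16_t extractElt16(uint64_t VecBits, unsigned Idx) {
  unsigned BitIdx = Idx << 4;  // Idx * 16
  return static_cast<uint16_t>(VecBits >> BitIdx);
}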
4374
4375SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
4376 SelectionDAG &DAG) const {
4377 SDLoc SL(Op);
4378 EVT VT = Op.getValueType();
4379
4380 if (VT == MVT::v4i16 || VT == MVT::v4f16) {
4381 EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
4382
4383 // Turn into pair of packed build_vectors.
4384 // TODO: Special case for constants that can be materialized with s_mov_b64.
4385 SDValue Lo = DAG.getBuildVector(HalfVT, SL,
4386 { Op.getOperand(0), Op.getOperand(1) });
4387 SDValue Hi = DAG.getBuildVector(HalfVT, SL,
4388 { Op.getOperand(2), Op.getOperand(3) });
4389
4390 SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
4391 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);
4392
4393 SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
4394 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
4395 }
4396
4397  assert(VT == MVT::v2f16 || VT == MVT::v2i16);
4398  assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
4399
4400 SDValue Lo = Op.getOperand(0);
4401 SDValue Hi = Op.getOperand(1);
4402
4403 // Avoid adding defined bits with the zero_extend.
4404 if (Hi.isUndef()) {
4405 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
4406 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
4407 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
4408 }
4409
4410 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
4411 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
4412
4413 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
4414 DAG.getConstant(16, SL, MVT::i32));
4415 if (Lo.isUndef())
4416 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
4417
4418 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
4419 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
4420
4421 SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
4422 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
4423}
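Without VOP3P instructions, a v2i16/v2f16 build_vector is simply the two halves packed into one 32-bit word, which is what the zero-extend / shift / or sequence above produces. A scalar sketch:

#include <cstdint>

// Pack two 16-bit lanes into a 32-bit word: lane 0 in the low half,
// lane 1 in the high half (the ZERO_EXTEND + SHL 16 + OR above).
uint32_t packV2I16(uint16_t Lo, uint16_t Hi) {
  return uint32_t(Lo) | (uint32_t(Hi) << 16);
}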
4424
4425bool
4426SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
4427 // We can fold offsets for anything that doesn't require a GOT relocation.
4428 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
4429 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4430 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
4431 !shouldEmitGOTReloc(GA->getGlobal());
4432}
4433
4434static SDValue
4435buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
4436 const SDLoc &DL, unsigned Offset, EVT PtrVT,
4437 unsigned GAFlags = SIInstrInfo::MO_NONE) {
4438 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
4439 // lowered to the following code sequence:
4440 //
4441 // For constant address space:
4442 // s_getpc_b64 s[0:1]
4443 // s_add_u32 s0, s0, $symbol
4444 // s_addc_u32 s1, s1, 0
4445 //
4446 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
4447 // a fixup or relocation is emitted to replace $symbol with a literal
4448 // constant, which is a pc-relative offset from the encoding of the $symbol
4449 // operand to the global variable.
4450 //
4451 // For global address space:
4452 // s_getpc_b64 s[0:1]
4453 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
4454 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
4455 //
4456 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
4457 // fixups or relocations are emitted to replace $symbol@*@lo and
4458 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
4459 // which is a 64-bit pc-relative offset from the encoding of the $symbol
4460 // operand to the global variable.
4461 //
4462 // What we want here is an offset from the value returned by s_getpc
4463 // (which is the address of the s_add_u32 instruction) to the global
4464 // variable, but since the encoding of $symbol starts 4 bytes after the start
4465 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
4466 // small. This requires us to add 4 to the global variable offset in order to
4467 // compute the correct address.
4468 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
4469 GAFlags);
4470 SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
4471 GAFlags == SIInstrInfo::MO_NONE ?
4472 GAFlags : GAFlags + 1);
4473 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
4474}
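The "+ 4" can be checked with a little address arithmetic. If s_getpc returns P (the address of the s_add_u32) and the $symbol literal is encoded at P + 4, a PC-relative fixup with addend A resolves to GV + A - (P + 4); choosing A = Offset + 4 makes the final sum equal GV + Offset. A sketch of that arithmetic, with plain integers standing in for addresses and the standard PC-relative relocation formula assumed rather than shown in this file:

#include <cstdint>

// P      : value returned by s_getpc_b64 (address of the s_add_u32).
// GV     : address of the global variable.
// Offset : requested offset into the global.
// The literal operand sits 4 bytes after P, and a PC-relative relocation
// with addend A resolves to GV + A - (P + 4).
uint64_t resolvedAddress(uint64_t P, uint64_t GV, uint64_t Offset) {
  uint64_t Addend = Offset + 4;              // what the lowering emits
  uint64_t Literal = GV + Addend - (P + 4);  // value patched into $symbol
  return P + Literal;                        // = GV + Offset for any P
}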
4475
4476SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
4477 SDValue Op,
4478 SelectionDAG &DAG) const {
4479 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
4480 const GlobalValue *GV = GSD->getGlobal();
4481 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
4482 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
4483 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
4484 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
4485
4486 SDLoc DL(GSD);
4487 EVT PtrVT = Op.getValueType();
4488
4489 // FIXME: Should not make address space based decisions here.
4490 if (shouldEmitFixup(GV))
4491 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
4492 else if (shouldEmitPCReloc(GV))
4493 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
4494 SIInstrInfo::MO_REL32);
4495
4496 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
4497 SIInstrInfo::MO_GOTPCREL32);
4498
4499 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
4500 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
4501 const DataLayout &DataLayout = DAG.getDataLayout();
4502 unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
4503 MachinePointerInfo PtrInfo
4504 = MachinePointerInfo::getGOT(DAG.getMachineFunction());
4505
4506 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
4507 MachineMemOperand::MODereferenceable |
4508 MachineMemOperand::MOInvariant);
4509}
4510
4511SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
4512 const SDLoc &DL, SDValue V) const {
4513 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
4514 // the destination register.
4515 //
4516 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
4517 // so we will end up with redundant moves to m0.
4518 //
4519 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
4520
4521 // A Null SDValue creates a glue result.
4522 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
4523 V, Chain);
4524 return SDValue(M0, 0);
4525}
4526
4527SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
4528 SDValue Op,
4529 MVT VT,
4530 unsigned Offset) const {
4531 SDLoc SL(Op);
4532 SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
4533 DAG.getEntryNode(), Offset, 4, false);
4534 // The local size values will have the hi 16-bits as zero.
4535 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
4536 DAG.getValueType(VT));
4537}
4538
4539static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
4540 EVT VT) {
4541 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
4542 "non-hsa intrinsic with hsa target",
4543 DL.getDebugLoc());
4544 DAG.getContext()->diagnose(BadIntrin);
4545 return DAG.getUNDEF(VT);
4546}
4547
4548static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
4549 EVT VT) {
4550 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
4551 "intrinsic not supported on subtarget",
4552 DL.getDebugLoc());
4553 DAG.getContext()->diagnose(BadIntrin);
4554 return DAG.getUNDEF(VT);
4555}
4556
4557static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
4558 ArrayRef<SDValue> Elts) {
4559  assert(!Elts.empty());
4560 MVT Type;
4561 unsigned NumElts;
4562
4563 if (Elts.size() == 1) {
4564 Type = MVT::f32;
4565 NumElts = 1;
4566 } else if (Elts.size() == 2) {
4567 Type = MVT::v2f32;
4568 NumElts = 2;
4569 } else if (Elts.size() <= 4) {
4570 Type = MVT::v4f32;
4571 NumElts = 4;
4572 } else if (Elts.size() <= 8) {
4573 Type = MVT::v8f32;
4574 NumElts = 8;
4575 } else {
4576    assert(Elts.size() <= 16);
4577 Type = MVT::v16f32;
4578 NumElts = 16;
4579 }
4580
4581 SmallVector<SDValue, 16> VecElts(NumElts);
4582 for (unsigned i = 0; i < Elts.size(); ++i) {
4583 SDValue Elt = Elts[i];
4584 if (Elt.getValueType() != MVT::f32)
4585 Elt = DAG.getBitcast(MVT::f32, Elt);
4586 VecElts[i] = Elt;
4587 }
4588 for (unsigned i = Elts.size(); i < NumElts; ++i)
4589 VecElts[i] = DAG.getUNDEF(MVT::f32);
4590
4591 if (NumElts == 1)
4592 return VecElts[0];
4593 return DAG.getBuildVector(Type, DL, VecElts);
4594}
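getBuildDwordsVector rounds the operand count up to the nearest MIMG-friendly vector width (1, 2, 4, 8 or 16 dwords) and pads the tail with undef. A small standalone sketch of just the size selection:

// Round an element count up to the vector widths used above
// (1, 2, 4, 8, 16); the caller asserts the count never exceeds 16.
unsigned roundToDwordVectorSize(unsigned NumElts) {
  if (NumElts <= 2)
    return NumElts;  // scalar f32 or v2f32
  if (NumElts <= 4)
    return 4;        // v4f32
  if (NumElts <= 8)
    return 8;        // v8f32
  return 16;         // v16f32
}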
4595
4596static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
4597 SDValue *GLC, SDValue *SLC) {
4598 auto CachePolicyConst = dyn_cast<ConstantSDNode>(CachePolicy.getNode());
4599 if (!CachePolicyConst)
4600 return false;
4601
4602 uint64_t Value = CachePolicyConst->getZExtValue();
4603 SDLoc DL(CachePolicy);
4604 if (GLC) {
4605 *GLC = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
4606 Value &= ~(uint64_t)0x1;
4607 }
4608 if (SLC) {
4609 *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
4610 Value &= ~(uint64_t)0x2;
4611 }
4612
4613 return Value == 0;
4614}
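parseCachePolicy treats the cachepolicy immediate as a small bitfield: bit 0 is glc, bit 1 is slc, and any other set bit makes the helper return false, in which case the caller leaves the node unlowered. A scalar sketch of the same decoding:

#include <cstdint>

// Decode a cachepolicy immediate: bit 0 -> glc, bit 1 -> slc.
// Returns false if any unrecognized bit is set, mirroring the helper above.
bool decodeCachePolicy(uint64_t Value, bool *GLC, bool *SLC) {
  if (GLC) {
    *GLC = (Value & 0x1) != 0;
    Value &= ~uint64_t(0x1);
  }
  if (SLC) {
    *SLC = (Value & 0x2) != 0;
    Value &= ~uint64_t(0x2);
  }
  return Value == 0;
}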
4615
4616SDValue SITargetLowering::lowerImage(SDValue Op,
4617 const AMDGPU::ImageDimIntrinsicInfo *Intr,
4618 SelectionDAG &DAG) const {
4619 SDLoc DL(Op);
4620 MachineFunction &MF = DAG.getMachineFunction();
4621 const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
4622 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4623 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4624 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
4625 const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4626 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
4627 unsigned IntrOpcode = Intr->BaseOpcode;
4628
4629 SmallVector<EVT, 2> ResultTypes(Op->value_begin(), Op->value_end());
4630 bool IsD16 = false;
4631 bool IsA16 = false;
4632 SDValue VData;
4633 int NumVDataDwords;
4634 unsigned AddrIdx; // Index of first address argument
4635 unsigned DMask;
4636
4637 if (BaseOpcode->Atomic) {
4638 VData = Op.getOperand(2);
4639
4640 bool Is64Bit = VData.getValueType() == MVT::i64;
4641 if (BaseOpcode->AtomicX2) {
4642 SDValue VData2 = Op.getOperand(3);
4643 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
4644 {VData, VData2});
4645 if (Is64Bit)
4646 VData = DAG.getBitcast(MVT::v4i32, VData);
4647
4648 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
4649 DMask = Is64Bit ? 0xf : 0x3;
4650 NumVDataDwords = Is64Bit ? 4 : 2;
4651 AddrIdx = 4;
4652 } else {
4653 DMask = Is64Bit ? 0x3 : 0x1;
4654 NumVDataDwords = Is64Bit ? 2 : 1;
4655 AddrIdx = 3;
4656 }
4657 } else {
4658 unsigned DMaskIdx;
4659
4660 if (BaseOpcode->Store) {
4661 VData = Op.getOperand(2);
4662
4663 MVT StoreVT = VData.getSimpleValueType();
4664 if (StoreVT.getScalarType() == MVT::f16) {
4665 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
4666 !BaseOpcode->HasD16)
4667 return Op; // D16 is unsupported for this instruction
4668
4669 IsD16 = true;
4670 VData = handleD16VData(VData, DAG);
4671 }
4672
4673 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
4674 DMaskIdx = 3;
4675 } else {
4676 MVT LoadVT = Op.getSimpleValueType();
4677 if (LoadVT.getScalarType() == MVT::f16) {
4678 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
4679 !BaseOpcode->HasD16)
4680 return Op; // D16 is unsupported for this instruction
4681
4682 IsD16 = true;
4683 if (LoadVT.isVector() && Subtarget->hasUnpackedD16VMem())
4684 ResultTypes[0] = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32;
4685 }
4686
4687 NumVDataDwords = (ResultTypes[0].getSizeInBits() + 31) / 32;
4688 DMaskIdx = isa<MemSDNode>(Op) ? 2 : 1;
4689 }
4690
4691 auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
4692 if (!DMaskConst)
4693 return Op;
4694
4695 AddrIdx = DMaskIdx + 1;
4696 DMask = DMaskConst->getZExtValue();
4697 if (!DMask && !BaseOpcode->Store) {
4698 // Eliminate no-op loads. Stores with dmask == 0 are *not* no-op: they
4699 // store the channels' default values.
4700 SDValue Undef = DAG.getUNDEF(Op.getValueType());
4701 if (isa<MemSDNode>(Op))
4702 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
4703 return Undef;
4704 }
4705 }
4706
4707 unsigned NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
4708 unsigned NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0;
4709 unsigned NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0;
4710 unsigned NumVAddrs = BaseOpcode->NumExtraArgs + NumGradients +
4711 NumCoords + NumLCM;
4712 unsigned NumMIVAddrs = NumVAddrs;
4713
4714 SmallVector<SDValue, 4> VAddrs;
4715
4716 // Optimize _L to _LZ when _L is zero
4717 if (LZMappingInfo) {
4718 if (auto ConstantLod =
4719 dyn_cast<ConstantFPSDNode>(Op.getOperand(AddrIdx+NumVAddrs-1))) {
4720 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4721 IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
4722 NumMIVAddrs--; // remove 'lod'
4723 }
4724 }
4725 }
4726
4727 // Check for 16 bit addresses and pack if true.
4728 unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
4729 MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType();
4730 if (VAddrVT.getScalarType() == MVT::f16 &&
4731 ST->hasFeature(AMDGPU::FeatureR128A16)) {
4732 IsA16 = true;
4733 for (unsigned i = AddrIdx; i < (AddrIdx + NumMIVAddrs); ++i) {
4734 SDValue AddrLo, AddrHi;
4735 // Push back extra arguments.
4736 if (i < DimIdx) {
4737 AddrLo = Op.getOperand(i);
4738 } else {
4739 AddrLo = Op.getOperand(i);
4740 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also,
4741 // in 1D, derivatives dx/dh and dx/dv are packed with undef.
4742 if (((i + 1) >= (AddrIdx + NumMIVAddrs)) ||
4743 ((NumGradients / 2) % 2 == 1 &&
4744 (i == DimIdx + (NumGradients / 2) - 1 ||
4745 i == DimIdx + NumGradients - 1))) {
4746 AddrHi = DAG.getUNDEF(MVT::f16);
4747 } else {
4748 AddrHi = Op.getOperand(i + 1);
4749 i++;
4750 }
4751 AddrLo = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f16,
4752 {AddrLo, AddrHi});
4753 AddrLo = DAG.getBitcast(MVT::i32, AddrLo);
4754 }
4755 VAddrs.push_back(AddrLo);
4756 }
4757 } else {
4758 for (unsigned i = 0; i < NumMIVAddrs; ++i)
4759 VAddrs.push_back(Op.getOperand(AddrIdx + i));
4760 }
4761
4762 SDValue VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
4763
4764 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
4765 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
4766 unsigned CtrlIdx; // Index of texfailctrl argument
4767 SDValue Unorm;
4768 if (!BaseOpcode->Sampler) {
4769 Unorm = True;
4770 CtrlIdx = AddrIdx + NumVAddrs + 1;
4771 } else {
4772 auto UnormConst =
4773 dyn_cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2));
4774 if (!UnormConst)
4775 return Op;
4776
4777 Unorm = UnormConst->getZExtValue() ? True : False;
4778 CtrlIdx = AddrIdx + NumVAddrs + 3;
4779 }
4780
4781 SDValue TexFail = Op.getOperand(CtrlIdx);
4782 auto TexFailConst = dyn_cast<ConstantSDNode>(TexFail.getNode());
4783 if (!TexFailConst || TexFailConst->getZExtValue() != 0)
4784 return Op;
4785
4786 SDValue GLC;
4787 SDValue SLC;
4788 if (BaseOpcode->Atomic) {
4789 GLC = True; // TODO no-return optimization
4790 if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC))
4791 return Op;
4792 } else {
4793 if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC))
4794 return Op;
4795 }
4796
4797 SmallVector<SDValue, 14> Ops;
4798 if (BaseOpcode->Store || BaseOpcode->Atomic)
4799 Ops.push_back(VData); // vdata
4800 Ops.push_back(VAddr);
4801 Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs)); // rsrc
4802 if (BaseOpcode->Sampler)
4803 Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs + 1)); // sampler
4804 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
4805 Ops.push_back(Unorm);
4806 Ops.push_back(GLC);
4807 Ops.push_back(SLC);
4808 Ops.push_back(IsA16 && // a16 or r128
4809 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
4810 Ops.push_back(False); // tfe
4811 Ops.push_back(False); // lwe
4812 Ops.push_back(DimInfo->DA ? True : False);
4813 if (BaseOpcode->HasD16)
4814 Ops.push_back(IsD16 ? True : False);
4815 if (isa<MemSDNode>(Op))
4816 Ops.push_back(Op.getOperand(0)); // chain
4817
4818 int NumVAddrDwords = VAddr.getValueType().getSizeInBits() / 32;
4819 int Opcode = -1;
4820
4821 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
4822 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
4823 NumVDataDwords, NumVAddrDwords);
4824 if (Opcode == -1)
4825 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
4826 NumVDataDwords, NumVAddrDwords);
4827  assert(Opcode != -1);
4828
4829 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
4830 if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
4831 MachineMemOperand *MemRef = MemOp->getMemOperand();
4832 DAG.setNodeMemRefs(NewNode, {MemRef});
4833 }
4834
4835 if (BaseOpcode->AtomicX2) {
4836 SmallVector<SDValue, 1> Elt;
4837 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
4838 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
4839 } else if (IsD16 && !BaseOpcode->Store) {
4840 MVT LoadVT = Op.getSimpleValueType();
4841 SDValue Adjusted = adjustLoadValueTypeImpl(
4842 SDValue(NewNode, 0), LoadVT, DL, DAG, Subtarget->hasUnpackedD16VMem());
4843 return DAG.getMergeValues({Adjusted, SDValue(NewNode, 1)}, DL);
4844 }
4845
4846 return SDValue(NewNode, 0);
4847}
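Two of the small computations above are easy to check in isolation: the vdata operand is sized in dwords by rounding the bit width up, and with 16-bit addressing (A16) pairs of f16 coordinates share one dword, an odd trailing coordinate being paired with undef. A sketch of both as plain arithmetic, ignoring the separate handling of gradients and of the unpacked extra arguments in the loop above:

// Number of 32-bit vdata registers for a value of SizeInBits bits,
// i.e. the (SizeInBits + 31) / 32 round-up used above.
unsigned numVDataDwords(unsigned SizeInBits) {
  return (SizeInBits + 31) / 32;
}

// Number of address dwords when 16-bit coordinates are packed two per
// dword; an odd trailing coordinate is padded with undef, so round up.
unsigned numPackedAddrDwords(unsigned NumF16Coords) {
  return (NumF16Coords + 1) / 2;
}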
4848
4849SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
4850 SDValue Offset, SDValue GLC,
4851 SelectionDAG &DAG) const {
4852 MachineFunction &MF = DAG.getMachineFunction();
4853 MachineMemOperand *MMO = MF.getMachineMemOperand(
4854 MachinePointerInfo(),
4855 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4856 MachineMemOperand::MOInvariant,
4857 VT.getStoreSize(), VT.getStoreSize());
4858
4859 if (!Offset->isDivergent()) {
4860 SDValue Ops[] = {
4861 Rsrc,
4862 Offset, // Offset
4863 GLC // glc
4864 };
4865 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
4866 DAG.getVTList(VT), Ops, VT, MMO);
4867 }
4868
4869 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
4870 // assume that the buffer is unswizzled.
4871 SmallVector<SDValue, 4> Loads;
4872 unsigned NumLoads = 1;
4873 MVT LoadVT = VT.getSimpleVT();
4874
4875  assert(LoadVT == MVT::i32 || LoadVT == MVT::v2i32 || LoadVT == MVT::v4i32 ||
4876         LoadVT == MVT::v8i32 || LoadVT == MVT::v16i32);
4877
4878 if (VT == MVT::v8i32 || VT == MVT::v16i32) {
4879 NumLoads = VT == MVT::v16i32 ? 4 : 2;
4880 LoadVT = MVT::v4i32;
4881 }
4882
4883 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
4884 unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue();
4885 SDValue Ops[] = {
4886 DAG.getEntryNode(), // Chain
4887 Rsrc, // rsrc
4888 DAG.getConstant(0, DL, MVT::i32), // vindex
4889 {}, // voffset
4890 {}, // soffset
4891 {}, // offset
4892 DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy
4893 DAG.getConstant(0, DL, MVT::i1), // idxen
4894 };
4895
4896 // Use the alignment to ensure that the required offsets will fit into the
4897 // immediate offsets.
4898 setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4);
4899
4900 uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
4901 for (unsigned i = 0; i < NumLoads; ++i) {
4902 Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32);
4903 Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
4904 Ops, LoadVT, MMO));
4905 }
4906
4907 if (VT == MVT::v8i32 || VT == MVT::v16i32)
4908 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
4909
4910 return Loads[0];
4911}
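For a divergent offset, a wide s-buffer result is assembled from 16-byte MUBUF loads: v8i32 becomes two v4i32 loads and v16i32 becomes four, each at the base immediate offset plus 16 * i. A sketch of just that offset arithmetic (names are illustrative):

#include <cstdint>
#include <vector>

// Immediate offsets for splitting a wide (8 or 16 dword) buffer load into
// 16-byte pieces, mirroring the InstOffset + 16 * i loop above.
std::vector<uint64_t> splitLoadOffsets(uint64_t BaseOffset, unsigned NumLoads) {
  std::vector<uint64_t> Offsets;
  for (unsigned i = 0; i < NumLoads; ++i)
    Offsets.push_back(BaseOffset + 16 * i);
  return Offsets;
}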
4912
4913SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
4914 SelectionDAG &DAG) const {
4915 MachineFunction &MF = DAG.getMachineFunction();
4916 auto MFI = MF.getInfo<SIMachineFunctionInfo>();
4917
4918 EVT VT = Op.getValueType();
4919 SDLoc DL(Op);
4920 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4921
4922 // TODO: Should this propagate fast-math-flags?
4923
4924 switch (IntrinsicID) {
4925 case Intrinsic::amdgcn_implicit_buffer_ptr: {
4926 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
4927 return emitNonHSAIntrinsicError(DAG, DL, VT);
4928 return getPreloadedValue(DAG, *MFI, VT,
4929 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4930 }
4931 case Intrinsic::amdgcn_dispatch_ptr:
4932 case Intrinsic::amdgcn_queue_ptr: {
4933 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
4934 DiagnosticInfoUnsupported BadIntrin(
4935 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
4936 DL.getDebugLoc());
4937 DAG.getContext()->diagnose(BadIntrin);
4938 return DAG.getUNDEF(VT);
4939 }
4940
4941 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
4942 AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
4943 return getPreloadedValue(DAG, *MFI, VT, RegID);
4944 }
4945 case Intrinsic::amdgcn_implicitarg_ptr: {
4946 if (MFI->isEntryFunction())
4947 return getImplicitArgPtr(DAG, DL);
4948 return getPreloadedValue(DAG, *MFI, VT,
4949 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
4950 }
4951 case Intrinsic::amdgcn_kernarg_segment_ptr: {
4952 return getPreloadedValue(DAG, *MFI, VT,
4953 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4954 }
4955 case Intrinsic::amdgcn_dispatch_id: {
4956 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
4957 }
4958 case Intrinsic::amdgcn_rcp:
4959 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
4960 case Intrinsic::amdgcn_rsq:
4961 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
4962 case Intrinsic::amdgcn_rsq_legacy:
4963 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
4964 return emitRemovedIntrinsicError(DAG, DL, VT);
4965
4966 return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
4967 case Intrinsic::amdgcn_rcp_legacy:
4968 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
4969 return emitRemovedIntrinsicError(DAG, DL, VT);
4970 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
4971 case Intrinsic::amdgcn_rsq_clamp: {
4972 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
4973 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
4974
4975 Type *Type = VT.getTypeForEVT(*DAG.getContext());
4976 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
4977 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
4978
4979 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
4980 SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
4981 DAG.getConstantFP(Max, DL, VT));
4982 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
4983 DAG.getConstantFP(Min, DL, VT));
4984 }
4985 case Intrinsic::r600_read_ngroups_x:
4986 if (Subtarget->isAmdHsaOS())
4987 return emitNonHSAIntrinsicError(DAG, DL, VT);
4988
4989 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
4990 SI::KernelInputOffsets::NGROUPS_X, 4, false);
4991 case Intrinsic::r600_read_ngroups_y:
4992 if (Subtarget->isAmdHsaOS())
4993 return emitNonHSAIntrinsicError(DAG, DL, VT);
4994
4995 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
4996 SI::KernelInputOffsets::NGROUPS_Y, 4, false);
4997 case Intrinsic::r600_read_ngroups_z:
4998 if (Subtarget->isAmdHsaOS())
4999 return emitNonHSAIntrinsicError(DAG, DL, VT);
5000
5001 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
5002 SI::KernelInputOffsets::NGROUPS_Z, 4, false);
5003 case Intrinsic::r600_read_global_size_x:
5004 if (Subtarget->isAmdHsaOS())
5005 return emitNonHSAIntrinsicError(DAG, DL, VT);
5006
5007 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
5008 SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false);
5009 case Intrinsic::r600_read_global_size_y:
5010 if (Subtarget->isAmdHsaOS())
5011 return emitNonHSAIntrinsicError(DAG, DL, VT);
5012
5013 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
5014 SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false);
5015 case Intrinsic::r600_read_global_size_z:
5016 if (Subtarget->isAmdHsaOS())
5017 return emitNonHSAIntrinsicError(DAG, DL, VT);
5018
5019 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
5020 SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false);
5021 case Intrinsic::r600_read_local_size_x:
5022 if (Subtarget->isAmdHsaOS())
5023 return emitNonHSAIntrinsicError(DAG, DL, VT);
5024
5025 return lowerImplicitZextParam(DAG, Op, MVT::i16,
5026 SI::KernelInputOffsets::LOCAL_SIZE_X);
5027 case Intrinsic::r600_read_local_size_y:
5028 if (Subtarget->isAmdHsaOS())
5029 return emitNonHSAIntrinsicError(DAG, DL, VT);
5030
5031 return lowerImplicitZextParam(DAG, Op, MVT::i16,
5032 SI::KernelInputOffsets::LOCAL_SIZE_Y);
5033 case Intrinsic::r600_read_local_size_z:
5034 if (Subtarget->isAmdHsaOS())
5035 return emitNonHSAIntrinsicError(DAG, DL, VT);
5036
5037 return lowerImplicitZextParam(DAG, Op, MVT::i16,
5038 SI::KernelInputOffsets::LOCAL_SIZE_Z);
5039 case Intrinsic::amdgcn_workgroup_id_x:
5040 case Intrinsic::r600_read_tgid_x:
5041 return getPreloadedValue(DAG, *MFI, VT,
5042 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5043 case Intrinsic::amdgcn_workgroup_id_y:
5044 case Intrinsic::r600_read_tgid_y:
5045 return getPreloadedValue(DAG, *MFI, VT,
5046 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5047 case Intrinsic::amdgcn_workgroup_id_z:
5048 case Intrinsic::r600_read_tgid_z:
5049 return getPreloadedValue(DAG, *MFI, VT,
5050 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5051 case Intrinsic::amdgcn_workitem_id_x: {
5052 case Intrinsic::r600_read_tidig_x:
5053 return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5054 SDLoc(DAG.getEntryNode()),
5055 MFI->getArgInfo().WorkItemIDX);
5056 }
5057 case Intrinsic::amdgcn_workitem_id_y:
5058 case Intrinsic::r600_read_tidig_y:
5059 return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5060 SDLoc(DAG.getEntryNode()),
5061 MFI->getArgInfo().WorkItemIDY);
5062 case Intrinsic::amdgcn_workitem_id_z:
5063 case Intrinsic::r600_read_tidig_z:
5064 return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5065 SDLoc(DAG.getEntryNode()),
5066 MFI->getArgInfo().WorkItemIDZ);
5067 case AMDGPUIntrinsic::SI_load_const: {
5068 SDValue Load =
5069 lowerSBuffer(MVT::i32, DL, Op.getOperand(1), Op.getOperand(2),
5070 DAG.getTargetConstant(0, DL, MVT::i1), DAG);
5071 return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load);
5072 }
5073 case Intrinsic::amdgcn_s_buffer_load: {
5074 unsigned Cache = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
5075 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
5076 DAG.getTargetConstant(Cache & 1, DL, MVT::i1), DAG);
5077 }
5078 case Intrinsic::amdgcn_fdiv_fast:
5079 return lowerFDIV_FAST(Op, DAG);
5080 case Intrinsic::amdgcn_interp_mov: {
5081 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
5082 SDValue Glue = M0.getValue(1);
5083 return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1),
5084 Op.getOperand(2), Op.getOperand(3), Glue);
5085 }
5086 case Intrinsic::amdgcn_interp_p1: {
5087 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
5088 SDValue Glue = M0.getValue(1);
5089 return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
5090 Op.getOperand(2), Op.getOperand(3), Glue);
5091 }
5092 case Intrinsic::amdgcn_interp_p2: {
5093 SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
5094 SDValue Glue = SDValue(M0.getNode(), 1);
5095 return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
5096 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
5097 Glue);
5098 }
5099 case Intrinsic::amdgcn_sin:
5100 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
5101
5102 case Intrinsic::amdgcn_cos:
5103 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
5104
5105 case Intrinsic::amdgcn_log_clamp: {
5106 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5107 return SDValue();
5108
5109 DiagnosticInfoUnsupported BadIntrin(
5110 MF.getFunction(), "intrinsic not supported on subtarget",
5111 DL.getDebugLoc());
5112 DAG.getContext()->diagnose(BadIntrin);
5113 return DAG.getUNDEF(VT);
5114 }
5115 case Intrinsic::amdgcn_ldexp:
5116 return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
5117 Op.getOperand(1), Op.getOperand(2));
5118
5119 case Intrinsic::amdgcn_fract:
5120 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
5121
5122 case Intrinsic::amdgcn_class:
5123 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
5124 Op.getOperand(1), Op.getOperand(2));
5125 case Intrinsic::amdgcn_div_fmas:
5126 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
5127 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
5128 Op.getOperand(4));
5129
5130 case Intrinsic::amdgcn_div_fixup:
5131 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
5132 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5133
5134 case Intrinsic::amdgcn_trig_preop:
5135 return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
5136 Op.getOperand(1), Op.getOperand(2));
5137 case Intrinsic::amdgcn_div_scale: {
5138 // 3rd parameter required to be a constant.
5139 const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
5140 if (!Param)
5141 return DAG.getMergeValues({ DAG.getUNDEF(VT), DAG.getUNDEF(MVT::i1) }, DL);
5142
5143 // Translate to the operands expected by the machine instruction. The
5144 // first parameter must be the same as the first instruction.
5145 SDValue Numerator = Op.getOperand(1);
5146 SDValue Denominator = Op.getOperand(2);
5147
5148 // Note this order is opposite of the machine instruction's operations,
5149 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
5150 // intrinsic has the numerator as the first operand to match a normal
5151 // division operation.
5152
5153 SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
5154
5155 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
5156 Denominator, Numerator);
5157 }
5158 case Intrinsic::amdgcn_icmp: {
5159 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
5160 }
5161 case Intrinsic::amdgcn_fcmp: {
5162 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
5163 }
5164 case Intrinsic::amdgcn_fmed3:
5165 return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
5166 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5167 case Intrinsic::amdgcn_fdot2:
5168 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
5169 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
5170 Op.getOperand(4));
5171 case Intrinsic::amdgcn_fmul_legacy:
5172 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
5173 Op.getOperand(1), Op.getOperand(2));
5174 case Intrinsic::amdgcn_sffbh:
5175 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
5176 case Intrinsic::amdgcn_sbfe:
5177 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
5178 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5179 case Intrinsic::amdgcn_ubfe:
5180 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
5181 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5182 case Intrinsic::amdgcn_cvt_pkrtz:
5183 case Intrinsic::amdgcn_cvt_pknorm_i16:
5184 case Intrinsic::amdgcn_cvt_pknorm_u16:
5185 case Intrinsic::amdgcn_cvt_pk_i16:
5186 case Intrinsic::amdgcn_cvt_pk_u16: {
5187 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
5188 EVT VT = Op.getValueType();
5189 unsigned Opcode;
5190
5191 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
5192 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
5193 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
5194 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
5195 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
5196 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
5197 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
5198 Opcode = AMDGPUISD::CVT_PK_I16_I32;
5199 else
5200 Opcode = AMDGPUISD::CVT_PK_U16_U32;
5201
5202 if (isTypeLegal(VT))
5203 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
5204
5205 SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
5206 Op.getOperand(1), Op.getOperand(2));
5207 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
5208 }
5209 case Intrinsic::amdgcn_wqm: {
5210 SDValue Src = Op.getOperand(1);
5211 return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
5212 0);
5213 }
5214 case Intrinsic::amdgcn_wwm: {
5215 SDValue Src = Op.getOperand(1);
5216 return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
5217 0);
5218 }
5219 case Intrinsic::amdgcn_fmad_ftz:
5220 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
5221 Op.getOperand(2), Op.getOperand(3));
5222 default:
5223 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5224 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
5225 return lowerImage(Op, ImageDimIntr, DAG);
5226
5227 return Op;
5228 }
5229}
5230
5231SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5232 SelectionDAG &DAG) const {
5233 unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
5234 SDLoc DL(Op);
5235
5236 switch (IntrID) {
5237 case Intrinsic::amdgcn_atomic_inc:
5238 case Intrinsic::amdgcn_atomic_dec:
5239 case Intrinsic::amdgcn_ds_fadd:
5240 case Intrinsic::amdgcn_ds_fmin:
5241 case Intrinsic::amdgcn_ds_fmax: {
5242 MemSDNode *M = cast<MemSDNode>(Op);
5243 unsigned Opc;
5244 switch (IntrID) {
5245 case Intrinsic::amdgcn_atomic_inc:
5246 Opc = AMDGPUISD::ATOMIC_INC;
5247 break;
5248 case Intrinsic::amdgcn_atomic_dec:
5249 Opc = AMDGPUISD::ATOMIC_DEC;
5250 break;
5251 case Intrinsic::amdgcn_ds_fadd:
5252 Opc = AMDGPUISD::ATOMIC_LOAD_FADD;
5253 break;
5254 case Intrinsic::amdgcn_ds_fmin:
5255 Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
5256 break;
5257 case Intrinsic::amdgcn_ds_fmax:
5258 Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
5259 break;
5260 default:
5261 llvm_unreachable("Unknown intrinsic!")::llvm::llvm_unreachable_internal("Unknown intrinsic!", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 5261)
;
5262 }
5263 SDValue Ops[] = {
5264 M->getOperand(0), // Chain
5265 M->getOperand(2), // Ptr
5266 M->getOperand(3) // Value
5267 };
5268
5269 return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
5270 M->getMemoryVT(), M->getMemOperand());
5271 }
5272 case Intrinsic::amdgcn_buffer_load:
5273 case Intrinsic::amdgcn_buffer_load_format: {
5274 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
5275 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
5276 unsigned IdxEn = 1;
5277 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
5278 IdxEn = Idx->getZExtValue() != 0;
5279 SDValue Ops[] = {
5280 Op.getOperand(0), // Chain
5281 Op.getOperand(2), // rsrc
5282 Op.getOperand(3), // vindex
5283 SDValue(), // voffset -- will be set by setBufferOffsets
5284 SDValue(), // soffset -- will be set by setBufferOffsets
5285 SDValue(), // offset -- will be set by setBufferOffsets
5286 DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5287 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5288 };
5289
5290 setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
5291 unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
5292 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
5293
5294 EVT VT = Op.getValueType();
5295 EVT IntVT = VT.changeTypeToInteger();
5296 auto *M = cast<MemSDNode>(Op);
5297 EVT LoadVT = Op.getValueType();
5298
5299 if (LoadVT.getScalarType() == MVT::f16)
5300 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5301 M, DAG, Ops);
5302 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5303 M->getMemOperand());
5304 }
5305 case Intrinsic::amdgcn_raw_buffer_load:
5306 case Intrinsic::amdgcn_raw_buffer_load_format: {
5307 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
5308 SDValue Ops[] = {
5309 Op.getOperand(0), // Chain
5310 Op.getOperand(2), // rsrc
5311 DAG.getConstant(0, DL, MVT::i32), // vindex
5312 Offsets.first, // voffset
5313 Op.getOperand(4), // soffset
5314 Offsets.second, // offset
5315 Op.getOperand(5), // cachepolicy
5316 DAG.getConstant(0, DL, MVT::i1), // idxen
5317 };
5318
5319 unsigned Opc = (IntrID == Intrinsic::amdgcn_raw_buffer_load) ?
5320 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
5321
5322 EVT VT = Op.getValueType();
5323 EVT IntVT = VT.changeTypeToInteger();
5324 auto *M = cast<MemSDNode>(Op);
5325 EVT LoadVT = Op.getValueType();
5326
5327 if (LoadVT.getScalarType() == MVT::f16)
5328 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5329 M, DAG, Ops);
5330 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5331 M->getMemOperand());
5332 }
5333 case Intrinsic::amdgcn_struct_buffer_load:
5334 case Intrinsic::amdgcn_struct_buffer_load_format: {
5335 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5336 SDValue Ops[] = {
5337 Op.getOperand(0), // Chain
5338 Op.getOperand(2), // rsrc
5339 Op.getOperand(3), // vindex
5340 Offsets.first, // voffset
5341 Op.getOperand(5), // soffset
5342 Offsets.second, // offset
5343 Op.getOperand(6), // cachepolicy
5344 DAG.getConstant(1, DL, MVT::i1), // idxen
5345 };
5346
5347 unsigned Opc = (IntrID == Intrinsic::amdgcn_struct_buffer_load) ?
5348 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
5349
5350 EVT VT = Op.getValueType();
5351 EVT IntVT = VT.changeTypeToInteger();
5352 auto *M = cast<MemSDNode>(Op);
5353 EVT LoadVT = Op.getValueType();
5354
5355 if (LoadVT.getScalarType() == MVT::f16)
5356 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5357 M, DAG, Ops);
5358 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5359 M->getMemOperand());
5360 }
5361 case Intrinsic::amdgcn_tbuffer_load: {
5362 MemSDNode *M = cast<MemSDNode>(Op);
5363 EVT LoadVT = Op.getValueType();
5364
5365 unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
5366 unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
5367 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
5368 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
5369 unsigned IdxEn = 1;
5370 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
5371 IdxEn = Idx->getZExtValue() != 0;
5372 SDValue Ops[] = {
5373 Op.getOperand(0), // Chain
5374 Op.getOperand(2), // rsrc
5375 Op.getOperand(3), // vindex
5376 Op.getOperand(4), // voffset
5377 Op.getOperand(5), // soffset
5378 Op.getOperand(6), // offset
5379 DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
5380 DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5381 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5382 };
5383
5384 if (LoadVT.getScalarType() == MVT::f16)
5385 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
5386 M, DAG, Ops);
5387 return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
5388 Op->getVTList(), Ops, LoadVT,
5389 M->getMemOperand());
5390 }
5391 case Intrinsic::amdgcn_raw_tbuffer_load: {
5392 MemSDNode *M = cast<MemSDNode>(Op);
5393 EVT LoadVT = Op.getValueType();
5394 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
5395
5396 SDValue Ops[] = {
5397 Op.getOperand(0), // Chain
5398 Op.getOperand(2), // rsrc
5399 DAG.getConstant(0, DL, MVT::i32), // vindex
5400 Offsets.first, // voffset
5401 Op.getOperand(4), // soffset
5402 Offsets.second, // offset
5403 Op.getOperand(5), // format
5404 Op.getOperand(6), // cachepolicy
5405 DAG.getConstant(0, DL, MVT::i1), // idxen
5406 };
5407
5408 if (LoadVT.getScalarType() == MVT::f16)
5409 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
5410 M, DAG, Ops);
5411 return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
5412 Op->getVTList(), Ops, LoadVT,
5413 M->getMemOperand());
5414 }
5415 case Intrinsic::amdgcn_struct_tbuffer_load: {
5416 MemSDNode *M = cast<MemSDNode>(Op);
5417 EVT LoadVT = Op.getValueType();
5418 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5419
5420 SDValue Ops[] = {
5421 Op.getOperand(0), // Chain
5422 Op.getOperand(2), // rsrc
5423 Op.getOperand(3), // vindex
5424 Offsets.first, // voffset
5425 Op.getOperand(5), // soffset
5426 Offsets.second, // offset
5427 Op.getOperand(6), // format
5428 Op.getOperand(7), // cachepolicy
5429 DAG.getConstant(1, DL, MVT::i1), // idxen
5430 };
5431
5432 if (LoadVT.getScalarType() == MVT::f16)
5433 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
5434 M, DAG, Ops);
5435 return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
5436 Op->getVTList(), Ops, LoadVT,
5437 M->getMemOperand());
5438 }
5439 case Intrinsic::amdgcn_buffer_atomic_swap:
5440 case Intrinsic::amdgcn_buffer_atomic_add:
5441 case Intrinsic::amdgcn_buffer_atomic_sub:
5442 case Intrinsic::amdgcn_buffer_atomic_smin:
5443 case Intrinsic::amdgcn_buffer_atomic_umin:
5444 case Intrinsic::amdgcn_buffer_atomic_smax:
5445 case Intrinsic::amdgcn_buffer_atomic_umax:
5446 case Intrinsic::amdgcn_buffer_atomic_and:
5447 case Intrinsic::amdgcn_buffer_atomic_or:
5448 case Intrinsic::amdgcn_buffer_atomic_xor: {
5449 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
5450 unsigned IdxEn = 1;
5451 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
5452 IdxEn = Idx->getZExtValue() != 0;
5453 SDValue Ops[] = {
5454 Op.getOperand(0), // Chain
5455 Op.getOperand(2), // vdata
5456 Op.getOperand(3), // rsrc
5457 Op.getOperand(4), // vindex
5458 SDValue(), // voffset -- will be set by setBufferOffsets
5459 SDValue(), // soffset -- will be set by setBufferOffsets
5460 SDValue(), // offset -- will be set by setBufferOffsets
5461 DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
5462 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5463 };
5464 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
5465 EVT VT = Op.getValueType();
5466
5467 auto *M = cast<MemSDNode>(Op);
5468 unsigned Opcode = 0;
5469
5470 switch (IntrID) {
5471 case Intrinsic::amdgcn_buffer_atomic_swap:
5472 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
5473 break;
5474 case Intrinsic::amdgcn_buffer_atomic_add:
5475 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
5476 break;
5477 case Intrinsic::amdgcn_buffer_atomic_sub:
5478 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
5479 break;
5480 case Intrinsic::amdgcn_buffer_atomic_smin:
5481 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
5482 break;
5483 case Intrinsic::amdgcn_buffer_atomic_umin:
5484 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
5485 break;
5486 case Intrinsic::amdgcn_buffer_atomic_smax:
5487 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
5488 break;
5489 case Intrinsic::amdgcn_buffer_atomic_umax:
5490 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
5491 break;
5492 case Intrinsic::amdgcn_buffer_atomic_and:
5493 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
5494 break;
5495 case Intrinsic::amdgcn_buffer_atomic_or:
5496 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
5497 break;
5498 case Intrinsic::amdgcn_buffer_atomic_xor:
5499 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
5500 break;
5501 default:
5502 llvm_unreachable("unhandled atomic opcode")::llvm::llvm_unreachable_internal("unhandled atomic opcode", "/build/llvm-toolchain-snapshot-8~svn345461/lib/Target/AMDGPU/SIISelLowering.cpp"
, 5502)
;
5503 }
5504
5505 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
5506 M->getMemOperand());
5507 }
5508 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5509 case Intrinsic::amdgcn_raw_buffer_atomic_add:
5510 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5511 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5512 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5513 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5514 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5515 case Intrinsic::amdgcn_raw_buffer_atomic_and:
5516 case Intrinsic::amdgcn_raw_buffer_atomic_or:
5517 case Intrinsic::amdgcn_raw_buffer_atomic_xor: {
5518 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5519 SDValue Ops[] = {
5520 Op.getOperand(0), // Chain
5521 Op.getOperand(2), // vdata
5522 Op.getOperand(3), // rsrc
5523 DAG.getConstant(0, DL, MVT::i32), // vindex
5524 Offsets.first, // voffset
5525 Op.getOperand(5), // soffset
5526 Offsets.second, // offset
5527 Op.getOperand(6), // cachepolicy
5528 DAG.getConstant(0, DL, MVT::i1), // idxen
5529 };
5530 EVT VT = Op.getValueType();
5531
5532 auto *M = cast<MemSDNode>(Op);
5533 unsigned Opcode = 0;
5534
5535 switch (IntrID) {
5536 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5537 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
5538 break;
5539 case Intrinsic::amdgcn_raw_buffer_atomic_add:
5540 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
5541 break;
5542 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5543 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
5544 break;
5545 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5546 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
5547 break;
5548 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5549 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
5550 break;
5551 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5552 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
5553 break;
5554 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5555 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
5556 break;
5557 case Intrinsic::amdgcn_raw_buffer_atomic_and:
5558 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
5559 break;
5560 case Intrinsic::amdgcn_raw_buffer_atomic_or:
5561 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
5562 break;
5563 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5564 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
5565 break;
5566 default:
5567 llvm_unreachable("unhandled atomic opcode");
5568 }
5569
5570 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
5571 M->getMemOperand());
5572 }
5573 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5574 case Intrinsic::amdgcn_struct_buffer_atomic_add:
5575 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5576 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5577 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5578 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5579 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5580 case Intrinsic::amdgcn_struct_buffer_atomic_and:
5581 case Intrinsic::amdgcn_struct_buffer_atomic_or:
5582 case Intrinsic::amdgcn_struct_buffer_atomic_xor: {
5583 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
5584 SDValue Ops[] = {
5585 Op.getOperand(0), // Chain
5586 Op.getOperand(2), // vdata
5587 Op.getOperand(3), // rsrc
5588 Op.getOperand(4), // vindex
5589 Offsets.first, // voffset
5590 Op.getOperand(6), // soffset
5591 Offsets.second, // offset
5592 Op.getOperand(7), // cachepolicy
5593 DAG.getConstant(1, DL, MVT::i1), // idxen
5594 };
5595 EVT VT = Op.getValueType();
5596
5597 auto *M = cast<MemSDNode>(Op);
5598 unsigned Opcode = 0;
5599
5600 switch (IntrID) {
5601 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5602 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
5603 break;
5604 case Intrinsic::amdgcn_struct_buffer_atomic_add:
5605 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
5606 break;
5607 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5608 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
5609 break;
5610 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5611 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
5612 break;
5613 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5614 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
5615 break;
5616 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5617 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
5618 break;
5619 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5620 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
5621 break;
5622 case Intrinsic::amdgcn_struct_buffer_atomic_and:
5623 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
5624 break;
5625 case Intrinsic::amdgcn_struct_buffer_atomic_or:
5626 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
5627 break;
5628 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5629 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
5630 break;
5631 default:
5632 llvm_unreachable("unhandled atomic opcode");
5633 }
5634
5635 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
5636 M->getMemOperand());
5637 }
5638 case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
5639 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
5640 unsigned IdxEn = 1;
5641 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(5)))
5642 IdxEn = Idx->getZExtValue() != 0;
5643 SDValue Ops[] = {
5644 Op.getOperand(0), // Chain
5645 Op.getOperand(2), // src
5646 Op.getOperand(3), // cmp
5647 Op.getOperand(4), // rsrc
5648 Op.getOperand(5), // vindex
5649 SDValue(), // voffset -- will be set by setBufferOffsets
5650 SDValue(), // soffset -- will be set by setBufferOffsets
5651 SDValue(), // offset -- will be set by setBufferOffsets
5652 DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
5653 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5654 };
5655 setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
5656 EVT VT = Op.getValueType();
5657 auto *M = cast<MemSDNode>(Op);
5658
5659 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
5660 Op->getVTList(), Ops, VT, M->getMemOperand());
5661 }
5662 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: {
5663 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
5664 SDValue Ops[] = {
5665 Op.getOperand(0), // Chain
5666 Op.getOperand(2), // src
5667 Op.getOperand(3), // cmp
5668 Op.getOperand(4), // rsrc
5669 DAG.getConstant(0, DL, MVT::i32), // vindex
5670 Offsets.first, // voffset
5671 Op.getOperand(6), // soffset
5672 Offsets.second, // offset
5673 Op.getOperand(7), // cachepolicy
5674 DAG.getConstant(0, DL, MVT::i1), // idxen
5675 };
5676 EVT VT = Op.getValueType();
5677 auto *M = cast<MemSDNode>(Op);
5678
5679 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
5680 Op->getVTList(), Ops, VT, M->getMemOperand());
5681 }
5682 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: {
5683 auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
5684 SDValue Ops[] = {
5685 Op.getOperand(0), // Chain
5686 Op.getOperand(2), // src
5687 Op.getOperand(3), // cmp
5688 Op.getOperand(4), // rsrc
5689 Op.getOperand(5), // vindex
5690 Offsets.first, // voffset
5691 Op.getOperand(7), // soffset
5692 Offsets.second, // offset
5693 Op.getOperand(8), // cachepolicy
5694 DAG.getConstant(1, DL, MVT::i1), // idxen
5695 };
5696 EVT VT = Op.getValueType();
5697 auto *M = cast<MemSDNode>(Op);
5698
5699 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
5700 Op->getVTList(), Ops, VT, M->getMemOperand());
5701 }
5702
5703 default:
5704 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5705 AMDGPU::getImageDimIntrinsicInfo(IntrID))
5706 return lowerImage(Op, ImageDimIntr, DAG);
5707
5708 return SDValue();
5709 }
5710}
5711
5712SDValue SITargetLowering::handleD16VData(SDValue VData,
5713 SelectionDAG &DAG) const {
5714 EVT StoreVT = VData.getValueType();
5715
5716 // No change for f16 and legal vector D16 types.
5717 if (!StoreVT.isVector())
5718 return VData;
5719
5720 SDLoc DL(VData);
5721 assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");
5722
5723 if (Subtarget->hasUnpackedD16VMem()) {
5724 // We need to unpack the packed data to store.
5725 EVT IntStoreVT = StoreVT.changeTypeToInteger();
5726 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
5727
5728 EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
5729 StoreVT.getVectorNumElements());
5730 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
5731 return DAG.UnrollVectorOp(ZExt.getNode());
5732 }
5733
5734 assert(isTypeLegal(StoreVT));
5735 return VData;
5736}
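// Reader's annotation (illustrative, not part of the original source): on a
// subtarget with unpacked D16 VMEM, a legal packed value such as v2f16 is
// bitcast to its integer form and each element is zero-extended to i32, so
// the store sees one half value per 32-bit lane, roughly
//   v2f16 -> v2i16 (bitcast) -> v2i32 (zero_extend), then unrolled per element.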
5737
5738SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
5739 SelectionDAG &DAG) const {
5740 SDLoc DL(Op);
5741 SDValue Chain = Op.getOperand(0);
5742 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
5743 MachineFunction &MF = DAG.getMachineFunction();
5744
5745 switch (IntrinsicID) {
5746 case Intrinsic::amdgcn_exp: {
5747 const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
5748 const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
5749 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8));
5750 const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9));
5751
5752 const SDValue Ops[] = {
5753 Chain,
5754 DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
5755 DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
5756 Op.getOperand(4), // src0
5757 Op.getOperand(5), // src1
5758 Op.getOperand(6), // src2
5759 Op.getOperand(7), // src3
5760 DAG.getTargetConstant(0, DL, MVT::i1), // compr
5761 DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
5762 };
5763
5764 unsigned Opc = Done->isNullValue() ?
5765 AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
5766 return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
5767 }
5768 case Intrinsic::amdgcn_exp_compr: {
5769 const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
5770 const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
5771 SDValue Src0 = Op.getOperand(4);
5772 SDValue Src1 = Op.getOperand(5);
5773 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
5774 const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7));
5775
5776 SDValue Undef = DAG.getUNDEF(MVT::f32);
5777 const SDValue Ops[] = {
5778 Chain,
5779 DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
5780 DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
5781 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
5782 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
5783 Undef, // src2
5784 Undef, // src3
5785 DAG.getTargetConstant(1, DL, MVT::i1), // compr
5786 DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
5787 };
5788
5789 unsigned Opc = Done->isNullValue() ?
5790 AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
5791 return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
5792 }
5793 case Intrinsic::amdgcn_s_sendmsg:
5794 case Intrinsic::amdgcn_s_sendmsghalt: {
5795 unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ?
5796 AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT;
5797 Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
5798 SDValue Glue = Chain.getValue(1);
5799 return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
5800 Op.getOperand(2), Glue);
5801 }
5802 case Intrinsic::amdgcn_init_exec: {
5803 return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain,
5804 Op.getOperand(2));
5805 }
5806 case Intrinsic::amdgcn_init_exec_from_input: {
5807 return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
5808 Op.getOperand(2), Op.getOperand(3));
5809 }
5810 case AMDGPUIntrinsic::AMDGPU_kill: {
5811 SDValue Src = Op.getOperand(2);
5812 if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) {
5813 if (!K->isNegative())
5814 return Chain;
5815
5816 SDValue NegOne = DAG.getTargetConstant(FloatToBits(-1.0f), DL, MVT::i32);
5817 return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, NegOne);
5818 }
5819
5820 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src);
5821 return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast);
5822 }
5823 case Intrinsic::amdgcn_s_barrier: {
5824 if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
5825 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5826 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
5827 if (WGSize <= ST.getWavefrontSize())
5828 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
5829 Op.getOperand(0)), 0);
5830 }
5831 return SDValue();
5832 };
5833 case AMDGPUIntrinsic::SI_tbuffer_store: {
5834
5835 // Extract vindex and voffset from vaddr as appropriate
5836 const ConstantSDNode *OffEn = cast<ConstantSDNode>(Op.getOperand(10));
5837 const ConstantSDNode *IdxEn = cast<ConstantSDNode>(Op.getOperand(11));
5838 SDValue VAddr = Op.getOperand(5);
5839
5840 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
5841
5842 assert(!(OffEn->isOne() && IdxEn->isOne()) &&
5843 "Legacy intrinsic doesn't support both offset and index - use new version");
5844
5845 SDValue VIndex = IdxEn->isOne() ? VAddr : Zero;
5846 SDValue VOffset = OffEn->isOne() ? VAddr : Zero;
5847
5848 // Deal with the vec-3 case
5849 const ConstantSDNode *NumChannels = cast<ConstantSDNode>(Op.getOperand(4));
5850 auto Opcode = NumChannels->getZExtValue() == 3 ?
5851 AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT;
5852
5853 unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
5854 unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
5855 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(12))->getZExtValue();
5856 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(13))->getZExtValue();
5857 SDValue Ops[] = {
5858 Chain,
5859 Op.getOperand(3), // vdata
5860 Op.getOperand(2), // rsrc
5861 VIndex,
5862 VOffset,
5863 Op.getOperand(6), // soffset
5864 Op.getOperand(7), // inst_offset
5865 DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
5866 DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5867 DAG.getConstant(IdxEn->isOne(), DL, MVT::i1), // idxen
5868 };
5869
5870 assert((cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue() == 0 &&
5871 "Value of tfe other than zero is unsupported");
5872
5873 EVT VT = Op.getOperand(3).getValueType();
5874 MachineMemOperand *MMO = MF.getMachineMemOperand(
5875 MachinePointerInfo(),
5876 MachineMemOperand::MOStore,
5877 VT.getStoreSize(), 4);
5878 return DAG.getMemIntrinsicNode(Opcode, DL,
5879 Op->getVTList(), Ops, VT, MMO);
5880 }
5881
5882 case Intrinsic::amdgcn_tbuffer_store: {
5883 SDValue VData = Op.getOperand(2);
5884 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
5885 if (IsD16)
5886 VData = handleD16VData(VData, DAG);
5887 unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
5888 unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
5889 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
5890 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(11))->getZExtValue();
5891 unsigned IdxEn = 1;
5892 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
5893 IdxEn = Idx->getZExtValue() != 0;
5894 SDValue Ops[] = {
5895 Chain,
5896 VData, // vdata
5897 Op.getOperand(3), // rsrc
5898 Op.getOperand(4), // vindex
5899 Op.getOperand(5), // voffset
5900 Op.getOperand(6), // soffset
5901 Op.getOperand(7), // offset
5902 DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
5903 DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5904 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5905 };
5906 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
5907 AMDGPUISD::TBUFFER_STORE_FORMAT;
5908 MemSDNode *M = cast<MemSDNode>(Op);
5909 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
5910 M->getMemoryVT(), M->getMemOperand());
5911 }
5912
5913 case Intrinsic::amdgcn_struct_tbuffer_store: {
5914 SDValue VData = Op.getOperand(2);
5915 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
5916 if (IsD16)
5917 VData = handleD16VData(VData, DAG);
5918 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
5919 SDValue Ops[] = {
5920 Chain,
5921 VData, // vdata
5922 Op.getOperand(3), // rsrc
5923 Op.getOperand(4), // vindex
5924 Offsets.first, // voffset
5925 Op.getOperand(6), // soffset
5926 Offsets.second, // offset
5927 Op.getOperand(7), // format
5928 Op.getOperand(8), // cachepolicy
5929 DAG.getConstant(1, DL, MVT::i1), // idxen
5930 };
5931 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
5932 AMDGPUISD::TBUFFER_STORE_FORMAT;
5933 MemSDNode *M = cast<MemSDNode>(Op);
5934 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
5935 M->getMemoryVT(), M->getMemOperand());
5936 }
5937
5938 case Intrinsic::amdgcn_raw_tbuffer_store: {
5939 SDValue VData = Op.getOperand(2);
5940 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
5941 if (IsD16)
5942 VData = handleD16VData(VData, DAG);
5943 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5944 SDValue Ops[] = {
5945 Chain,
5946 VData, // vdata
5947 Op.getOperand(3), // rsrc
5948 DAG.getConstant(0, DL, MVT::i32), // vindex
5949 Offsets.first, // voffset
5950 Op.getOperand(5), // soffset
5951 Offsets.second, // offset
5952 Op.getOperand(6), // format
5953 Op.getOperand(7), // cachepolicy
5954 DAG.getConstant(0, DL, MVT::i1), // idxen
5955 };
5956 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
5957 AMDGPUISD::TBUFFER_STORE_FORMAT;
5958 MemSDNode *M = cast<MemSDNode>(Op);
5959 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
5960 M->getMemoryVT(), M->getMemOperand());
5961 }
5962
5963 case Intrinsic::amdgcn_buffer_store:
5964 case Intrinsic::amdgcn_buffer_store_format: {
5965 SDValue VData = Op.getOperand(2);
5966 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
5967 if (IsD16)
5968 VData = handleD16VData(VData, DAG);
5969 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
5970 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
5971 unsigned IdxEn = 1;
5972 if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
5973 IdxEn = Idx->getZExtValue() != 0;
5974 SDValue Ops[] = {
5975 Chain,
5976 VData,
5977 Op.getOperand(3), // rsrc
5978 Op.getOperand(4), // vindex
5979 SDValue(), // voffset -- will be set by setBufferOffsets
5980 SDValue(), // soffset -- will be set by setBufferOffsets
5981 SDValue(), // offset -- will be set by setBufferOffsets
5982 DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5983 DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5984 };
5985 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
5986 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
5987 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
5988 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
5989 MemSDNode *M = cast<MemSDNode>(Op);
5990 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
5991 M->getMemoryVT(), M->getMemOperand());
5992 }
5993
5994 case Intrinsic::amdgcn_raw_buffer_store:
5995 case Intrinsic::amdgcn_raw_buffer_store_format: {
5996 SDValue VData = Op.getOperand(2);
5997 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
5998 if (IsD16)
5999 VData = handleD16VData(VData, DAG);
6000 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
6001 SDValue Ops[] = {
6002 Chain,
6003 VData,
6004 Op.getOperand(3), // rsrc
6005 DAG.getConstant(0, DL, MVT::i32), // vindex
6006 Offsets.first, // voffset
6007 Op.getOperand(5), // soffset
6008 Offsets.second, // offset
6009 Op.getOperand(6), // cachepolicy
6010 DAG.getConstant(0, DL, MVT::i1), // idxen
6011 };
6012 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_raw_buffer_store ?
6013 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
6014 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
6015 MemSDNode *M = cast<MemSDNode>(Op);
6016 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6017 M->getMemoryVT(), M->getMemOperand());
6018 }
6019
6020 case Intrinsic::amdgcn_struct_buffer_store:
6021 case Intrinsic::amdgcn_struct_buffer_store_format: {
6022 SDValue VData = Op.getOperand(2);
6023 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6024 if (IsD16)
6025 VData = handleD16VData(VData, DAG);
6026 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
6027 SDValue Ops[] = {
6028 Chain,
6029 VData,
6030 Op.getOperand(3), // rsrc
6031 Op.getOperand(4), // vindex
6032 Offsets.first, // voffset
6033 Op.getOperand(6), // soffset
6034 Offsets.second, // offset
6035 Op.getOperand(7), // cachepolicy
6036 DAG.getConstant(1, DL, MVT::i1), // idxen
6037 };
6038 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ?
6039 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
6040 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
6041 MemSDNode *M = cast<MemSDNode>(Op);
6042 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6043 M->getMemoryVT(), M->getMemOperand());
6044 }
6045
6046 default: {
6047 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
6048 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
6049 return lowerImage(Op, ImageDimIntr, DAG);
6050
6051 return Op;
6052 }
6053 }
6054}
6055
6056// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
6057// offset (the offset that is included in bounds checking and swizzling, to be
6058// split between the instruction's voffset and immoffset fields) and soffset
6059// (the offset that is excluded from bounds checking and swizzling, to go in
6060// the instruction's soffset field). This function takes the first kind of
6061// offset and figures out how to split it between voffset and immoffset.
6062std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
6063 SDValue Offset, SelectionDAG &DAG) const {
6064 SDLoc DL(Offset);
6065 const unsigned MaxImm = 4095;
6066 SDValue N0 = Offset;
6067 ConstantSDNode *C1 = nullptr;
6068 if (N0.getOpcode() == ISD::ADD) {
6069 if ((C1 = dyn_cast<ConstantSDNode>(N0.getOperand(1))))
6070 N0 = N0.getOperand(0);
6071 } else if ((C1 = dyn_cast<ConstantSDNode>(N0)))
6072 N0 = SDValue();
6073
6074 if (C1) {
6075 unsigned ImmOffset = C1->getZExtValue();
6076 // If the immediate value is too big for the immoffset field, keep only the
6077 // value modulo 4096 in the immoffset field, so that the value copied/added
6078 // for the voffset field is a multiple of 4096 and stands more chance of
6079 // being CSEd with the copy/add for another similar load/store.
6080 // However, do not do that rounding down to a multiple of 4096 if that is a
6081 // negative number, as it appears to be illegal to have a negative offset
6082 // in the vgpr, even if adding the immediate offset makes it positive.
6083 unsigned Overflow = ImmOffset & ~MaxImm;
6084 ImmOffset -= Overflow;
6085 if ((int32_t)Overflow < 0) {
6086 Overflow += ImmOffset;
6087 ImmOffset = 0;
6088 }
6089 C1 = cast<ConstantSDNode>(DAG.getConstant(ImmOffset, DL, MVT::i32));
6090 if (Overflow) {
6091 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
6092 if (!N0)
6093 N0 = OverflowVal;
6094 else {
6095 SDValue Ops[] = { N0, OverflowVal };
6096 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
6097 }
6098 }
6099 }
6100 if (!N0)
6101 N0 = DAG.getConstant(0, DL, MVT::i32);
6102 if (!C1)
6103 C1 = cast<ConstantSDNode>(DAG.getConstant(0, DL, MVT::i32));
6104 return {N0, SDValue(C1, 0)};
6105}
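// Worked example (reader's annotation, not part of the original source): for
// a combined offset of (add %x, 5000) with MaxImm = 4095, Overflow = 4096 and
// ImmOffset = 904, so the function returns {(add %x, 4096), 904}; keeping the
// multiple of 4096 on the voffset side gives it a better chance of being CSEd
// with neighbouring accesses, as the comment above describes.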
6106
6107// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
6108// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
6109// pointed to by Offsets.
6110void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
6111 SelectionDAG &DAG, SDValue *Offsets,
6112 unsigned Align) const {
6113 SDLoc DL(CombinedOffset);
6114 if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
6115 uint32_t Imm = C->getZExtValue();
6116 uint32_t SOffset, ImmOffset;
6117 if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
6118 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
6119 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
6120 Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
6121 return;
6122 }
6123 }
6124 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
6125 SDValue N0 = CombinedOffset.getOperand(0);
6126 SDValue N1 = CombinedOffset.getOperand(1);
6127 uint32_t SOffset, ImmOffset;
6128 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
6129 if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
6130 Subtarget, Align)) {
6131 Offsets[0] = N0;
6132 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
6133 Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
6134 return;
6135 }
6136 }
6137 Offsets[0] = CombinedOffset;
6138 Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
6139 Offsets[2] = DAG.getConstant(0, DL, MVT::i32);
6140}
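// Reader's annotation (hedged summary, not part of the original source): a
// plain constant that AMDGPU::splitMUBUFOffset accepts yields
// {voffset = 0, soffset, instoffset}; a base-plus-constant whose non-negative
// constant part is splittable keeps the base in voffset; anything else falls
// through with the whole value in voffset and zero soffset/instoffset.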
6141
6142static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
6143 ISD::LoadExtType ExtType, SDValue Op,
6144 const SDLoc &SL, EVT VT) {
6145 if (VT.bitsLT(Op.getValueType()))
6146 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
6147
6148 switch (ExtType) {
6149 case ISD::SEXTLOAD:
6150 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
6151 case ISD::ZEXTLOAD:
6152 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
6153 case ISD::EXTLOAD:
6154 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
6155 case ISD::NON_EXTLOAD:
6156 return Op;
6157 }
6158
6159 llvm_unreachable("invalid ext type");
6160}
6161
6162SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
6163 SelectionDAG &DAG = DCI.DAG;
6164 if (Ld->getAlignment() < 4 || Ld->isDivergent())
6165 return SDValue();
6166
6167 // FIXME: Constant loads should all be marked invariant.
6168 unsigned AS = Ld->getAddressSpace();
6169 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
6170 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
6171 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
6172 return SDValue();
6173
6174 // Don't do this early, since it may interfere with adjacent load merging for
6175 // illegal types. We can avoid losing alignment information for exotic types
6176 // pre-legalize.
6177 EVT MemVT = Ld->getMemoryVT();
6178 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
6179 MemVT.getSizeInBits() >= 32)
6180 return SDValue();
6181
6182 SDLoc SL(Ld);
6183
6184 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
6185 "unexpected vector extload");
6186
6187 // TODO: Drop only high part of range.
6188 SDValue Ptr = Ld->getBasePtr();
6189 SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
6190 MVT::i32, SL, Ld->getChain(), Ptr,
6191 Ld->getOffset(),
6192 Ld->getPointerInfo(), MVT::i32,
6193 Ld->getAlignment(),
6194 Ld->getMemOperand()->getFlags(),
6195 Ld->getAAInfo(),
6196 nullptr); // Drop ranges
6197
6198 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
6199 if (MemVT.isFloatingPoint()) {
6200 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
6201 "unexpected fp extload");
6202 TruncVT = MemVT.changeTypeToInteger();
6203 }
6204
6205 SDValue Cvt = NewLoad;
6206 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
6207 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
6208 DAG.getValueType(TruncVT));
6209 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
6210 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
6211 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
6212 } else {
6213 assert(Ld->getExtensionType() == ISD::EXTLOAD);
6214 }
6215
6216 EVT VT = Ld->getValueType(0);
6217 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
6218
6219 DCI.AddToWorklist(Cvt.getNode());
6220
6221 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
6222 // the appropriate extension from the 32-bit load.
6223 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
6224 DCI.AddToWorklist(Cvt.getNode());
6225
6226 // Handle conversion back to floating point if necessary.
6227 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
6228
6229 return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
6230}
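// Illustrative example (reader's annotation, not part of the original
// source): an aligned sub-dword load from the constant address space, e.g. a
// zextload of i16, is rewritten here as a full 32-bit load followed by
// getZeroExtendInReg, roughly (zextload i16 p) -> (and (load i32 p), 0xffff),
// so the wider, natively supported 32-bit load path can be used.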
6231
6232SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
6233 SDLoc DL(Op);
6234 LoadSDNode *Load = cast<LoadSDNode>(Op);
6235 ISD::LoadExtType ExtType = Load->getExtensionType();
6236 EVT MemVT = Load->getMemoryVT();
6237
6238 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
6239 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
6240 return SDValue();
6241
6242 // FIXME: Copied from PPC
6243 // First, load into 32 bits, then truncate to 1 bit.
6244
6245 SDValue Chain = Load->getChain();
6246 SDValue BasePtr = Load->getBasePtr();
6247 MachineMemOperand *MMO = Load->getMemOperand();
6248
6249 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
6250
6251 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
6252 BasePtr, RealMemVT, MMO);
6253
6254 SDValue Ops[] = {
6255 DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
6256 NewLD.getValue(1)
6257 };
6258
6259 return DAG.getMergeValues(Ops, DL);
6260 }
6261
6262 if (!MemVT.isVector())
6263 return SDValue();
6264
6265 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
6266 "Custom lowering for non-i32 vectors hasn't been implemented.");
6267
6268 unsigned Alignment = Load->getAlignment();
6269 unsigned AS = Load->getAddressSpace();
6270 if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
6271 AS, Alignment)) {
6272 SDValue Ops[2];
6273 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
6274 return DAG.getMergeValues(Ops, DL);
6275 }
6276
6277 MachineFunction &MF = DAG.getMachineFunction();
6278 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6279 // If there is a possibility that flat instructions access scratch memory
6280 // then we need to use the same legalization rules we use for private.
6281 if (AS == AMDGPUAS::FLAT_ADDRESS)
6282 AS = MFI->hasFlatScratchInit() ?
6283 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
6284
6285 unsigned NumElements = MemVT.getVectorNumElements();
6286
6287 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
6288 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
6289 if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32)
6290 return SDValue();
6291 // Non-uniform loads will be selected to MUBUF instructions, so they
6292 // have the same legalization requirements as global and private
6293 // loads.
6294 //
6295 }
6296
6297 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
6298 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
6299 AS == AMDGPUAS::GLOBAL_ADDRESS) {
6300 if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
6301 !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
6302 Alignment >= 4 && NumElements < 32)
6303 return SDValue();
6304 // Non-uniform loads will be selected to MUBUF instructions, so they
6305 // have the same legalization requirements as global and private
6306 // loads.
6307 //
6308 }
6309 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
6310 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
6311 AS == AMDGPUAS::GLOBAL_ADDRESS ||
6312 AS == AMDGPUAS::FLAT_ADDRESS) {
6313 if (NumElements > 4)
6314 return SplitVectorLoad(Op, DAG);
6315 // v4 loads are supported for private and global memory.
6316 return SDValue();
6317 }
6318 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
6319 // Depending on the setting of the private_element_size field in the
6320 // resource descriptor, we can only make private accesses up to a certain
6321 // size.
6322 switch (Subtarget->getMaxPrivateElementSize()) {
6323 case 4:
6324 return scalarizeVectorLoad(Load, DAG);
6325 case 8:
6326 if (NumElements > 2)
6327 return SplitVectorLoad(Op, DAG);
6328 return SDValue();
6329 case 16:
6330 // Same as global/flat
6331 if (NumElements > 4)
6332 return SplitVectorLoad(Op, DAG);
6333 return SDValue();
6334 default:
6335 llvm_unreachable("unsupported private_element_size");
6336 }
6337 } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
6338 // Use ds_read_b128 if possible.
6339 if (Subtarget->useDS128() && Load->getAlignment() >= 16 &&
6340 MemVT.getStoreSize() == 16)
6341 return SDValue();
6342
6343 if (NumElements > 2)
6344 return SplitVectorLoad(Op, DAG);
6345
6346 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
6347 // address is negative, then the instruction is incorrectly treated as
6348 // out-of-bounds even if base + offset is in bounds. Split vectorized
6349 // loads here to avoid emitting ds_read2_b32. We may re-combine the
6350 // load later in the SILoadStoreOptimizer.
6351 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
6352 NumElements == 2 && MemVT.getStoreSize() == 8 &&
6353 Load->getAlignment() < 8) {
6354 return SplitVectorLoad(Op, DAG);
6355 }
6356 }
6357 return SDValue();
6358}
6359
6360SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
6361 EVT VT = Op.getValueType();
6362 assert(VT.getSizeInBits() == 64);
6363
6364 SDLoc DL(Op);
6365 SDValue Cond = Op.getOperand(0);
6366
6367 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
6368 SDValue One = DAG.getConstant(1, DL, MVT::i32);
6369
6370 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
6371 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
6372
6373 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
6374 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
6375
6376 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
6377
6378 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
6379 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
6380
6381 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
6382
6383 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
6384 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
6385}
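// Reader's annotation (illustrative, not part of the original source): a
// 64-bit select is decomposed into two 32-bit selects on the bitcast halves:
//   (select c, x:i64, y:i64)
//     -> (bitcast (build_vector (select c, x.lo, y.lo),
//                               (select c, x.hi, y.hi)))
// so each half can be selected with a 32-bit conditional move.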
6386
6387// Catch division cases where we can use shortcuts with rcp and rsq
6388// instructions.
6389SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
6390 SelectionDAG &DAG) const {
6391 SDLoc SL(Op);
6392 SDValue LHS = Op.getOperand(0);
6393 SDValue RHS = Op.getOperand(1);
6394 EVT VT = Op.getValueType();
6395 const SDNodeFlags Flags = Op->getFlags();
6396 bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal();
6397
6398 if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
6399 return SDValue();
6400
6401 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
6402 if (Unsafe || VT == MVT::f32 || VT == MVT::f16) {
6403 if (CLHS->isExactlyValue(1.0)) {
6404 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
6405 // the CI documentation has a worst case error of 1 ulp.
6406 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
6407 // use it as long as we aren't trying to use denormals.
6408 //
6409 // v_rcp_f16 and v_rsq_f16 DO support denormals.
6410
6411 // 1.0 / sqrt(x) -> rsq(x)
6412
6413 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
6414 // error seems really high at 2^29 ULP.
6415 if (RHS.getOpcode() == ISD::FSQRT)
6416 return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
6417
6418 // 1.0 / x -> rcp(x)
6419 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
6420 }
6421
6422 // Same as for 1.0, but expand the sign out of the constant.
6423 if (CLHS->isExactlyValue(-1.0)) {
6424 // -1.0 / x -> rcp (fneg x)
6425 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
6426 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
6427 }
6428 }
6429 }
6430
6431 if (Unsafe) {
6432 // Turn into multiply by the reciprocal.
6433 // x / y -> x * (1.0 / y)
6434 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
6435 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
6436 }
6437
6438 return SDValue();
6439}
6440
6441static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
6442 EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
6443 if (GlueChain->getNumValues() <= 1) {
6444 return DAG.getNode(Opcode, SL, VT, A, B);
6445 }
6446
6447 assert(GlueChain->getNumValues() == 3);
6448
6449 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
6450 switch (Opcode) {
6451 default: llvm_unreachable("no chain equivalent for opcode");
6452 case ISD::FMUL:
6453 Opcode = AMDGPUISD::FMUL_W_CHAIN;
6454 break;
6455 }
6456
6457 return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
6458 GlueChain.getValue(2));
6459}
6460
6461static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
6462 EVT VT, SDValue A, SDValue B, SDValue C,
6463 SDValue GlueChain) {
6464 if (GlueChain->getNumValues() <= 1) {
6465 return DAG.getNode(Opcode, SL, VT, A, B, C);
6466 }
6467
6468 assert(GlueChain->getNumValues() == 3);
6469
6470 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
6471 switch (Opcode) {
6472 default: llvm_unreachable("no chain equivalent for opcode");
6473 case ISD::FMA:
6474 Opcode = AMDGPUISD::FMA_W_CHAIN;
6475 break;
6476 }
6477
6478 return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
6479 GlueChain.getValue(2));
6480}
6481
6482SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
6483 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
6484 return FastLowered;
6485
6486 SDLoc SL(Op);
6487 SDValue Src0 = Op.getOperand(0);
6488 SDValue Src1 = Op.getOperand(1);
6489
6490 SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6491 SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6492
6493 SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
6494 SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
6495
6496 SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
6497 SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
6498
6499 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
6500}
6501
6502// Faster 2.5 ULP division that does not support denormals.
6503SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
6504 SDLoc SL(Op);
6505 SDValue LHS = Op.getOperand(1);
6506 SDValue RHS = Op.getOperand(2);
6507
6508 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
6509
6510 const APFloat K0Val(BitsToFloat(0x6f800000));
6511 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
6512
6513 const APFloat K1Val(BitsToFloat(0x2f800000));
6514 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
6515
6516 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
6517
6518 EVT SetCCVT =
6519 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
6520
6521 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
6522
6523 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
6524
6525 // TODO: Should this propagate fast-math-flags?
6526 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
6527
6528 // rcp does not support denormals.
6529 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
6530
6531 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
6532
6533 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
6534}
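// Reader's annotation (hedged): 0x6f800000 and 0x2f800000 appear to be the
// f32 bit patterns for 2^96 and 2^-32. When |RHS| exceeds 2^96 the
// denominator is pre-scaled by 2^-32 before the rcp so it stays in range, and
// the quotient is multiplied by the same factor afterwards, since
//   s * (LHS * rcp(RHS * s)) ~= LHS / RHS   for s = 2^-32.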
6535
6536SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
6537 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
6538 return FastLowered;
6539
6540 SDLoc SL(Op);
6541 SDValue LHS = Op.getOperand(0);
6542 SDValue RHS = Op.getOperand(1);
6543
6544 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
6545
6546 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
6547
6548 SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
6549 RHS, RHS, LHS);
6550 SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
6551 LHS, RHS, LHS);
6552
6553 // Denominator is scaled to not be denormal, so using rcp is ok.
6554 SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
6555 DenominatorScaled);
6556 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
6557 DenominatorScaled);
6558
6559 const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
6560 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
6561 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
6562
6563 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
6564
6565 if (!Subtarget->hasFP32Denormals()) {
6566 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
6567 const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
6568 SL, MVT::i32);
6569 SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
6570 DAG.getEntryNode(),
6571 EnableDenormValue, BitField);
6572 SDValue Ops[3] = {
6573 NegDivScale0,
6574 EnableDenorm.getValue(0),
6575 EnableDenorm.getValue(1)
6576 };
6577
6578 NegDivScale0 = DAG.getMergeValues(Ops, SL);
6579 }
6580
6581 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
6582 ApproxRcp, One, NegDivScale0);
6583
6584 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
6585 ApproxRcp, Fma0);
6586
6587 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
6588 Fma1, Fma1);
6589
6590 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
6591 NumeratorScaled, Mul);
6592
6593 SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
6594
6595 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
6596 NumeratorScaled, Fma3);
6597
6598 if (!Subtarget->hasFP32Denormals()) {
6599 const SDValue DisableDenormValue =
6600 DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
6601 SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
6602 Fma4.getValue(1),
6603 DisableDenormValue,
6604 BitField,
6605 Fma4.getValue(2));
6606
6607 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
6608 DisableDenorm, DAG.getRoot());
6609 DAG.setRoot(OutputChain);
6610 }
6611
6612 SDValue Scale = NumeratorScaled.getValue(1);
6613 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
6614 Fma4, Fma1, Fma3, Scale);
6615
6616 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
6617}
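// Reader's annotation (hedged): the Fma0..Fma4 chain above is a
// Newton-Raphson style refinement of the initial v_rcp approximation on the
// DIV_SCALE'd operands, with f32 denormal support temporarily enabled via
// SETREG when the subtarget otherwise flushes denormals; DIV_FMAS and
// DIV_FIXUP then apply the final scaling and special-case handling.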
6618
6619SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
6620 if (DAG.getTarget().Options.UnsafeFPMath)
6621 return lowerFastUnsafeFDIV(Op, DAG);
6622
6623 SDLoc SL(Op);
6624 SDValue X = Op.getOperand(0);
6625 SDValue Y = Op.getOperand(1);
6626
6627 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
6628
6629 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
6630
6631 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
6632
6633 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
6634
6635 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
6636
6637 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
6638
6639 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
6640
6641 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
6642
6643 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
6644
6645 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
6646 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
6647
6648 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
6649 NegDivScale0, Mul, DivScale1);
6650
6651 SDValue Scale;
6652
6653 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
6654 // Workaround a hardware bug on SI where the condition output from div_scale
6655 // is not usable.
6656
6657 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
6658
6659 // Figure out which scale to use for div_fmas.
6660 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
6661 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
6662 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
6663 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
6664
6665 SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
6666 SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
6667
6668 SDValue Scale0Hi
6669 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
6670 SDValue Scale1Hi
6671 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
6672
6673 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
6674 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
6675 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
6676 } else {
6677 Scale = DivScale1.getValue(1);
6678 }
6679
6680 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
6681 Fma4, Fma3, Mul, Scale);
6682
6683 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
6684}
6685
6686SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
6687 EVT VT = Op.getValueType();
6688
6689 if (VT == MVT::f32)
6690 return LowerFDIV32(Op, DAG);
6691
6692 if (VT == MVT::f64)
6693 return LowerFDIV64(Op, DAG);
6694
6695 if (VT == MVT::f16)
6696 return LowerFDIV16(Op, DAG);
6697
6698 llvm_unreachable("Unexpected type for fdiv");
6699}
6700
6701SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
6702 SDLoc DL(Op);
6703 StoreSDNode *Store = cast<StoreSDNode>(Op);
6704 EVT VT = Store->getMemoryVT();
6705
6706 if (VT == MVT::i1) {
6707 return DAG.getTruncStore(Store->getChain(), DL,
6708 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
6709 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
6710 }
6711
6712 assert(VT.isVector() &&
6713 Store->getValue().getValueType().getScalarType() == MVT::i32);
6714
6715 unsigned AS = Store->getAddressSpace();
6716 if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
6717 AS, Store->getAlignment())) {
6718 return expandUnalignedStore(Store, DAG);
6719 }
6720
6721 MachineFunction &MF = DAG.getMachineFunction();
6722 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6723 // If there is a possibility that flat instructions access scratch memory
6724 // then we need to use the same legalization rules we use for private.
6725 if (AS == AMDGPUAS::FLAT_ADDRESS)
6726 AS = MFI->hasFlatScratchInit() ?
6727 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
6728
6729 unsigned NumElements = VT.getVectorNumElements();
6730 if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
6731 AS == AMDGPUAS::FLAT_ADDRESS) {
6732 if (NumElements > 4)
6733 return SplitVectorStore(Op, DAG);
6734 return SDValue();
6735 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
6736 switch (Subtarget->getMaxPrivateElementSize()) {
6737 case 4:
6738 return scalarizeVectorStore(Store, DAG);
6739 case 8:
6740 if (NumElements > 2)
6741 return SplitVectorStore(Op, DAG);
6742 return SDValue();
6743 case 16:
6744 if (NumElements > 4)
6745 return SplitVectorStore(Op, DAG);
6746 return SDValue();
6747 default:
6748 llvm_unreachable("unsupported private_element_size");
6749 }
6750 } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
6751 // Use ds_write_b128 if possible.
6752 if (Subtarget->useDS128() && Store->getAlignment() >= 16 &&
6753 VT.getStoreSize() == 16)
6754 return SDValue();
6755
6756 if (NumElements > 2)
6757 return SplitVectorStore(Op, DAG);
6758
6759 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
6760 // address is negative, then the instruction is incorrectly treated as
6761 // out-of-bounds even if base + offset is in bounds. Split vectorized
6762 // stores here to avoid emitting ds_write2_b32. We may re-combine the
6763 // store later in the SILoadStoreOptimizer.
6764 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
6765 NumElements == 2 && VT.getStoreSize() == 8 &&
6766 Store->getAlignment() < 8) {
6767 return SplitVectorStore(Op, DAG);
6768 }
6769
6770 return SDValue();
6771 } else {
6772 llvm_unreachable("unhandled address space");
6773 }
6774}
6775
6776SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
6777 SDLoc DL(Op);
6778 EVT VT = Op.getValueType();
6779 SDValue Arg = Op.getOperand(0);
6780 SDValue TrigVal;
6781
6782 // TODO: Should this propagate fast-math-flags?
6783
6784 SDValue OneOver2Pi = DAG.getConstantFP(0.5 / M_PI, DL, VT);
6785
6786 if (Subtarget->hasTrigReducedRange()) {
6787 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
6788 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal);
6789 } else {
6790 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
6791 }
6792
6793 switch (Op.getOpcode()) {
6794 case ISD::FCOS:
6795 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal);
6796 case ISD::FSIN:
6797 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal);
6798 default:
6799 llvm_unreachable("Wrong trig opcode");
6800 }
6801}
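// Reader's annotation (hedged): the hardware sin/cos units take their input
// with a period of 1.0 rather than 2*pi, which is why the argument is first
// multiplied by 0.5/pi; on subtargets with a reduced trig input range the
// product is additionally wrapped into [0, 1) with FRACT before being fed to
// SIN_HW/COS_HW.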
6802
6803SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
6804 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
6805 assert(AtomicNode->isCompareAndSwap());
;
6806 unsigned AS = AtomicNode->getAddressSpace();
6807
6808 // No custom lowering required for local address space
6809 if (!isFlatGlobalAddrSpace(AS))
6810 return Op;
6811
6812 // Non-local address space requires custom lowering for atomic compare
6813 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
6814 SDLoc DL(Op);
6815 SDValue ChainIn = Op.getOperand(0);
6816 SDValue Addr = Op.getOperand(1);
6817 SDValue Old = Op.getOperand(2);
6818 SDValue New = Op.getOperand(3);
6819 EVT VT = Op.getValueType();
6820 MVT SimpleVT = VT.getSimpleVT();
6821 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
6822
6823 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
6824 SDValue Ops[] = { ChainIn, Addr, NewOld };
6825
6826 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
6827 Ops, VT, AtomicNode->getMemOperand());
6828}
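// Reader's annotation (hedged): packing {New, Old} into a two-element vector
// matches the way the flat/global cmpswap machine instructions expect the
// data and compare values as a single contiguous register pair.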
6829
6830//===----------------------------------------------------------------------===//
6831// Custom DAG optimizations
6832//===----------------------------------------------------------------------===//
6833
6834SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
6835 DAGCombinerInfo &DCI) const {
6836 EVT VT = N->getValueType(0);
6837 EVT ScalarVT = VT.getScalarType();
6838 if (ScalarVT != MVT::f32)
6839 return SDValue();
6840
6841 SelectionDAG &DAG = DCI.DAG;
6842 SDLoc DL(N);
6843
6844 SDValue Src = N->getOperand(0);
6845 EVT SrcVT = Src.getValueType();
6846
6847 // TODO: We could try to match extracting the higher bytes, which would be
6848 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
6849 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
6850 // about in practice.
6851 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
6852 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
6853 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
6854 DCI.AddToWorklist(Cvt.getNode());
6855 return Cvt;
6856 }
6857 }
6858
6859 return SDValue();
6860}
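// Illustrative example (reader's annotation, not part of the original
// source): after legalization, (uint_to_fp (and i32:x, 255)) has its upper 24
// bits known to be zero, so it is combined here into (CVT_F32_UBYTE0 x),
// avoiding the generic integer-to-float conversion sequence.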
6861
6862// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
6863
6864// This is a variant of
6865// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
6866//
6867// The normal DAG combiner will do this, but only if the add has one use since
6868// that would increase the number of instructions.
6869//
6870// This prevents us from seeing a constant offset that can be folded into a
6871// memory instruction's addressing mode. If we know the resulting add offset of
6872// a pointer can be folded into an addressing offset, we can replace the pointer
6873// operand with the add of new constant offset. This eliminates one of the uses,
6874// and may allow the remaining use to also be simplified.
6875//
6876SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
6877 unsigned AddrSpace,
6878 EVT MemVT,
6879 DAGCombinerInfo &DCI) const {
6880 SDValue N0 = N->getOperand(0);
6881 SDValue N1 = N->getOperand(1);
6882
6883 // We only do this to handle cases where it's profitable when there are
6884 // multiple uses of the add, so defer to the standard combine.
6885 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
6886 N0->hasOneUse())
6887 return SDValue();
6888
6889 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
6890 if (!CN1)
6891 return SDValue();
6892
6893 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
6894 if (!CAdd)
6895 return SDValue();
6896
6897 // If the resulting offset is too large, we can't fold it into the addressing
6898 // mode offset.
6899 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
6900 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
6901
6902 AddrMode AM;
6903 AM.HasBaseReg = true;
6904 AM.BaseOffs = Offset.getSExtValue();
6905 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
6906 return SDValue();
6907
6908 SelectionDAG &DAG = DCI.DAG;
6909 SDLoc SL(N);
6910 EVT VT = N->getValueType(0);
6911
6912 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
6913 SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);
6914
6915 SDNodeFlags Flags;
6916 Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
6917 (N0.getOpcode() == ISD::OR ||
6918 N0->getFlags().hasNoUnsignedWrap()));
6919
6920 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
6921}
6922
6923SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
6924 DAGCombinerInfo &DCI) const {
6925 SDValue Ptr = N->getBasePtr();
6926 SelectionDAG &DAG = DCI.DAG;
6927 SDLoc SL(N);
6928
6929 // TODO: We could also do this for multiplies.
6930 if (Ptr.getOpcode() == ISD::SHL) {
6931 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
6932 N->getMemoryVT(), DCI);
6933 if (NewPtr) {
6934 SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
6935
6936 NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
6937 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
6938 }
6939 }
6940
6941 return SDValue();
6942}
6943
6944static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
6945 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
6946 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
6947 (Opc == ISD::XOR && Val == 0);
6948}
6949
6950// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
6951// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
6952// integer combine opportunities since most 64-bit operations are decomposed
6953// this way. TODO: We won't want this for SALU especially if it is an inline
6954// immediate.
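// For example (illustrative only): (and i64:x, 0x00000000ffffffff) splits into
// (and lo_32(x), 0xffffffff) and (and hi_32(x), 0x0); the low half passes
// through unchanged and the high half folds to zero, which is exactly the
// reducible case bitOpWithConstantIsReducible tests for.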
6955SDValue SITargetLowering::splitBinaryBitConstantOp(
6956 DAGCombinerInfo &DCI,
6957 const SDLoc &SL,
6958 unsigned Opc, SDValue LHS,
6959 const ConstantSDNode *CRHS) const {
6960 uint64_t Val = CRHS->getZExtValue();
6961 uint32_t ValLo = Lo_32(Val);
6962 uint32_t ValHi = Hi_32(Val);
6963 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
6964
6965 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
6966 bitOpWithConstantIsReducible(Opc, ValHi)) ||
6967 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
6968 // If we need to materialize a 64-bit immediate, it will be split up later
6969 // anyway. Avoid creating the harder to understand 64-bit immediate
6970 // materialization.
6971 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
6972 }
6973
6974 return SDValue();
6975}
6976
6977// Returns true if argument is a boolean value which is not serialized into
6978 // memory or an argument and does not require v_cndmask_b32 to be deserialized.
6979static bool isBoolSGPR(SDValue V) {
6980 if (V.getValueType() != MVT::i1)
6981 return false;
6982 switch (V.getOpcode()) {
6983 default: break;
6984 case ISD::SETCC:
6985 case ISD::AND:
6986 case ISD::OR:
6987 case ISD::XOR:
6988 case AMDGPUISD::FP_CLASS:
6989 return true;
6990 }
6991 return false;
6992}
6993
6994// If a constant has all zeroes or all ones within each byte return it.
6995// Otherwise return 0.
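// For example, 0x00ff00ff is returned unchanged (every byte is 0x00 or 0xff),
// while 0x00f000ff returns 0 because byte 2 (0xf0) only partially selects a byte.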
6996static uint32_t getConstantPermuteMask(uint32_t C) {
6997 // 0xff for any zero byte in the mask
6998 uint32_t ZeroByteMask = 0;
6999 if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
7000 if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
7001 if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
7002 if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
7003 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
7004 if ((NonZeroByteMask & C) != NonZeroByteMask)
7005 return 0; // Partial bytes selected.
7006 return C;
7007}
7008
7009// Check if a node selects whole bytes from its operand 0 starting at a byte
7010// boundary while masking the rest. Returns select mask as in the v_perm_b32
7011 // or -1 if it did not succeed.
7012// Note byte select encoding:
7013// value 0-3 selects corresponding source byte;
7014// value 0xc selects zero;
7015// value 0xff selects 0xff.
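// A few worked examples of the encoding (values chosen for illustration only):
//   (and x, 0x0000ffff) -> 0x0c0c0100 (bytes 3-2 zero, bytes 1-0 from source bytes 1-0)
//   (shl x, 8)          -> 0x0201000c (bytes 3-1 from source bytes 2-0, byte 0 zero)
//   (srl x, 16)         -> 0x0c0c0302 (bytes 3-2 zero, bytes 1-0 from source bytes 3-2)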
7016static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) {
7017 assert(V.getValueSizeInBits() == 32);
7018
7019 if (V.getNumOperands() != 2)
7020 return ~0;
7021
7022 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
7023 if (!N1)
7024 return ~0;
7025
7026 uint32_t C = N1->getZExtValue();
7027
7028 switch (V.getOpcode()) {
7029 default:
7030 break;
7031 case ISD::AND:
7032 if (uint32_t ConstMask = getConstantPermuteMask(C)) {
7033 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
7034 }
7035 break;
7036
7037 case ISD::OR:
7038 if (uint32_t ConstMask = getConstantPermuteMask(C)) {
7039 return (0x03020100 & ~ConstMask) | ConstMask;
7040 }
7041 break;
7042
7043 case ISD::SHL:
7044 if (C % 8)
7045 return ~0;
7046
7047 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
7048
7049 case ISD::SRL:
7050 if (C % 8)
7051 return ~0;
7052
7053 return uint32_t(0x0c0c0c0c03020100ull >> C);
7054 }
7055
7056 return ~0;
7057}
7058
7059SDValue SITargetLowering::performAndCombine(SDNode *N,
7060 DAGCombinerInfo &DCI) const {
7061 if (DCI.isBeforeLegalize())
7062 return SDValue();
7063
7064 SelectionDAG &DAG = DCI.DAG;
7065 EVT VT = N->getValueType(0);
7066 SDValue LHS = N->getOperand(0);
7067 SDValue RHS = N->getOperand(1);
7068
7069
7070 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
7071 if (VT == MVT::i64 && CRHS) {
7072 if (SDValue Split
7073 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
7074 return Split;
7075 }
7076
7077 if (CRHS && VT == MVT::i32) {
7078 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
7079 // nb = number of trailing zeroes in mask
7080 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
7081 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
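// For example (illustrative values): (and (srl x, 4), 0xff0) has Mask = 0xff0,
// Bits = 8, Shift = 4 and NB = 4, so Offset = 8 and the result becomes
// (shl (bfe x, 8, 8), 4), extracting the byte at bit 8 and shifting it back
// into place.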
7082 uint64_t Mask = CRHS->getZExtValue();
7083 unsigned Bits = countPopulation(Mask);
7084 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
7085 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
7086 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
7087 unsigned Shift = CShift->getZExtValue();
7088 unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
7089 unsigned Offset = NB + Shift;
7090 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
7091 SDLoc SL(N);
7092 SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
7093 LHS->getOperand(0),
7094 DAG.getConstant(Offset, SL, MVT::i32),
7095 DAG.getConstant(Bits, SL, MVT::i32));
7096 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
7097 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
7098 DAG.getValueType(NarrowVT));
7099 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
7100 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
7101 return Shl;
7102 }
7103 }
7104 }
7105
7106 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
7107 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
7108 isa<ConstantSDNode>(LHS.getOperand(2))) {
7109 uint32_t Sel = getConstantPermuteMask(Mask);
7110 if (!Sel)
7111 return SDValue();
7112
7113 // Select 0xc for all zero bytes
7114 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
7115 SDLoc DL(N);
7116 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
7117 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
7118 }
7119 }
7120
7121 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
7122 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
7123 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
7124 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
7125 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
7126
7127 SDValue X = LHS.getOperand(0);
7128 SDValue Y = RHS.getOperand(0);
7129 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
7130 return SDValue();
7131
7132 if (LCC == ISD::SETO) {
7133 if (X != LHS.getOperand(1))
7134 return SDValue();
7135
7136 if (RCC == ISD::SETUNE) {
7137 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
7138 if (!C1 || !C1->isInfinity() || C1->isNegative())
7139 return SDValue();
7140
7141 const uint32_t Mask = SIInstrFlags::N_NORMAL |
7142 SIInstrFlags::N_SUBNORMAL |
7143 SIInstrFlags::N_ZERO |
7144 SIInstrFlags::P_ZERO |
7145 SIInstrFlags::P_SUBNORMAL |
7146 SIInstrFlags::P_NORMAL;
7147
7148 static_assert(((~(SIInstrFlags::S_NAN |
7149 SIInstrFlags::Q_NAN |
7150 SIInstrFlags::N_INFINITY |
7151 SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
7152 "mask not equal");
7153
7154 SDLoc DL(N);
7155 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
7156 X, DAG.getConstant(Mask, DL, MVT::i32));
7157 }
7158 }
7159 }
7160
7161 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
7162 std::swap(LHS, RHS);
7163
7164 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
7165 RHS.hasOneUse()) {
7166 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
7167 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
7168 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
7169 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
7170 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
7171 (RHS.getOperand(0) == LHS.getOperand(0) &&
7172 LHS.getOperand(0) == LHS.getOperand(1))) {
7173 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
7174 unsigned NewMask = LCC == ISD::SETO ?
7175 Mask->getZExtValue() & ~OrdMask :
7176 Mask->getZExtValue() & OrdMask;
7177
7178 SDLoc DL(N);
7179 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
7180 DAG.getConstant(NewMask, DL, MVT::i32));
7181 }
7182 }
7183
7184 if (VT == MVT::i32 &&
7185 (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
7186 // and x, (sext cc from i1) => select cc, x, 0
7187 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
7188 std::swap(LHS, RHS);
7189 if (isBoolSGPR(RHS.getOperand(0)))
7190 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
7191 LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
7192 }
7193
7194 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
7195 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
7196 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
7197 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
7198 uint32_t LHSMask = getPermuteMask(DAG, LHS);
7199 uint32_t RHSMask = getPermuteMask(DAG, RHS);
7200 if (LHSMask != ~0u && RHSMask != ~0u) {
7201 // Canonicalize the expression in an attempt to have fewer unique masks
7202 // and therefore fewer registers used to hold the masks.
7203 if (LHSMask > RHSMask) {
7204 std::swap(LHSMask, RHSMask);
7205 std::swap(LHS, RHS);
7206 }
7207
7208 // Select 0xc for each lane used from source operand. Zero has 0xc mask
7209 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
7210 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7211 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7212
7213 // Check if we need to combine values from two sources within a byte.
7214 if (!(LHSUsedLanes & RHSUsedLanes) &&
7215 // If we select high and lower word keep it for SDWA.
7216 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
7217 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
7218 // Each byte in each mask is either a selector value 0-3, or has higher
7219 // bits set: 0xff for a 0xff byte and 0x0c for a zero byte. If 0x0c appears
7220 // in either mask, the result byte must be 0x0c. Otherwise the mask that is
7221 // not 0xff wins. ANDing both masks gives the correct result, except that
7222 // bytes that should be zero must be forced back to exactly 0x0c.
7223 uint32_t Mask = LHSMask & RHSMask;
7224 for (unsigned I = 0; I < 32; I += 8) {
7225 uint32_t ByteSel = 0xff << I;
7226 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
7227 Mask &= (0x0c << I) & 0xffffffff;
7228 }
7229
7230 // Add 4 to each active LHS lane. It will not affect any existing 0xff
7231 // or 0x0c.
7232 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
7233 SDLoc DL(N);
7234
7235 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
7236 LHS.getOperand(0), RHS.getOperand(0),
7237 DAG.getConstant(Sel, DL, MVT::i32));
7238 }
7239 }
7240 }
7241
7242 return SDValue();
7243}
7244
7245SDValue SITargetLowering::performOrCombine(SDNode *N,
7246 DAGCombinerInfo &DCI) const {
7247 SelectionDAG &DAG = DCI.DAG;
7248 SDValue LHS = N->getOperand(0);
7249 SDValue RHS = N->getOperand(1);
7250
7251 EVT VT = N->getValueType(0);
7252 if (VT == MVT::i1) {
7253 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
7254 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
7255 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
7256 SDValue Src = LHS.getOperand(0);
7257 if (Src != RHS.getOperand(0))
7258 return SDValue();
7259
7260 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
7261 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
7262 if (!CLHS || !CRHS)
7263 return SDValue();
7264
7265 // Only 10 bits are used.
7266 static const uint32_t MaxMask = 0x3ff;
7267
7268 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
7269 SDLoc DL(N);
7270 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
7271 Src, DAG.getConstant(NewMask, DL, MVT::i32));
7272 }
7273
7274 return SDValue();
7275 }
7276
7277 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
7278 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
7279 LHS.getOpcode() == AMDGPUISD::PERM &&
7280 isa<ConstantSDNode>(LHS.getOperand(2))) {
7281 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
7282 if (!Sel)
7283 return SDValue();
7284
7285 Sel |= LHS.getConstantOperandVal(2);
7286 SDLoc DL(N);
7287 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
7288 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
7289 }
7290
7291 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
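// A worked example (illustrative values): for (or (shl y, 8), (and x, 0xff)),
// getPermuteMask gives 0x0201000c for the shl and 0x0c0c0c00 for the and. The
// used-lane sets are disjoint, the zero bytes covered by the other operand are
// dropped, 4 is added to the shl's selectors, and the node becomes
// (perm y, x, 0x06050400), i.e. (y << 8) | (x & 0xff) in a single v_perm_b32.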
7292 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
7293 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
7294 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
7295 uint32_t LHSMask = getPermuteMask(DAG, LHS);
7296 uint32_t RHSMask = getPermuteMask(DAG, RHS);
7297 if (LHSMask != ~0u && RHSMask != ~0u) {
7298 // Canonicalize the expression in an attempt to have fewer unique masks
7299 // and therefore fewer registers used to hold the masks.
7300 if (LHSMask > RHSMask) {
7301 std::swap(LHSMask, RHSMask);
7302 std::swap(LHS, RHS);
7303 }
7304
7305 // Select 0xc for each lane used from source operand. Zero has 0xc mask
7306 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
7307 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7308 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7309
7310 // Check if we need to combine values from two sources within a byte.
7311 if (!(LHSUsedLanes & RHSUsedLanes) &&
7312 // If we select high and lower word keep it for SDWA.
7313 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
7314 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
7315 // Kill zero bytes selected by other mask. Zero value is 0xc.
7316 LHSMask &= ~RHSUsedLanes;
7317 RHSMask &= ~LHSUsedLanes;
7318 // Add 4 to each active LHS lane
7319 LHSMask |= LHSUsedLanes & 0x04040404;
7320 // Combine masks
7321 uint32_t Sel = LHSMask | RHSMask;
7322 SDLoc DL(N);
7323
7324 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
7325 LHS.getOperand(0), RHS.getOperand(0),
7326 DAG.getConstant(Sel, DL, MVT::i32));
7327 }
7328 }
7329 }
7330
7331 if (VT != MVT::i64)
7332 return SDValue();
7333
7334 // TODO: This could be a generic combine with a predicate for extracting the
7335 // high half of an integer being free.
7336
7337 // (or i64:x, (zero_extend i32:y)) ->
7338 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
7339 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
7340 RHS.getOpcode() != ISD::ZERO_EXTEND)
7341 std::swap(LHS, RHS);
7342
7343 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
7344 SDValue ExtSrc = RHS.getOperand(0);
7345 EVT SrcVT = ExtSrc.getValueType();
7346 if (SrcVT == MVT::i32) {
7347 SDLoc SL(N);
7348 SDValue LowLHS, HiBits;
7349 std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
7350 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
7351
7352 DCI.AddToWorklist(LowOr.getNode());
7353 DCI.AddToWorklist(HiBits.getNode());
7354
7355 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
7356 LowOr, HiBits);
7357 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
7358 }
7359 }
7360
7361 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
7362 if (CRHS) {
7363 if (SDValue Split
7364 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS))
7365 return Split;
7366 }
7367
7368 return SDValue();
7369}
7370
7371SDValue SITargetLowering::performXorCombine(SDNode *N,
7372 DAGCombinerInfo &DCI) const {
7373 EVT VT = N->getValueType(0);
7374 if (VT != MVT::i64)
7375 return SDValue();
7376
7377 SDValue LHS = N->getOperand(0);
7378 SDValue RHS = N->getOperand(1);
7379
7380 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
7381 if (CRHS) {
7382 if (SDValue Split
7383 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
7384 return Split;
7385 }
7386
7387 return SDValue();
7388}
7389
7390// Instructions that will be lowered with a final instruction that zeros the
7391// high result bits.
7392// XXX - probably only need to list legal operations.
7393static bool fp16SrcZerosHighBits(unsigned Opc) {
7394 switch (Opc) {
7395 case ISD::FADD:
7396 case ISD::FSUB:
7397 case ISD::FMUL:
7398 case ISD::FDIV:
7399 case ISD::FREM:
7400 case ISD::FMA:
7401 case ISD::FMAD:
7402 case ISD::FCANONICALIZE:
7403 case ISD::FP_ROUND:
7404 case ISD::UINT_TO_FP:
7405 case ISD::SINT_TO_FP:
7406 case ISD::FABS:
7407 // Fabs is lowered to a bit operation, but it's an and which will clear the
7408 // high bits anyway.
7409 case ISD::FSQRT:
7410 case ISD::FSIN:
7411 case ISD::FCOS:
7412 case ISD::FPOWI:
7413 case ISD::FPOW:
7414 case ISD::FLOG:
7415 case ISD::FLOG2:
7416 case ISD::FLOG10:
7417 case ISD::FEXP:
7418 case ISD::FEXP2:
7419 case ISD::FCEIL:
7420 case ISD::FTRUNC:
7421 case ISD::FRINT:
7422 case ISD::FNEARBYINT:
7423 case ISD::FROUND:
7424 case ISD::FFLOOR:
7425 case ISD::FMINNUM:
7426 case ISD::FMAXNUM:
7427 case AMDGPUISD::FRACT:
7428 case AMDGPUISD::CLAMP:
7429 case AMDGPUISD::COS_HW:
7430 case AMDGPUISD::SIN_HW:
7431 case AMDGPUISD::FMIN3:
7432 case AMDGPUISD::FMAX3:
7433 case AMDGPUISD::FMED3:
7434 case AMDGPUISD::FMAD_FTZ:
7435 case AMDGPUISD::RCP:
7436 case AMDGPUISD::RSQ:
7437 case AMDGPUISD::RCP_IFLAG:
7438 case AMDGPUISD::LDEXP:
7439 return true;
7440 default:
7441 // fcopysign, select and others may be lowered to 32-bit bit operations
7442 // which don't zero the high bits.
7443 return false;
7444 }
7445}
7446
7447SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
7448 DAGCombinerInfo &DCI) const {
7449 if (!Subtarget->has16BitInsts() ||
7450 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
7451 return SDValue();
7452
7453 EVT VT = N->getValueType(0);
7454 if (VT != MVT::i32)
7455 return SDValue();
7456
7457 SDValue Src = N->getOperand(0);
7458 if (Src.getValueType() != MVT::i16)
7459 return SDValue();
7460
7461 // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
7462 // FIXME: It is not universally true that the high bits are zeroed on gfx9.
7463 if (Src.getOpcode() == ISD::BITCAST) {
7464 SDValue BCSrc = Src.getOperand(0);
7465 if (BCSrc.getValueType() == MVT::f16 &&
7466 fp16SrcZerosHighBits(BCSrc.getOpcode()))
7467 return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
7468 }
7469
7470 return SDValue();
7471}
7472
7473SDValue SITargetLowering::performClassCombine(SDNode *N,
7474 DAGCombinerInfo &DCI) const {
7475 SelectionDAG &DAG = DCI.DAG;
7476 SDValue Mask = N->getOperand(1);
7477
7478 // fp_class x, 0 -> false
7479 if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
7480 if (CMask->isNullValue())
7481 return DAG.getConstant(0, SDLoc(N), MVT::i1);
7482 }
7483
7484 if (N->getOperand(0).isUndef())
7485 return DAG.getUNDEF(MVT::i1);
7486
7487 return SDValue();
7488}
7489
7490SDValue SITargetLowering::performRcpCombine(SDNode *N,
7491 DAGCombinerInfo &DCI) const {
7492 EVT VT = N->getValueType(0);
7493 SDValue N0 = N->getOperand(0);
7494
7495 if (N0.isUndef())
7496 return N0;
7497
7498 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
7499 N0.getOpcode() == ISD::SINT_TO_FP)) {
7500 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
7501 N->getFlags());
7502 }
7503
7504 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
7505}
7506
7507bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
7508 unsigned MaxDepth) const {
7509 unsigned Opcode = Op.getOpcode();
7510 if (Opcode == ISD::FCANONICALIZE)
7511 return true;
7512
7513 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
7514 auto F = CFP->getValueAPF();
7515 if (F.isNaN() && F.isSignaling())
7516 return false;
7517 return !F.isDenormal() || denormalsEnabledForType(Op.getValueType());
7518 }
7519
7520 // If source is a result of another standard FP operation it is already in
7521 // canonical form.
7522 if (MaxDepth == 0)
7523 return false;
7524
7525 switch (Opcode) {
7526 // These will flush denorms if required.
7527 case ISD::FADD:
7528 case ISD::FSUB:
7529 case ISD::FMUL:
7530 case ISD::FCEIL:
7531 case ISD::FFLOOR:
7532 case ISD::FMA:
7533 case ISD::FMAD:
7534 case ISD::FSQRT:
7535 case ISD::FDIV:
7536 case ISD::FREM:
7537 case ISD::FP_ROUND:
7538 case ISD::FP_EXTEND:
7539 case AMDGPUISD::FMUL_LEGACY:
7540 case AMDGPUISD::FMAD_FTZ:
7541 case AMDGPUISD::RCP:
7542 case AMDGPUISD::RSQ:
7543 case AMDGPUISD::RSQ_CLAMP:
7544 case AMDGPUISD::RCP_LEGACY:
7545 case AMDGPUISD::RSQ_LEGACY:
7546 case AMDGPUISD::RCP_IFLAG:
7547 case AMDGPUISD::TRIG_PREOP:
7548 case AMDGPUISD::DIV_SCALE:
7549 case AMDGPUISD::DIV_FMAS:
7550 case AMDGPUISD::DIV_FIXUP:
7551 case AMDGPUISD::FRACT:
7552 case AMDGPUISD::LDEXP:
7553 case AMDGPUISD::CVT_PKRTZ_F16_F32:
7554 case AMDGPUISD::CVT_F32_UBYTE0:
7555 case AMDGPUISD::CVT_F32_UBYTE1:
7556 case AMDGPUISD::CVT_F32_UBYTE2:
7557 case AMDGPUISD::CVT_F32_UBYTE3:
7558 return true;
7559
7560 // It can/will be lowered or combined as a bit operation.
7561 // Need to check their input recursively to handle.
7562 case ISD::FNEG:
7563 case ISD::FABS:
7564 case ISD::FCOPYSIGN:
7565 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
7566
7567 case ISD::FSIN:
7568 case ISD::FCOS:
7569 case ISD::FSINCOS:
7570 return Op.getValueType().getScalarType() != MVT::f16;
7571
7572 case ISD::FMINNUM:
7573 case ISD::FMAXNUM:
7574 case ISD::FMINNUM_IEEE:
7575 case ISD::FMAXNUM_IEEE:
7576 case AMDGPUISD::CLAMP:
7577 case AMDGPUISD::FMED3:
7578 case AMDGPUISD::FMAX3:
7579 case AMDGPUISD::FMIN3: {
7580 // FIXME: Shouldn't treat the generic operations differently based on these.
7581 // However, we aren't really required to flush the result from
7582 // minnum/maxnum..
7583
7584 // snans will be quieted, so we only need to worry about denormals.
7585 if (Subtarget->supportsMinMaxDenormModes() ||
7586 denormalsEnabledForType(Op.getValueType()))
7587 return true;
7588
7589 // Flushing may be required.
7590 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
7591 // targets need to check their input recursively.
7592
7593 // FIXME: Does this apply with clamp? It's implemented with max.
7594 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
7595 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
7596 return false;
7597 }
7598
7599 return true;
7600 }
7601 case ISD::SELECT: {
7602 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
7603 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
7604 }
7605 case ISD::BUILD_VECTOR: {
7606 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
7607 SDValue SrcOp = Op.getOperand(i);
7608 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
7609 return false;
7610 }
7611
7612 return true;
7613 }
7614 case ISD::EXTRACT_VECTOR_ELT:
7615 case ISD::EXTRACT_SUBVECTOR: {
7616 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
7617 }
7618 case ISD::INSERT_VECTOR_ELT: {
7619 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
7620 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
7621 }
7622 case ISD::UNDEF:
7623 // Could be anything.
7624 return false;
7625
7626 case ISD::BITCAST: {
7627 // Hack round the mess we make when legalizing extract_vector_elt
7628 SDValue Src = Op.getOperand(0);
7629 if (Src.getValueType() == MVT::i16 &&
7630 Src.getOpcode() == ISD::TRUNCATE) {
7631 SDValue TruncSrc = Src.getOperand(0);
7632 if (TruncSrc.getValueType() == MVT::i32 &&
7633 TruncSrc.getOpcode() == ISD::BITCAST &&
7634 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
7635 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
7636 }
7637 }
7638
7639 return false;
7640 }
7641 case ISD::INTRINSIC_WO_CHAIN: {
7642 unsigned IntrinsicID
7643 = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
7644 // TODO: Handle more intrinsics
7645 switch (IntrinsicID) {
7646 case Intrinsic::amdgcn_cvt_pkrtz:
7647 case Intrinsic::amdgcn_cubeid:
7648 case Intrinsic::amdgcn_frexp_mant:
7649 case Intrinsic::amdgcn_fdot2:
7650 return true;
7651 default:
7652 break;
7653 }
7654
7655 LLVM_FALLTHROUGH;
7656 }
7657 default:
7658 return denormalsEnabledForType(Op.getValueType()) &&
7659 DAG.isKnownNeverSNaN(Op);
7660 }
7661
7662 llvm_unreachable("invalid operation");
7663}
7664
7665// Constant fold canonicalize.
7666SDValue SITargetLowering::getCanonicalConstantFP(
7667 SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
7668 // Flush denormals to 0 if not enabled.
7669 if (C.isDenormal() && !denormalsEnabledForType(VT))
7670 return DAG.getConstantFP(0.0, SL, VT);
7671
7672 if (C.isNaN()) {
7673 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
7674 if (C.isSignaling()) {
7675 // Quiet a signaling NaN.
7676 // FIXME: Is this supposed to preserve payload bits?
7677 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
7678 }
7679
7680 // Make sure it is the canonical NaN bitpattern.
7681 //
7682 // TODO: Can we use -1 as the canonical NaN value since it's an inline
7683 // immediate?
7684 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
7685 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
7686 }
7687
7688 // Already canonical.
7689 return DAG.getConstantFP(C, SL, VT);
7690}
7691
7692static bool vectorEltWillFoldAway(SDValue Op) {
7693 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
7694}
7695
7696SDValue SITargetLowering::performFCanonicalizeCombine(
7697 SDNode *N,
7698 DAGCombinerInfo &DCI) const {
7699 SelectionDAG &DAG = DCI.DAG;
7700 SDValue N0 = N->getOperand(0);
7701 EVT VT = N->getValueType(0);
7702
7703 // fcanonicalize undef -> qnan
7704 if (N0.isUndef()) {
7705 APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
7706 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
7707 }
7708
7709 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
7710 EVT VT = N->getValueType(0);
7711 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
7712 }
7713
7714 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
7715 // (fcanonicalize k)
7716 //
7717 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
7718
7719 // TODO: This could be better with wider vectors that will be split to v2f16,
7720 // and to consider uses since there aren't that many packed operations.
7721 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
7722 isTypeLegal(MVT::v2f16)) {
7723 SDLoc SL(N);
7724 SDValue NewElts[2];
7725 SDValue Lo = N0.getOperand(0);
7726 SDValue Hi = N0.getOperand(1);
7727 EVT EltVT = Lo.getValueType();
7728
7729 if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
7730 for (unsigned I = 0; I != 2; ++I) {
7731 SDValue Op = N0.getOperand(I);
7732 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
7733 NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
7734 CFP->getValueAPF());
7735 } else if (Op.isUndef()) {
7736 // Handled below based on what the other operand is.
7737 NewElts[I] = Op;
7738 } else {
7739 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
7740 }
7741 }
7742
7743 // If one half is undef, and one is constant, prefer a splat vector rather
7744 // than the normal qNaN. If it's a register, prefer 0.0 since that's
7745 // cheaper to use and may be free with a packed operation.
7746 if (NewElts[0].isUndef()) {
7747 if (isa<ConstantFPSDNode>(NewElts[1]))
7748 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
7749 NewElts[1]: DAG.getConstantFP(0.0f, SL, EltVT);
7750 }
7751
7752 if (NewElts[1].isUndef()) {
7753 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
7754 NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
7755 }
7756
7757 return DAG.getBuildVector(VT, SL, NewElts);
7758 }
7759 }
7760
7761 unsigned SrcOpc = N0.getOpcode();
7762
7763 // If it's free to do so, push canonicalizes further up the source, which may
7764 // find a canonical source.
7765 //
7766 // TODO: More opcodes. Note this is unsafe for the _ieee minnum/maxnum for
7767 // sNaNs.
7768 if (SrcOpc == ISD::FMINNUM || SrcOpc == ISD::FMAXNUM) {
7769 auto *CRHS = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
7770 if (CRHS && N0.hasOneUse()) {
7771 SDLoc SL(N);
7772 SDValue Canon0 = DAG.getNode(ISD::FCANONICALIZE, SL, VT,
7773 N0.getOperand(0));
7774 SDValue Canon1 = getCanonicalConstantFP(DAG, SL, VT, CRHS->getValueAPF());
7775 DCI.AddToWorklist(Canon0.getNode());
7776
7777 return DAG.getNode(N0.getOpcode(), SL, VT, Canon0, Canon1);
7778 }
7779 }
7780
7781 return isCanonicalized(DAG, N0) ? N0 : SDValue();
7782}
7783
7784static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
7785 switch (Opc) {
7786 case ISD::FMAXNUM:
7787 case ISD::FMAXNUM_IEEE:
7788 return AMDGPUISD::FMAX3;
7789 case ISD::SMAX:
7790 return AMDGPUISD::SMAX3;
7791 case ISD::UMAX:
7792 return AMDGPUISD::UMAX3;
7793 case ISD::FMINNUM:
7794 case ISD::FMINNUM_IEEE:
7795 return AMDGPUISD::FMIN3;
7796 case ISD::SMIN:
7797 return AMDGPUISD::SMIN3;
7798 case ISD::UMIN:
7799 return AMDGPUISD::UMIN3;
7800 default:
7801 llvm_unreachable("Not a min/max opcode");
7802 }
7803}
7804
7805SDValue SITargetLowering::performIntMed3ImmCombine(
7806 SelectionDAG &DAG, const SDLoc &SL,
7807 SDValue Op0, SDValue Op1, bool Signed) const {
7808 ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
7809 if (!K1)
7810 return SDValue();
7811
7812 ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
7813 if (!K0)
7814 return SDValue();
7815
7816 if (Signed) {
7817 if (K0->getAPIntValue().sge(K1->getAPIntValue()))
7818 return SDValue();
7819 } else {
7820 if (K0->getAPIntValue().uge(K1->getAPIntValue()))
7821 return SDValue();
7822 }
7823
7824 EVT VT = K0->getValueType(0);
7825 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
7826 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
7827 return DAG.getNode(Med3Opc, SL, VT,
7828 Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
7829 }
7830
7831 // If there isn't a 16-bit med3 operation, convert to 32-bit.
7832 MVT NVT = MVT::i32;
7833 unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7834
7835 SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
7836 SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
7837 SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
7838
7839 SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
7840 return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
7841}
7842
7843static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
7844 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
7845 return C;
7846
7847 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
7848 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
7849 return C;
7850 }
7851
7852 return nullptr;
7853}
7854
7855SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
7856 const SDLoc &SL,
7857 SDValue Op0,
7858 SDValue Op1) const {
7859 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
7860 if (!K1)
7861 return SDValue();
7862
7863 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
7864 if (!K0)
7865 return SDValue();
7866
7867 // Ordered >= (although NaN inputs should have folded away by now).
7868 APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
7869 if (Cmp == APFloat::cmpGreaterThan)
7870 return SDValue();
7871
7872 // TODO: Check IEEE bit enabled?
7873 EVT VT = Op0.getValueType();
7874 if (Subtarget->enableDX10Clamp()) {
7875 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
7876 // hardware fmed3 behavior converting to a min.
7877 // FIXME: Should this be allowing -0.0?
7878 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
7879 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
7880 }
7881
7882 // med3 for f16 is only available on gfx9+, and not available for v2f16.
7883 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
7884 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
7885 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
7886 // then give the other result, which is different from med3 with a NaN
7887 // input.
7888 SDValue Var = Op0.getOperand(0);
7889 if (!DAG.isKnownNeverSNaN(Var))
7890 return SDValue();
7891
7892 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
7893
7894 if ((!K0->hasOneUse() ||
7895 TII->isInlineConstant(K0->getValueAPF().bitcastToAPInt())) &&
7896 (!K1->hasOneUse() ||
7897 TII->isInlineConstant(K1->getValueAPF().bitcastToAPInt()))) {
7898 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
7899 Var, SDValue(K0, 0), SDValue(K1, 0));
7900 }
7901 }
7902
7903 return SDValue();
7904}
7905
7906SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
7907 DAGCombinerInfo &DCI) const {
7908 SelectionDAG &DAG = DCI.DAG;
7909
7910 EVT VT = N->getValueType(0);
7911 unsigned Opc = N->getOpcode();
7912 SDValue Op0 = N->getOperand(0);
7913 SDValue Op1 = N->getOperand(1);
7914
7915 // Only do this if the inner op has one use since this will just increase
7916 // register pressure for no benefit.
7917
7918
7919 if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
7920 !VT.isVector() && VT != MVT::f64 &&
7921 ((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) {
7922 // max(max(a, b), c) -> max3(a, b, c)
7923 // min(min(a, b), c) -> min3(a, b, c)
7924 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
7925 SDLoc DL(N);
7926 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
7927 DL,
7928 N->getValueType(0),
7929 Op0.getOperand(0),
7930 Op0.getOperand(1),
7931 Op1);
7932 }
7933
7934 // Try commuted.
7935 // max(a, max(b, c)) -> max3(a, b, c)
7936 // min(a, min(b, c)) -> min3(a, b, c)
7937 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
7938 SDLoc DL(N);
7939 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
7940 DL,
7941 N->getValueType(0),
7942 Op0,
7943 Op1.getOperand(0),
7944 Op1.getOperand(1));
7945 }
7946 }
7947
7948 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
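// For example, the common byte clamp smin(smax(x, 0), 255) matches with K0 = 0
// and K1 = 255, so it becomes a single med3(x, 0, 255).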
7949 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
7950 if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
7951 return Med3;
7952 }
7953
7954 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
7955 if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
7956 return Med3;
7957 }
7958
7959 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
7960 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
7961 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
7962 (Opc == AMDGPUISD::FMIN_LEGACY &&
7963 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
7964 (VT == MVT::f32 || VT == MVT::f64 ||
7965 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
7966 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
7967 Op0.hasOneUse()) {
7968 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
7969 return Res;
7970 }
7971
7972 return SDValue();
7973}
7974
7975static bool isClampZeroToOne(SDValue A, SDValue B) {
7976 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
7977 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
7978 // FIXME: Should this be allowing -0.0?
7979 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
7980 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
7981 }
7982 }
7983
7984 return false;
7985}
7986
7987// FIXME: Should only worry about snans for version with chain.
7988SDValue SITargetLowering::performFMed3Combine(SDNode *N,
7989 DAGCombinerInfo &DCI) const {
7990 EVT VT = N->getValueType(0);
7991 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
7992 // NaNs. With a NaN input, the order of the operands may change the result.
7993
7994 SelectionDAG &DAG = DCI.DAG;
7995 SDLoc SL(N);
7996
7997 SDValue Src0 = N->getOperand(0);
7998 SDValue Src1 = N->getOperand(1);
7999 SDValue Src2 = N->getOperand(2);
8000
8001 if (isClampZeroToOne(Src0, Src1)) {
8002 // const_a, const_b, x -> clamp is safe in all cases including signaling
8003 // nans.
8004 // FIXME: Should this be allowing -0.0?
8005 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
8006 }
8007
8008 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
8009 // handling no dx10-clamp?
8010 if (Subtarget->enableDX10Clamp()) {
8011 // If NaNs are clamped to 0, we are free to reorder the inputs.
8012
8013 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
8014 std::swap(Src0, Src1);
8015
8016 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
8017 std::swap(Src1, Src2);
8018
8019 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
8020 std::swap(Src0, Src1);
8021
8022 if (isClampZeroToOne(Src1, Src2))
8023 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
8024 }
8025
8026 return SDValue();
8027}
8028
8029SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
8030 DAGCombinerInfo &DCI) const {
8031 SDValue Src0 = N->getOperand(0);
8032 SDValue Src1 = N->getOperand(1);
8033 if (Src0.isUndef() && Src1.isUndef())
8034 return DCI.DAG.getUNDEF(N->getValueType(0));
8035 return SDValue();
8036}
8037
8038SDValue SITargetLowering::performExtractVectorEltCombine(
8039 SDNode *N, DAGCombinerInfo &DCI) const {
8040 SDValue Vec = N->getOperand(0);
8041 SelectionDAG &DAG = DCI.DAG;
8042
8043 EVT VecVT = Vec.getValueType();
8044 EVT EltVT = VecVT.getVectorElementType();
8045
8046 if ((Vec.getOpcode() == ISD::FNEG ||
8047 Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
8048 SDLoc SL(N);
8049 EVT EltVT = N->getValueType(0);
8050 SDValue Idx = N->getOperand(1);
8051 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
8052 Vec.getOperand(0), Idx);
8053 return DAG.getNode(Vec.getOpcode(), SL, EltVT, Elt);
8054 }
8055
8056 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
8057 // =>
8058 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
8059 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
8060 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
8061 if (Vec.hasOneUse() && DCI.isBeforeLegalize()) {
8062 SDLoc SL(N);
8063 EVT EltVT = N->getValueType(0);
8064 SDValue Idx = N->getOperand(1);
8065 unsigned Opc = Vec.getOpcode();
8066
8067 switch(Opc) {
8068 default:
8069 return SDValue();
8070 // TODO: Support other binary operations.
8071 case ISD::FADD:
8072 case ISD::FSUB:
8073 case ISD::FMUL:
8074 case ISD::ADD:
8075 case ISD::UMIN:
8076 case ISD::UMAX:
8077 case ISD::SMIN:
8078 case ISD::SMAX:
8079 case ISD::FMAXNUM:
8080 case ISD::FMINNUM:
8081 case ISD::FMAXNUM_IEEE:
8082 case ISD::FMINNUM_IEEE: {
8083 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
8084 Vec.getOperand(0), Idx);
8085 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
8086 Vec.getOperand(1), Idx);
8087
8088 DCI.AddToWorklist(Elt0.getNode());
8089 DCI.AddToWorklist(Elt1.getNode());
8090 return DAG.getNode(Opc, SL, EltVT, Elt0, Elt1, Vec->getFlags());
8091 }
8092 }
8093 }
8094
8095 if (!DCI.isBeforeLegalize())
8096 return SDValue();
8097
8098 unsigned VecSize = VecVT.getSizeInBits();
8099 unsigned EltSize = EltVT.getSizeInBits();
8100
8101 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
8102 // elements. This exposes more load reduction opportunities by replacing
8103 // multiple small extract_vector_elements with a single 32-bit extract.
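// For example (assuming a v8i16 vector loaded from memory): extracting element
// 5 has BitIndex = 80, so the vector is bitcast to the equivalent v4i32, 32-bit
// element 2 is extracted, shifted right by the leftover 16 bits and truncated
// back to i16.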
8104 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
8105 if (isa<MemSDNode>(Vec) &&
8106 EltSize <= 16 &&
8107 EltVT.isByteSized() &&
8108 VecSize > 32 &&
8109 VecSize % 32 == 0 &&
8110 Idx) {
8111 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
8112
8113 unsigned BitIndex = Idx->getZExtValue() * EltSize;
8114 unsigned EltIdx = BitIndex / 32;
8115 unsigned LeftoverBitIdx = BitIndex % 32;
8116 SDLoc SL(N);
8117
8118 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
8119 DCI.AddToWorklist(Cast.getNode());
8120
8121 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
8122 DAG.getConstant(EltIdx, SL, MVT::i32));
8123 DCI.AddToWorklist(Elt.getNode());
8124 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
8125 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
8126 DCI.AddToWorklist(Srl.getNode());
8127
8128 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, EltVT.changeTypeToInteger(), Srl);
8129 DCI.AddToWorklist(Trunc.getNode());
8130 return DAG.getNode(ISD::BITCAST, SL, EltVT, Trunc);
8131 }
8132
8133 return SDValue();
8134}
8135
8136static bool convertBuildVectorCastElt(SelectionDAG &DAG,
8137 SDValue &Lo, SDValue &Hi) {
8138 if (Hi.getOpcode() == ISD::BITCAST &&
8139 Hi.getOperand(0).getValueType() == MVT::f16 &&
8140 (isa<ConstantSDNode>(Lo) || Lo.isUndef())) {
8141 Lo = DAG.getNode(ISD::BITCAST, SDLoc(Lo), MVT::f16, Lo);
8142 Hi = Hi.getOperand(0);
8143 return true;
8144 }
8145
8146 return false;
8147}
8148
8149SDValue SITargetLowering::performBuildVectorCombine(
8150 SDNode *N, DAGCombinerInfo &DCI) const {
8151 SDLoc SL(N);
8152
8153 if (!isTypeLegal(MVT::v2i16))
8154 return SDValue();
8155 SelectionDAG &DAG = DCI.DAG;
8156 EVT VT = N->getValueType(0);
8157
8158 if (VT == MVT::v2i16) {
8159 SDValue Lo = N->getOperand(0);
8160 SDValue Hi = N->getOperand(1);
8161
8162 // v2i16 build_vector (const|undef), (bitcast f16:$x)
8163 // -> bitcast (v2f16 build_vector const|undef, $x)
8164 if (convertBuildVectorCastElt(DAG, Lo, Hi)) {
8165 SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Lo, Hi });
8166 return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
8167 }
8168
8169 if (convertBuildVectorCastElt(DAG, Hi, Lo)) {
8170 SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Hi, Lo });
8171 return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
8172 }
8173 }
8174
8175 return SDValue();
8176}
8177
8178unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
8179 const SDNode *N0,
8180 const SDNode *N1) const {
8181 EVT VT = N0->getValueType(0);
8182
8183 // Only do this if we are not trying to support denormals. v_mad_f32 does not
8184 // support denormals ever.
8185 if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
8186 (VT == MVT::f16 && !Subtarget->hasFP16Denormals()))
8187 return ISD::FMAD;
8188
8189 const TargetOptions &Options = DAG.getTarget().Options;
8190 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
8191 (N0->getFlags().hasAllowContract() &&
8192 N1->getFlags().hasAllowContract())) &&
8193 isFMAFasterThanFMulAndFAdd(VT)) {
8194 return ISD::FMA;
8195 }
8196
8197 return 0;
8198}
8199
8200static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
8201 EVT VT,
8202 SDValue N0, SDValue N1, SDValue N2,
8203 bool Signed) {
8204 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
8205 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
8206 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
8207 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
8208}
8209
8210SDValue SITargetLowering::performAddCombine(SDNode *N,
8211 DAGCombinerInfo &DCI) const {
8212 SelectionDAG &DAG = DCI.DAG;
8213 EVT VT = N->getValueType(0);
8214 SDLoc SL(N);
8215 SDValue LHS = N->getOperand(0);
8216 SDValue RHS = N->getOperand(1);
8217
8218 if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL)
8219 && Subtarget->hasMad64_32() &&
8220 !VT.isVector() && VT.getScalarSizeInBits() > 32 &&
8221 VT.getScalarSizeInBits() <= 64) {
8222 if (LHS.getOpcode() != ISD::MUL)
8223 std::swap(LHS, RHS);
8224
8225 SDValue MulLHS = LHS.getOperand(0);
8226 SDValue MulRHS = LHS.getOperand(1);
8227 SDValue AddRHS = RHS;
8228
8229 // TODO: Maybe restrict if SGPR inputs.
8230 if (numBitsUnsigned(MulLHS, DAG) <= 32 &&
8231 numBitsUnsigned(MulRHS, DAG) <= 32) {
8232 MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32);
8233 MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32);
8234 AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64);
8235 return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false);
8236 }
8237
8238 if (numBitsSigned(MulLHS, DAG) < 32 && numBitsSigned(MulRHS, DAG) < 32) {
8239 MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32);
8240 MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
8241 AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
8242 return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);
8243 }
8244
8245 return SDValue();
8246 }
8247
8248 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
8249 return SDValue();
8250
8251 // add x, zext (setcc) => addcarry x, 0, setcc
8252 // add x, sext (setcc) => subcarry x, 0, setcc
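// (sext from i1 produces 0 or -1, so adding it is the same as subtracting the
// zero-extended bit, which is why the sext form maps to subcarry.)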
8253 unsigned Opc = LHS.getOpcode();
8254 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
8255 Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY)
8256 std::swap(RHS, LHS);
8257
8258 Opc = RHS.getOpcode();
8259 switch (Opc) {
8260 default: break;
8261 case ISD::ZERO_EXTEND:
8262 case ISD::SIGN_EXTEND:
8263 case ISD::ANY_EXTEND: {
8264 auto Cond = RHS.getOperand(0);
8265 if (!isBoolSGPR(Cond))
8266 break;
8267 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
8268 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
8269 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY;
8270 return DAG.getNode(Opc, SL, VTList, Args);
8271 }
8272 case ISD::ADDCARRY: {
8273 // add x, (addcarry y, 0, cc) => addcarry x, y, cc
8274 auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
8275 if (!C || C->getZExtValue() != 0) break;
8276 SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
8277 return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args);
8278 }
8279 }
8280 return SDValue();
8281}
8282
8283SDValue SITargetLowering::performSubCombine(SDNode *N,
8284 DAGCombinerInfo &DCI) const {
8285 SelectionDAG &DAG = DCI.DAG;
8286 EVT VT = N->getValueType(0);
8287
8288 if (VT != MVT::i32)
8289 return SDValue();
8290
8291 SDLoc SL(N);
8292 SDValue LHS = N->getOperand(0);
8293 SDValue RHS = N->getOperand(1);
8294
8295 unsigned Opc = LHS.getOpcode();
8296 if (Opc != ISD::SUBCARRY)
8297 std::swap(RHS, LHS);
8298
8299 if (LHS.getOpcode() == ISD::SUBCARRY) {
8300 // sub (subcarry x, 0, cc), y => subcarry x, y, cc
8301 auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
8302 if (!C || C->getZExtValue() != 0)
8303 return SDValue();
8304 SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
8305 return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
8306 }
8307 return SDValue();
8308}
8309
8310SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
8311 DAGCombinerInfo &DCI) const {
8312
8313 if (N->getValueType(0) != MVT::i32)
8314 return SDValue();
8315
8316 auto C = dyn_cast<ConstantSDNode>(N->getOperand(1));
8317 if (!C || C->getZExtValue() != 0)
8318 return SDValue();
8319
8320 SelectionDAG &DAG = DCI.DAG;
8321 SDValue LHS = N->getOperand(0);
8322
8323 // addcarry (add x, y), 0, cc => addcarry x, y, cc
8324 // subcarry (sub x, y), 0, cc => subcarry x, y, cc
8325 unsigned LHSOpc = LHS.getOpcode();
8326 unsigned Opc = N->getOpcode();
8327 if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) ||
8328 (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) {
8329 SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
8330 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
8331 }
8332 return SDValue();
8333}
8334
8335SDValue SITargetLowering::performFAddCombine(SDNode *N,
8336 DAGCombinerInfo &DCI) const {
8337 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
8338 return SDValue();
8339
8340 SelectionDAG &DAG = DCI.DAG;
8341 EVT VT = N->getValueType(0);
8342
8343 SDLoc SL(N);
8344 SDValue LHS = N->getOperand(0);
8345 SDValue RHS = N->getOperand(1);
8346
8347 // These should really be instruction patterns, but writing patterns with
8348 // source modifiers is a pain.
8349
8350 // fadd (fadd (a, a), b) -> mad 2.0, a, b
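// ((a + a) + b is 2.0 * a + b, so it maps onto a single fma/fmad with an
// inline immediate of 2.0.)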
8351 if (LHS.getOpcode() == ISD::FADD) {
8352 SDValue A = LHS.getOperand(0);
8353 if (A == LHS.getOperand(1)) {
8354 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
8355 if (FusedOp != 0) {
8356 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
8357 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
8358 }
8359 }
8360 }
8361
8362 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
8363 if (RHS.getOpcode() == ISD::FADD) {
8364 SDValue A = RHS.getOperand(0);
8365 if (A == RHS.getOperand(1)) {
8366 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
8367 if (FusedOp != 0) {
8368 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
8369 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
8370 }
8371 }
8372 }
8373
8374 return SDValue();
8375}
8376
8377SDValue SITargetLowering::performFSubCombine(SDNode *N,
8378 DAGCombinerInfo &DCI) const {
8379 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
8380 return SDValue();
8381
8382 SelectionDAG &DAG = DCI.DAG;
8383 SDLoc SL(N);
8384 EVT VT = N->getValueType(0);
8385 assert(!VT.isVector());
8386
8387 // Try to get the fneg to fold into the source modifier. This undoes generic
8388 // DAG combines and folds them into the mad.
8389 //
8390 // Only do this if we are not trying to support denormals. v_mad_f32 does
8391 // not support denormals ever.
8392 SDValue LHS = N->getOperand(0);
8393 SDValue RHS = N->getOperand(1);
8394 if (LHS.getOpcode() == ISD::FADD) {
8395 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
8396 SDValue A = LHS.getOperand(0);
8397 if (A == LHS.getOperand(1)) {
8398 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
8399 if (FusedOp != 0){
8400 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
8401 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
8402
8403 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
8404 }
8405 }
8406 }
8407
8408 if (RHS.getOpcode() == ISD::FADD) {
8409 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
8410
8411 SDValue A = RHS.getOperand(0);
8412 if (A == RHS.getOperand(1)) {
8413 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
8414 if (FusedOp != 0){
8415 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
8416 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
8417 }
8418 }
8419 }
8420
8421 return SDValue();
8422}
8423
8424SDValue SITargetLowering::performFMACombine(SDNode *N,
8425 DAGCombinerInfo &DCI) const {
8426 SelectionDAG &DAG = DCI.DAG;
8427 EVT VT = N->getValueType(0);
8428 SDLoc SL(N);
8429
8430 if (!Subtarget->hasDLInsts() || VT != MVT::f32)
8431 return SDValue();
8432
8433 // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
8434 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
8435 SDValue Op1 = N->getOperand(0);
8436 SDValue Op2 = N->getOperand(1);
8437 SDValue FMA = N->getOperand(2);
8438
8439 if (FMA.getOpcode() != ISD::FMA ||
8440 Op1.getOpcode() != ISD::FP_EXTEND ||
8441 Op2.getOpcode() != ISD::FP_EXTEND)
8442 return SDValue();
8443
8444 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
8445 // regardless of the denorm mode setting. Therefore, unsafe-fp-math/fp-contract
8446 // is sufficient to allow generating fdot2.
8447 const TargetOptions &Options = DAG.getTarget().Options;
8448 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
8449 (N->getFlags().hasAllowContract() &&
8450 FMA->getFlags().hasAllowContract())) {
8451 Op1 = Op1.getOperand(0);
8452 Op2 = Op2.getOperand(0);
8453 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8454 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8455 return SDValue();
8456
8457 SDValue Vec1 = Op1.getOperand(0);
8458 SDValue Idx1 = Op1.getOperand(1);
8459 SDValue Vec2 = Op2.getOperand(0);
8460
8461 SDValue FMAOp1 = FMA.getOperand(0);
8462 SDValue FMAOp2 = FMA.getOperand(1);
8463 SDValue FMAAcc = FMA.getOperand(2);
8464
8465 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
8466 FMAOp2.getOpcode() != ISD::FP_EXTEND)
8467 return SDValue();
8468
8469 FMAOp1 = FMAOp1.getOperand(0);
8470 FMAOp2 = FMAOp2.getOperand(0);
8471 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8472 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8473 return SDValue();
8474
8475 SDValue Vec3 = FMAOp1.getOperand(0);
8476 SDValue Vec4 = FMAOp2.getOperand(0);
8477 SDValue Idx2 = FMAOp1.getOperand(1);
8478
8479 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
8480 // Idx1 and Idx2 cannot be the same.
8481 Idx1 == Idx2)
8482 return SDValue();
8483
8484 if (Vec1 == Vec2 || Vec3 == Vec4)
8485 return SDValue();
8486
8487 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
8488 return SDValue();
8489
8490 if ((Vec1 == Vec3 && Vec2 == Vec4) ||
8491 (Vec1 == Vec4 && Vec2 == Vec3)) {
8492 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
8493 DAG.getTargetConstant(0, SL, MVT::i1));
8494 }
8495 }
8496 return SDValue();
8497}
8498
8499SDValue SITargetLowering::performSetCCCombine(SDNode *N,
8500 DAGCombinerInfo &DCI) const {
8501 SelectionDAG &DAG = DCI.DAG;
8502 SDLoc SL(N);
8503
8504 SDValue LHS = N->getOperand(0);
8505 SDValue RHS = N->getOperand(1);
8506 EVT VT = LHS.getValueType();
8507 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
8508
8509 auto CRHS = dyn_cast<ConstantSDNode>(RHS);
8510 if (!CRHS) {
8511 CRHS = dyn_cast<ConstantSDNode>(LHS);
8512 if (CRHS) {
8513 std::swap(LHS, RHS);
8514 CC = getSetCCSwappedOperands(CC);
8515 }
8516 }
8517
8518 if (CRHS) {
8519 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
8520 isBoolSGPR(LHS.getOperand(0))) {
8521 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
8522 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
8523 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
8524 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
8525 if ((CRHS->isAllOnesValue() &&
8526 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
8527 (CRHS->isNullValue() &&
8528 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
8529 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
8530 DAG.getConstant(-1, SL, MVT::i1));
8531 if ((CRHS->isAllOnesValue() &&
8532 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
8533 (CRHS->isNullValue() &&
8534 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
8535 return LHS.getOperand(0);
8536 }
8537
8538 uint64_t CRHSVal = CRHS->getZExtValue();
8539 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
8540 LHS.getOpcode() == ISD::SELECT &&
8541 isa<ConstantSDNode>(LHS.getOperand(1)) &&
8542 isa<ConstantSDNode>(LHS.getOperand(2)) &&
8543 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
8544 isBoolSGPR(LHS.getOperand(0))) {
8545 // Given CT != FT:
8546 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
8547 // setcc (select cc, CT, CF), CF, ne => cc
8548 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
8549 // setcc (select cc, CT, CF), CT, eq => cc
8550 uint64_t CT = LHS.getConstantOperandVal(1);
8551 uint64_t CF = LHS.getConstantOperandVal(2);
8552
8553 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
8554 (CT == CRHSVal && CC == ISD::SETNE))
8555 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
8556 DAG.getConstant(-1, SL, MVT::i1));
8557 if ((CF == CRHSVal && CC == ISD::SETNE) ||
8558 (CT == CRHSVal && CC == ISD::SETEQ))
8559 return LHS.getOperand(0);
8560 }
8561 }
8562
8563 if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
8564 VT != MVT::f16))
8565 return SDValue();
8566
8567 // Match isinf/isfinite pattern
8568 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
8569 // (fcmp one (fabs x), inf) -> (fp_class x,
8570 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
8571 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
8572 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
8573 if (!CRHS)
8574 return SDValue();
8575
8576 const APFloat &APF = CRHS->getValueAPF();
8577 if (APF.isInfinity() && !APF.isNegative()) {
8578 const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
8579 SIInstrFlags::N_INFINITY;
8580 const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
8581 SIInstrFlags::P_ZERO |
8582 SIInstrFlags::N_NORMAL |
8583 SIInstrFlags::P_NORMAL |
8584 SIInstrFlags::N_SUBNORMAL |
8585 SIInstrFlags::P_SUBNORMAL;
8586 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
8587 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
8588 DAG.getConstant(Mask, SL, MVT::i32));
8589 }
8590 }
8591
8592 return SDValue();
8593}
8594
8595SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
8596 DAGCombinerInfo &DCI) const {
8597 SelectionDAG &DAG = DCI.DAG;
8598 SDLoc SL(N);
8599 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
8600
8601 SDValue Src = N->getOperand(0);
8602 SDValue Srl = N->getOperand(0);
8603 if (Srl.getOpcode() == ISD::ZERO_EXTEND)
8604 Srl = Srl.getOperand(0);
8605
8606 // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
8607 if (Srl.getOpcode() == ISD::SRL) {
8608 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
8609 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
8610 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
8611
8612 if (const ConstantSDNode *C =
8613 dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
8614 Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)),
8615 EVT(MVT::i32));
8616
8617 unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
8618 if (SrcOffset < 32 && SrcOffset % 8 == 0) {
8619 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL,
8620 MVT::f32, Srl);
8621 }
8622 }
8623 }
8624
8625 APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
8626
8627 KnownBits Known;
8628 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
8629 !DCI.isBeforeLegalizeOps());
8630 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8631 if (TLI.ShrinkDemandedConstant(Src, Demanded, TLO) ||
8632 TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {
8633 DCI.CommitTargetLoweringOpt(TLO);
8634 }
8635
8636 return SDValue();
8637}
8638
8639SDValue SITargetLowering::performClampCombine(SDNode *N,
8640 DAGCombinerInfo &DCI) const {
8641 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
8642 if (!CSrc)
8643 return SDValue();
8644
8645 const APFloat &F = CSrc->getValueAPF();
8646 APFloat Zero = APFloat::getZero(F.getSemantics());
8647 APFloat::cmpResult Cmp0 = F.compare(Zero);
8648 if (Cmp0 == APFloat::cmpLessThan ||
8649 (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
8650 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
8651 }
8652
8653 APFloat One(F.getSemantics(), "1.0");
8654 APFloat::cmpResult Cmp1 = F.compare(One);
8655 if (Cmp1 == APFloat::cmpGreaterThan)
8656 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
8657
8658 return SDValue(CSrc, 0);
8659}
8660
8661
8662SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
8663 DAGCombinerInfo &DCI) const {
8664 switch (N->getOpcode()) {
8665 default:
8666 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
8667 case ISD::ADD:
8668 return performAddCombine(N, DCI);
8669 case ISD::SUB:
8670 return performSubCombine(N, DCI);
8671 case ISD::ADDCARRY:
8672 case ISD::SUBCARRY:
8673 return performAddCarrySubCarryCombine(N, DCI);
8674 case ISD::FADD:
8675 return performFAddCombine(N, DCI);
8676 case ISD::FSUB:
8677 return performFSubCombine(N, DCI);
8678 case ISD::SETCC:
8679 return performSetCCCombine(N, DCI);
8680 case ISD::FMAXNUM:
8681 case ISD::FMINNUM:
8682 case ISD::FMAXNUM_IEEE:
8683 case ISD::FMINNUM_IEEE:
8684 case ISD::SMAX:
8685 case ISD::SMIN:
8686 case ISD::UMAX:
8687 case ISD::UMIN:
8688 case AMDGPUISD::FMIN_LEGACY:
8689 case AMDGPUISD::FMAX_LEGACY: {
8690 if (//DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
8691 getTargetMachine().getOptLevel() > CodeGenOpt::None)
8692 return performMinMaxCombine(N, DCI);
8693 break;
8694 }
8695 case ISD::FMA:
8696 return performFMACombine(N, DCI);
8697 case ISD::LOAD: {
8698 if (SDValue Widended = widenLoad(cast<LoadSDNode>(N), DCI))
8699 return Widended;
8700 LLVM_FALLTHROUGH;
8701 }
8702 case ISD::STORE:
8703 case ISD::ATOMIC_LOAD:
8704 case ISD::ATOMIC_STORE:
8705 case ISD::ATOMIC_CMP_SWAP:
8706 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
8707 case ISD::ATOMIC_SWAP:
8708 case ISD::ATOMIC_LOAD_ADD:
8709 case ISD::ATOMIC_LOAD_SUB:
8710 case ISD::ATOMIC_LOAD_AND:
8711 case ISD::ATOMIC_LOAD_OR:
8712 case ISD::ATOMIC_LOAD_XOR:
8713 case ISD::ATOMIC_LOAD_NAND:
8714 case ISD::ATOMIC_LOAD_MIN:
8715 case ISD::ATOMIC_LOAD_MAX:
8716 case ISD::ATOMIC_LOAD_UMIN:
8717 case ISD::ATOMIC_LOAD_UMAX:
8718 case AMDGPUISD::ATOMIC_INC:
8719 case AMDGPUISD::ATOMIC_DEC:
8720 case AMDGPUISD::ATOMIC_LOAD_FADD:
8721 case AMDGPUISD::ATOMIC_LOAD_FMIN:
8722 case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics.
8723 if (DCI.isBeforeLegalize())
8724 break;
8725 return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
8726 case ISD::AND:
8727 return performAndCombine(N, DCI);
8728 case ISD::OR:
8729 return performOrCombine(N, DCI);
8730 case ISD::XOR:
8731 return performXorCombine(N, DCI);
8732 case ISD::ZERO_EXTEND:
8733 return performZeroExtendCombine(N, DCI);
8734 case AMDGPUISD::FP_CLASS:
8735 return performClassCombine(N, DCI);
8736 case ISD::FCANONICALIZE:
8737 return performFCanonicalizeCombine(N, DCI);
8738 case AMDGPUISD::RCP:
8739 return performRcpCombine(N, DCI);
8740 case AMDGPUISD::FRACT:
8741 case AMDGPUISD::RSQ:
8742 case AMDGPUISD::RCP_LEGACY:
8743 case AMDGPUISD::RSQ_LEGACY:
8744 case AMDGPUISD::RCP_IFLAG:
8745 case AMDGPUISD::RSQ_CLAMP:
8746 case AMDGPUISD::LDEXP: {
8747 SDValue Src = N->getOperand(0);
8748 if (Src.isUndef())
8749 return Src;
8750 break;
8751 }
8752 case ISD::SINT_TO_FP:
8753 case ISD::UINT_TO_FP:
8754 return performUCharToFloatCombine(N, DCI);
8755 case AMDGPUISD::CVT_F32_UBYTE0:
8756 case AMDGPUISD::CVT_F32_UBYTE1:
8757 case AMDGPUISD::CVT_F32_UBYTE2:
8758 case AMDGPUISD::CVT_F32_UBYTE3:
8759 return performCvtF32UByteNCombine(N, DCI);
8760 case AMDGPUISD::FMED3:
8761 return performFMed3Combine(N, DCI);
8762 case AMDGPUISD::CVT_PKRTZ_F16_F32:
8763 return performCvtPkRTZCombine(N, DCI);
8764 case AMDGPUISD::CLAMP:
8765 return performClampCombine(N, DCI);
8766 case ISD::SCALAR_TO_VECTOR: {
8767 SelectionDAG &DAG = DCI.DAG;
8768 EVT VT = N->getValueType(0);
8769
8770 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
8771 if (VT == MVT::v2i16 || VT == MVT::v2f16) {
8772 SDLoc SL(N);
8773 SDValue Src = N->getOperand(0);
8774 EVT EltVT = Src.getValueType();
8775 if (EltVT == MVT::f16)
8776 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
8777
8778 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
8779 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
8780 }
8781
8782 break;
8783 }
8784 case ISD::EXTRACT_VECTOR_ELT:
8785 return performExtractVectorEltCombine(N, DCI);
8786 case ISD::BUILD_VECTOR:
8787 return performBuildVectorCombine(N, DCI);
8788 }
8789 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
8790}
8791
8792/// Helper function for adjustWritemask
8793static unsigned SubIdx2Lane(unsigned Idx) {
8794 switch (Idx) {
8795 default: return 0;
8796 case AMDGPU::sub0: return 0;
8797 case AMDGPU::sub1: return 1;
8798 case AMDGPU::sub2: return 2;
8799 case AMDGPU::sub3: return 3;
8800 }
8801}
8802
8803/// Adjust the writemask of MIMG instructions
8804SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
8805 SelectionDAG &DAG) const {
8806 unsigned Opcode = Node->getMachineOpcode();
8807
8808 // Subtract 1 because the vdata output is not a MachineSDNode operand.
8809 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
8810 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
6. Assuming 'D16Idx' is < 0
8811 return Node; // not implemented for D16
8812
8813 SDNode *Users[4] = { nullptr };
8814 unsigned Lane = 0;
8815 unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
8816 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
8817 unsigned NewDmask = 0;
8818 bool HasChain = Node->getNumValues() > 1;
7. Assuming the condition is false
8819
8820 if (OldDmask == 0) {
8. Assuming 'OldDmask' is not equal to 0
9. Taking false branch
8821 // These are folded out, but on the chance it happens don't assert.
8822 return Node;
8823 }
8824
8825 // Try to figure out the used register components
8826 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
10. Loop condition is true. Entering loop body
8827 I != E; ++I) {
8828
8829 // Don't look at users of the chain.
8830 if (I.getUse().getResNo() != 0)
11. Assuming the condition is false
12. Taking false branch
8831 continue;
8832
8833 // Abort if we can't understand the usage
8834 if (!I->isMachineOpcode() ||
13. Taking false branch
8835 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
8836 return Node;
8837
8838 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
8839 // Note that subregs are packed, i.e. Lane==0 is the first bit set
8840 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
8841 // set, etc.
8842 Lane = SubIdx2Lane(I->getConstantOperandVal(1));
8843
8844 // Set which texture component corresponds to the lane.
8845 unsigned Comp;
8846 for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
14. Loop condition is true. Entering loop body
20. Loop condition is true. Entering loop body
8847 Comp = countTrailingZeros(Dmask);
15. Calling 'countTrailingZeros<unsigned int>'
19. Returning from 'countTrailingZeros<unsigned int>'
21. Calling 'countTrailingZeros<unsigned int>'
28. Returning from 'countTrailingZeros<unsigned int>'
29. The value 32 is assigned to 'Comp'
8848 Dmask &= ~(1 << Comp);
30. The result of the left shift is undefined due to shifting by '32', which is greater or equal to the width of type 'int'
8849 }
8850
8851 // Abort if we have more than one user per component
8852 if (Users[Lane])
8853 return Node;
8854
8855 Users[Lane] = *I;
8856 NewDmask |= 1 << Comp;
8857 }
8858
8859 // Abort if there's no change
8860 if (NewDmask == OldDmask)
8861 return Node;
8862
8863 unsigned BitsSet = countPopulation(NewDmask);
8864
8865 int NewOpcode = AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), BitsSet);
8866 assert(NewOpcode != -1 &&
8867        NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
8868        "failed to find equivalent MIMG op");
8869
8870 // Adjust the writemask in the node
8871 SmallVector<SDValue, 12> Ops;
8872 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
8873 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
8874 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
8875
8876 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
8877
8878 MVT ResultVT = BitsSet == 1 ?
8879 SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet);
8880 SDVTList NewVTList = HasChain ?
8881 DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
8882
8883
8884 MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
8885 NewVTList, Ops);
8886
8887 if (HasChain) {
8888 // Update chain.
8889 DAG.setNodeMemRefs(NewNode, Node->memoperands());
8890 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
8891 }
8892
8893 if (BitsSet == 1) {
8894 assert(Node->hasNUsesOfValue(1, 0));
8895 SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
8896 SDLoc(Node), Users[Lane]->getValueType(0),
8897 SDValue(NewNode, 0));
8898 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
8899 return nullptr;
8900 }
8901
8902 // Update the users of the node with the new indices
8903 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
8904 SDNode *User = Users[i];
8905 if (!User)
8906 continue;
8907
8908 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
8909 DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
8910
8911 switch (Idx) {
8912 default: break;
8913 case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
8914 case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
8915 case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
8916 }
8917 }
8918
8919 DAG.RemoveDeadNode(Node);
8920 return nullptr;
8921}
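
Note on the flagged shift: the path above hinges on countTrailingZeros(Dmask) returning 32 once Dmask has no set bits left, at which point '1 << Comp' shifts an int by its full width (lines 8846-8848). As a minimal, hypothetical sketch — not the upstream fix, and laneToComponent is an invented helper name used only for illustration — the loop stays well defined if it asserts that Dmask still has a bit set before the shift and uses an unsigned literal:

#include <cassert>

#include "llvm/Support/MathExtras.h"

// Hypothetical helper mirroring the loop flagged above: map the Lane-th
// user back to the texture component it reads from the old dmask.
static unsigned laneToComponent(unsigned OldDmask, unsigned Lane) {
  unsigned Comp = 0;
  unsigned Dmask = OldDmask;
  for (unsigned i = 0; i <= Lane; ++i) {
    // With a set bit guaranteed, countTrailingZeros returns a value < 32,
    // so the shift below never reaches the width of the type.
    assert(Dmask != 0 && "more lanes than set dmask bits");
    Comp = llvm::countTrailingZeros(Dmask);
    Dmask &= ~(1u << Comp); // unsigned shift by Comp < 32: well defined
  }
  return Comp;
}
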
8922
8923static bool isFrameIndexOp(SDValue Op) {
8924 if (Op.getOpcode() == ISD::AssertZext)
8925 Op = Op.getOperand(0);
8926
8927 return isa<FrameIndexSDNode>(Op);
8928}
8929
8930/// Legalize target independent instructions (e.g. INSERT_SUBREG)
8931/// with frame index operands.
8932/// LLVM assumes that inputs to these instructions are registers.
8933SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
8934 SelectionDAG &DAG) const {
8935 if (Node->getOpcode() == ISD::CopyToReg) {
8936 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
8937 SDValue SrcVal = Node->getOperand(2);
8938
8939 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
8940 // to try understanding copies to physical registers.
8941 if (SrcVal.getValueType() == MVT::i1 &&
8942 TargetRegisterInfo::isPhysicalRegister(DestReg->getReg())) {
8943 SDLoc SL(Node);
8944 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
8945 SDValue VReg = DAG.getRegister(
8946 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
8947
8948 SDNode *Glued = Node->getGluedNode();
8949 SDValue ToVReg
8950 = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
8951 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
8952 SDValue ToResultReg
8953 = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
8954 VReg, ToVReg.getValue(1));
8955 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
8956 DAG.RemoveDeadNode(Node);
8957 return ToResultReg.getNode();
8958 }
8959 }
8960
8961 SmallVector<SDValue, 8> Ops;
8962 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
8963 if (!isFrameIndexOp(Node->getOperand(i))) {
8964 Ops.push_back(Node->getOperand(i));
8965 continue;
8966 }
8967
8968 SDLoc DL(Node);
8969 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
8970 Node->getOperand(i).getValueType(),
8971 Node->getOperand(i)), 0));
8972 }
8973
8974 return DAG.UpdateNodeOperands(Node, Ops);
8975}
8976
8977/// Fold the instructions after selecting them.
8978/// Returns null if users were already updated.
8979SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
8980 SelectionDAG &DAG) const {
8981 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
8982 unsigned Opcode = Node->getMachineOpcode();
8983
8984 if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
1. Assuming the condition is true
2. Assuming the condition is true
4. Taking true branch
8985 !TII->isGather4(Opcode)) {
3. Assuming the condition is true
8986 return adjustWritemask(Node, DAG);
5. Calling 'SITargetLowering::adjustWritemask'
8987 }
8988
8989 if (Opcode == AMDGPU::INSERT_SUBREG ||
8990 Opcode == AMDGPU::REG_SEQUENCE) {
8991 legalizeTargetIndependentNode(Node, DAG);
8992 return Node;
8993 }
8994
8995 switch (Opcode) {
8996 case AMDGPU::V_DIV_SCALE_F32:
8997 case AMDGPU::V_DIV_SCALE_F64: {
8998 // Satisfy the operand register constraint when one of the inputs is
8999 // undefined. Ordinarily each undef value will have its own implicit_def of
9000 // a vreg, so force these to use a single register.
9001 SDValue Src0 = Node->getOperand(0);
9002 SDValue Src1 = Node->getOperand(1);
9003 SDValue Src2 = Node->getOperand(2);
9004
9005 if ((Src0.isMachineOpcode() &&
9006 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
9007 (Src0 == Src1 || Src0 == Src2))
9008 break;
9009
9010 MVT VT = Src0.getValueType().getSimpleVT();
9011 const TargetRegisterClass *RC = getRegClassFor(VT);
9012
9013 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
9014 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
9015
9016 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
9017 UndefReg, Src0, SDValue());
9018
9019 // src0 must be the same register as src1 or src2, even if the value is
9020 // undefined, so make sure we don't violate this constraint.
9021 if (Src0.isMachineOpcode() &&
9022 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
9023 if (Src1.isMachineOpcode() &&
9024 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
9025 Src0 = Src1;
9026 else if (Src2.isMachineOpcode() &&
9027 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
9028 Src0 = Src2;
9029 else {
9030 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
9031 Src0 = UndefReg;
9032 Src1 = UndefReg;
9033 }
9034 } else
9035 break;
9036
9037 SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
9038 for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
9039 Ops.push_back(Node->getOperand(I));
9040
9041 Ops.push_back(ImpDef.getValue(1));
9042 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
9043 }
9044 default:
9045 break;
9046 }
9047
9048 return Node;
9049}
9050
9051/// Assign the register class depending on the number of
9052/// bits set in the writemask
9053void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
9054 SDNode *Node) const {
9055 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
9056
9057 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9058
9059 if (TII->isVOP3(MI.getOpcode())) {
9060 // Make sure constant bus requirements are respected.
9061 TII->legalizeOperandsVOP3(MRI, MI);
9062 return;
9063 }
9064
9065 // Replace unused atomics with the no return version.
9066 int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
9067 if (NoRetAtomicOp != -1) {
9068 if (!Node->hasAnyUseOfValue(0)) {
9069 MI.setDesc(TII->get(NoRetAtomicOp));
9070 MI.RemoveOperand(0);
9071 return;
9072 }
9073
9074 // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
9075 // instruction, because the return type of these instructions is a vec2 of
9076 // the memory type, so it can be tied to the input operand.
9077 // This means these instructions always have a use, so we need to add a
9078 // special case to check if the atomic has only one extract_subreg use,
9079 // which itself has no uses.
9080 if ((Node->hasNUsesOfValue(1, 0) &&
9081 Node->use_begin()->isMachineOpcode() &&
9082 Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
9083 !Node->use_begin()->hasAnyUseOfValue(0))) {
9084 unsigned Def = MI.getOperand(0).getReg();
9085
9086 // Change this into a noret atomic.
9087 MI.setDesc(TII->get(NoRetAtomicOp));
9088 MI.RemoveOperand(0);
9089
9090 // If we only remove the def operand from the atomic instruction, the
9091 // extract_subreg will be left with a use of a vreg without a def.
9092 // So we need to insert an implicit_def to avoid machine verifier
9093 // errors.
9094 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
9095 TII->get(AMDGPU::IMPLICIT_DEF), Def);
9096 }
9097 return;
9098 }
9099}
9100
9101static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
9102 uint64_t Val) {
9103 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
9104 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
9105}
9106
9107MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
9108 const SDLoc &DL,
9109 SDValue Ptr) const {
9110 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
9111
9112 // Build the half of the subregister with the constants before building the
9113 // full 128-bit register. If we are building multiple resource descriptors,
9114 // this will allow CSEing of the 2-component register.
9115 const SDValue Ops0[] = {
9116 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
9117 buildSMovImm32(DAG, DL, 0),
9118 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
9119 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
9120 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
9121 };
9122
9123 SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
9124 MVT::v2i32, Ops0), 0);
9125
9126 // Combine the constants and the pointer.
9127 const SDValue Ops1[] = {
9128 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
9129 Ptr,
9130 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
9131 SubRegHi,
9132 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
9133 };
9134
9135 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
9136}
9137
9138/// Return a resource descriptor with the 'Add TID' bit enabled
9139/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
9140/// of the resource descriptor) to create an offset, which is added to
9141/// the resource pointer.
9142MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
9143 SDValue Ptr, uint32_t RsrcDword1,
9144 uint64_t RsrcDword2And3) const {
9145 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
9146 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
9147 if (RsrcDword1) {
9148 PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
9149 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
9150 0);
9151 }
9152
9153 SDValue DataLo = buildSMovImm32(DAG, DL,
9154 RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
9155 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
9156
9157 const SDValue Ops[] = {
9158 DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
9159 PtrLo,
9160 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
9161 PtrHi,
9162 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
9163 DataLo,
9164 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
9165 DataHi,
9166 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
9167 };
9168
9169 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
9170}
9171
9172//===----------------------------------------------------------------------===//
9173// SI Inline Assembly Support
9174//===----------------------------------------------------------------------===//
9175
9176std::pair<unsigned, const TargetRegisterClass *>
9177SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
9178 StringRef Constraint,
9179 MVT VT) const {
9180 const TargetRegisterClass *RC = nullptr;
9181 if (Constraint.size() == 1) {
9182 switch (Constraint[0]) {
9183 default:
9184 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
9185 case 's':
9186 case 'r':
9187 switch (VT.getSizeInBits()) {
9188 default:
9189 return std::make_pair(0U, nullptr);
9190 case 32:
9191 case 16:
9192 RC = &AMDGPU::SReg_32_XM0RegClass;
9193 break;
9194 case 64:
9195 RC = &AMDGPU::SGPR_64RegClass;
9196 break;
9197 case 128:
9198 RC = &AMDGPU::SReg_128RegClass;
9199 break;
9200 case 256:
9201 RC = &AMDGPU::SReg_256RegClass;
9202 break;
9203 case 512:
9204 RC = &AMDGPU::SReg_512RegClass;
9205 break;
9206 }
9207 break;
9208 case 'v':
9209 switch (VT.getSizeInBits()) {
9210 default:
9211 return std::make_pair(0U, nullptr);
9212 case 32:
9213 case 16:
9214 RC = &AMDGPU::VGPR_32RegClass;
9215 break;
9216 case 64:
9217 RC = &AMDGPU::VReg_64RegClass;
9218 break;
9219 case 96:
9220 RC = &AMDGPU::VReg_96RegClass;
9221 break;
9222 case 128:
9223 RC = &AMDGPU::VReg_128RegClass;
9224 break;
9225 case 256:
9226 RC = &AMDGPU::VReg_256RegClass;
9227 break;
9228 case 512:
9229 RC = &AMDGPU::VReg_512RegClass;
9230 break;
9231 }
9232 break;
9233 }
9234 // We actually support i128, i16 and f16 as inline parameters
9235 // even if they are not reported as legal
9236 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
9237 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
9238 return std::make_pair(0U, RC);
9239 }
9240
9241 if (Constraint.size() > 1) {
9242 if (Constraint[1] == 'v') {
9243 RC = &AMDGPU::VGPR_32RegClass;
9244 } else if (Constraint[1] == 's') {
9245 RC = &AMDGPU::SGPR_32RegClass;
9246 }
9247
9248 if (RC) {
9249 uint32_t Idx;
9250 bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
9251 if (!Failed && Idx < RC->getNumRegs())
9252 return std::make_pair(RC->getRegister(Idx), RC);
9253 }
9254 }
9255 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
9256}
9257
9258SITargetLowering::ConstraintType
9259SITargetLowering::getConstraintType(StringRef Constraint) const {
9260 if (Constraint.size() == 1) {
9261 switch (Constraint[0]) {
9262 default: break;
9263 case 's':
9264 case 'v':
9265 return C_RegisterClass;
9266 }
9267 }
9268 return TargetLowering::getConstraintType(Constraint);
9269}
9270
9271// Figure out which registers should be reserved for stack access. Only after
9272// the function is legalized do we know all of the non-spill stack objects or if
9273// calls are present.
9274void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
9275 MachineRegisterInfo &MRI = MF.getRegInfo();
9276 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
9277 const MachineFrameInfo &MFI = MF.getFrameInfo();
9278 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
9279
9280 if (Info->isEntryFunction()) {
9281 // Callable functions have fixed registers used for stack access.
9282 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
9283 }
9284
9285 // We have to assume the SP is needed in case there are calls in the function
9286 // during lowering. Calls are only detected after the function is
9287 // lowered. We're about to reserve registers, so don't bother using it if we
9288 // aren't really going to use it.
9289 bool NeedSP = !Info->isEntryFunction() ||
9290 MFI.hasVarSizedObjects() ||
9291 MFI.hasCalls();
9292
9293 if (NeedSP) {
9294 unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF);
9295 Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg);
9296
9297 assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg());
9298 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
9299        Info->getStackPtrOffsetReg()));
9300 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
9301 }
9302
9303 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
9304 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
9305 MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
9306 Info->getScratchWaveOffsetReg());
9307
9308 Info->limitOccupancy(MF);
9309
9310 TargetLoweringBase::finalizeLowering(MF);
9311}
9312
9313void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
9314 KnownBits &Known,
9315 const APInt &DemandedElts,
9316 const SelectionDAG &DAG,
9317 unsigned Depth) const {
9318 TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
9319 DAG, Depth);
9320
9321 if (getSubtarget()->enableHugePrivateBuffer())
9322 return;
9323
9324 // Technically it may be possible to have a dispatch with a single workitem
9325 // that uses the full private memory size, but that's not really useful. We
9326 // can't use vaddr in MUBUF instructions if we don't know the address
9327 // calculation won't overflow, so assume the sign bit is never set.
9328 Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits);
9329}
9330
9331bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
9332 FunctionLoweringInfo * FLI, LegacyDivergenceAnalysis * KDA) const
9333{
9334 switch (N->getOpcode()) {
9335 case ISD::Register:
9336 case ISD::CopyFromReg:
9337 {
9338 const RegisterSDNode *R = nullptr;
9339 if (N->getOpcode() == ISD::Register) {
9340 R = dyn_cast<RegisterSDNode>(N);
9341 }
9342 else {
9343 R = dyn_cast<RegisterSDNode>(N->getOperand(1));
9344 }
9345 if (R)
9346 {
9347 const MachineFunction * MF = FLI->MF;
9348 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
9349 const MachineRegisterInfo &MRI = MF->getRegInfo();
9350 const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
9351 unsigned Reg = R->getReg();
9352 if (TRI.isPhysicalRegister(Reg))
9353 return TRI.isVGPR(MRI, Reg);
9354
9355 if (MRI.isLiveIn(Reg)) {
9356 // workitem.id.x workitem.id.y workitem.id.z
9357 // Any VGPR formal argument is also considered divergent
9358 if (TRI.isVGPR(MRI, Reg))
9359 return true;
9360 // Formal arguments of non-entry functions
9361 // are conservatively considered divergent
9362 else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
9363 return true;
9364 }
9365 return !KDA || KDA->isDivergent(FLI->getValueFromVirtualReg(Reg));
9366 }
9367 }
9368 break;
9369 case ISD::LOAD: {
9370 const LoadSDNode *L = cast<LoadSDNode>(N);
9371 unsigned AS = L->getAddressSpace();
9372 // A flat load may access private memory.
9373 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
9374 } break;
9375 case ISD::CALLSEQ_END:
9376 return true;
9377 break;
9378 case ISD::INTRINSIC_WO_CHAIN:
9379 {
9380
9381 }
9382 return AMDGPU::isIntrinsicSourceOfDivergence(
9383 cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
9384 case ISD::INTRINSIC_W_CHAIN:
9385 return AMDGPU::isIntrinsicSourceOfDivergence(
9386 cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
9387 // In some cases intrinsics that are a source of divergence have been
9388 // lowered to AMDGPUISD so we also need to check those too.
9389 case AMDGPUISD::INTERP_MOV:
9390 case AMDGPUISD::INTERP_P1:
9391 case AMDGPUISD::INTERP_P2:
9392 return true;
9393 }
9394 return false;
9395}
9396
9397bool SITargetLowering::denormalsEnabledForType(EVT VT) const {
9398 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
9399 case MVT::f32:
9400 return Subtarget->hasFP32Denormals();
9401 case MVT::f64:
9402 return Subtarget->hasFP64Denormals();
9403 case MVT::f16:
9404 return Subtarget->hasFP16Denormals();
9405 default:
9406 return false;
9407 }
9408}
9409
9410bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
9411 const SelectionDAG &DAG,
9412 bool SNaN,
9413 unsigned Depth) const {
9414 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
9415 if (Subtarget->enableDX10Clamp())
9416 return true; // Clamped to 0.
9417 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
9418 }
9419
9420 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
9421 SNaN, Depth);
9422}

/build/llvm-toolchain-snapshot-8~svn345461/include/llvm/Support/MathExtras.h

1//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file contains some functions that are useful for math stuff.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_SUPPORT_MATHEXTRAS_H
15#define LLVM_SUPPORT_MATHEXTRAS_H
16
17#include "llvm/Support/Compiler.h"
18#include "llvm/Support/SwapByteOrder.h"
19#include <algorithm>
20#include <cassert>
21#include <climits>
22#include <cstring>
23#include <limits>
24#include <type_traits>
25
26#ifdef __ANDROID_NDK__
27#include <android/api-level.h>
28#endif
29
30#ifdef _MSC_VER
31// Declare these intrinsics manually rather including intrin.h. It's very
32// expensive, and MathExtras.h is popular.
33// #include <intrin.h>
34extern "C" {
35unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
36unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
37unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
38unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
39}
40#endif
41
42namespace llvm {
43/// The behavior an operation has on an input of 0.
44enum ZeroBehavior {
45 /// The returned value is undefined.
46 ZB_Undefined,
47 /// The returned value is numeric_limits<T>::max()
48 ZB_Max,
49 /// The returned value is numeric_limits<T>::digits
50 ZB_Width
51};
52
53namespace detail {
54template <typename T, std::size_t SizeOfT> struct TrailingZerosCounter {
55 static std::size_t count(T Val, ZeroBehavior) {
56 if (!Val)
57 return std::numeric_limits<T>::digits;
58 if (Val & 0x1)
59 return 0;
60
61 // Bisection method.
62 std::size_t ZeroBits = 0;
63 T Shift = std::numeric_limits<T>::digits >> 1;
64 T Mask = std::numeric_limits<T>::max() >> Shift;
65 while (Shift) {
66 if ((Val & Mask) == 0) {
67 Val >>= Shift;
68 ZeroBits |= Shift;
69 }
70 Shift >>= 1;
71 Mask >>= Shift;
72 }
73 return ZeroBits;
74 }
75};
76
77#if __GNUC__ >= 4 || defined(_MSC_VER)
78template <typename T> struct TrailingZerosCounter<T, 4> {
79 static std::size_t count(T Val, ZeroBehavior ZB) {
80 if (ZB != ZB_Undefined && Val == 0)
17. Taking false branch
23. Assuming 'Val' is equal to 0
24. Taking true branch
81 return 32;
25. Returning the value 32
82
83#if __has_builtin(__builtin_ctz) || LLVM_GNUC_PREREQ(4, 0, 0)
84 return __builtin_ctz(Val);
85#elif defined(_MSC_VER)
86 unsigned long Index;
87 _BitScanForward(&Index, Val);
88 return Index;
89#endif
90 }
91};
92
93#if !defined(_MSC_VER) || defined(_M_X64)
94template <typename T> struct TrailingZerosCounter<T, 8> {
95 static std::size_t count(T Val, ZeroBehavior ZB) {
96 if (ZB != ZB_Undefined && Val == 0)
97 return 64;
98
99#if __has_builtin(__builtin_ctzll) || LLVM_GNUC_PREREQ(4, 0, 0)
100 return __builtin_ctzll(Val);
101#elif defined(_MSC_VER)
102 unsigned long Index;
103 _BitScanForward64(&Index, Val);
104 return Index;
105#endif
106 }
107};
108#endif
109#endif
110} // namespace detail
111
112/// Count number of 0's from the least significant bit to the most
113/// stopping at the first 1.
114///
115/// Only unsigned integral types are allowed.
116///
117/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
118/// valid arguments.
119template <typename T>
120std::size_t countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
121 static_assert(std::numeric_limits<T>::is_integer &&
122 !std::numeric_limits<T>::is_signed,
123 "Only unsigned integral types are allowed.");
124 return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
16. Calling 'TrailingZerosCounter::count'
18. Returning from 'TrailingZerosCounter::count'
22. Calling 'TrailingZerosCounter::count'
26. Returning from 'TrailingZerosCounter::count'
27. Returning the value 32
125}
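
As a quick illustration of the behavior the path notes above rely on — a small sketch compiled against this header, with the exact values following from ZB_Width being the default ZeroBehavior:

#include <cstdio>

#include "llvm/Support/MathExtras.h"

int main() {
  // Non-zero input: index of the lowest set bit.
  std::printf("%zu\n", llvm::countTrailingZeros(0x8u)); // prints 3
  // Zero input with the default ZB_Width: the full width of the type, i.e. 32
  // for a 32-bit unsigned -- the value that later feeds '1 << Comp'.
  std::printf("%zu\n", llvm::countTrailingZeros(0u));   // prints 32
  return 0;
}
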
126
127namespace detail {
128template <typename T, std::size_t SizeOfT> struct LeadingZerosCounter {
129 static std::size_t count(T Val, ZeroBehavior) {
130 if (!Val)
131 return std::numeric_limits<T>::digits;
132
133 // Bisection method.
134 std::size_t ZeroBits = 0;
135 for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
136 T Tmp = Val >> Shift;
137 if (Tmp)
138 Val = Tmp;
139 else
140 ZeroBits |= Shift;
141 }
142 return ZeroBits;
143 }
144};
145
146#if __GNUC__ >= 4 || defined(_MSC_VER)
147template <typename T> struct LeadingZerosCounter<T, 4> {
148 static std::size_t count(T Val, ZeroBehavior ZB) {
149 if (ZB != ZB_Undefined && Val == 0)
150 return 32;
151
152#if __has_builtin(__builtin_clz) || LLVM_GNUC_PREREQ(4, 0, 0)
153 return __builtin_clz(Val);
154#elif defined(_MSC_VER)
155 unsigned long Index;
156 _BitScanReverse(&Index, Val);
157 return Index ^ 31;
158#endif
159 }
160};
161
162#if !defined(_MSC_VER) || defined(_M_X64)
163template <typename T> struct LeadingZerosCounter<T, 8> {
164 static std::size_t count(T Val, ZeroBehavior ZB) {
165 if (ZB != ZB_Undefined && Val == 0)
166 return 64;
167
168#if __has_builtin(__builtin_clzll) || LLVM_GNUC_PREREQ(4, 0, 0)
169 return __builtin_clzll(Val);
170#elif defined(_MSC_VER)
171 unsigned long Index;
172 _BitScanReverse64(&Index, Val);
173 return Index ^ 63;
174#endif
175 }
176};
177#endif
178#endif
179} // namespace detail
180
181/// Count number of 0's from the most significant bit to the least
182/// stopping at the first 1.
183///
184/// Only unsigned integral types are allowed.
185///
186/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
187/// valid arguments.
188template <typename T>
189std::size_t countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
190 static_assert(std::numeric_limits<T>::is_integer &&
191 !std::numeric_limits<T>::is_signed,
192 "Only unsigned integral types are allowed.");
193 return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
194}
195
196/// Get the index of the first set bit starting from the least
197/// significant bit.
198///
199/// Only unsigned integral types are allowed.
200///
201/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
202/// valid arguments.
203template <typename T> T findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
204 if (ZB == ZB_Max && Val == 0)
205 return std::numeric_limits<T>::max();
206
207 return countTrailingZeros(Val, ZB_Undefined);
208}
209
210/// Create a bitmask with the N right-most bits set to 1, and all other
211/// bits set to 0. Only unsigned types are allowed.
212template <typename T> T maskTrailingOnes(unsigned N) {
213 static_assert(std::is_unsigned<T>::value, "Invalid type!");
214 const unsigned Bits = CHAR_BIT * sizeof(T);
215 assert(N <= Bits && "Invalid bit index");
216 return N == 0 ? 0 : (T(-1) >> (Bits - N));
217}
218
219/// Create a bitmask with the N left-most bits set to 1, and all other
220/// bits set to 0. Only unsigned types are allowed.
221template <typename T> T maskLeadingOnes(unsigned N) {
222 return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
223}
224
225/// Create a bitmask with the N right-most bits set to 0, and all other
226/// bits set to 1. Only unsigned types are allowed.
227template <typename T> T maskTrailingZeros(unsigned N) {
228 return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N);
229}
230
231/// Create a bitmask with the N left-most bits set to 0, and all other
232/// bits set to 1. Only unsigned types are allowed.
233template <typename T> T maskLeadingZeros(unsigned N) {
234 return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N);
235}
236
237/// Get the index of the last set bit starting from the least
238/// significant bit.
239///
240/// Only unsigned integral types are allowed.
241///
242/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
243/// valid arguments.
244template <typename T> T findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
245 if (ZB == ZB_Max && Val == 0)
246 return std::numeric_limits<T>::max();
247
248 // Use ^ instead of - because both gcc and llvm can remove the associated ^
249 // in the __builtin_clz intrinsic on x86.
250 return countLeadingZeros(Val, ZB_Undefined) ^
251 (std::numeric_limits<T>::digits - 1);
252}
253
254/// Macro compressed bit reversal table for 256 bits.
255///
256/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
257static const unsigned char BitReverseTable256[256] = {
258#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
259#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
260#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
261 R6(0), R6(2), R6(1), R6(3)
262#undef R2
263#undef R4
264#undef R6
265};
266
267/// Reverse the bits in \p Val.
268template <typename T>
269T reverseBits(T Val) {
270 unsigned char in[sizeof(Val)];
271 unsigned char out[sizeof(Val)];
272 std::memcpy(in, &Val, sizeof(Val));
273 for (unsigned i = 0; i < sizeof(Val); ++i)
274 out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
275 std::memcpy(&Val, out, sizeof(Val));
276 return Val;
277}
278
279// NOTE: The following support functions use the _32/_64 extensions instead of
280// type overloading so that signed and unsigned integers can be used without
281// ambiguity.
282
283/// Return the high 32 bits of a 64 bit value.
284constexpr inline uint32_t Hi_32(uint64_t Value) {
285 return static_cast<uint32_t>(Value >> 32);
286}
287
288/// Return the low 32 bits of a 64 bit value.
289constexpr inline uint32_t Lo_32(uint64_t Value) {
290 return static_cast<uint32_t>(Value);
291}
292
293/// Make a 64-bit integer from a high / low pair of 32-bit integers.
294constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) {
295 return ((uint64_t)High << 32) | (uint64_t)Low;
296}
297
298/// Checks if an integer fits into the given bit width.
299template <unsigned N> constexpr inline bool isInt(int64_t x) {
300 return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1)));
301}
302// Template specializations to get better code for common cases.
303template <> constexpr inline bool isInt<8>(int64_t x) {
304 return static_cast<int8_t>(x) == x;
305}
306template <> constexpr inline bool isInt<16>(int64_t x) {
307 return static_cast<int16_t>(x) == x;
308}
309template <> constexpr inline bool isInt<32>(int64_t x) {
310 return static_cast<int32_t>(x) == x;
311}
312
313/// Checks if a signed integer is an N bit number shifted left by S.
314template <unsigned N, unsigned S>
315constexpr inline bool isShiftedInt(int64_t x) {
316 static_assert(
317 N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number.");
318 static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide.");
319 return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
320}
321
322/// Checks if an unsigned integer fits into the given bit width.
323///
324/// This is written as two functions rather than as simply
325///
326/// return N >= 64 || X < (UINT64_C(1) << N);
327///
328/// to keep MSVC from (incorrectly) warning on isUInt<64> that we're shifting
329/// left too many places.
330template <unsigned N>
331constexpr inline typename std::enable_if<(N < 64), bool>::type
332isUInt(uint64_t X) {
333 static_assert(N > 0, "isUInt<0> doesn't make sense");
334 return X < (UINT64_C(1) << (N));
335}
336template <unsigned N>
337constexpr inline typename std::enable_if<N >= 64, bool>::type
338isUInt(uint64_t X) {
339 return true;
340}
341
342// Template specializations to get better code for common cases.
343template <> constexpr inline bool isUInt<8>(uint64_t x) {
344 return static_cast<uint8_t>(x) == x;
345}
346template <> constexpr inline bool isUInt<16>(uint64_t x) {
347 return static_cast<uint16_t>(x) == x;
348}
349template <> constexpr inline bool isUInt<32>(uint64_t x) {
350 return static_cast<uint32_t>(x) == x;
351}
352
353/// Checks if a unsigned integer is an N bit number shifted left by S.
354template <unsigned N, unsigned S>
355constexpr inline bool isShiftedUInt(uint64_t x) {
356 static_assert(
357 N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)");
358 static_assert(N + S <= 64,
359 "isShiftedUInt<N, S> with N + S > 64 is too wide.");
360 // Per the two static_asserts above, S must be strictly less than 64. So
361 // 1 << S is not undefined behavior.
362 return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0);
363}
364
365/// Gets the maximum value for a N-bit unsigned integer.
366inline uint64_t maxUIntN(uint64_t N) {
367 assert(N > 0 && N <= 64 && "integer width out of range");
368
369 // uint64_t(1) << 64 is undefined behavior, so we can't do
370 // (uint64_t(1) << N) - 1
371 // without checking first that N != 64. But this works and doesn't have a
372 // branch.
373 return UINT64_MAX >> (64 - N);
374}
375
376/// Gets the minimum value for a N-bit signed integer.
377inline int64_t minIntN(int64_t N) {
378 assert(N > 0 && N <= 64 && "integer width out of range");
379
380 return -(UINT64_C(1)<<(N-1));
381}
382
383/// Gets the maximum value for a N-bit signed integer.
384inline int64_t maxIntN(int64_t N) {
385 assert(N > 0 && N <= 64 && "integer width out of range");
386
387 // This relies on two's complement wraparound when N == 64, so we convert to
388 // int64_t only at the very end to avoid UB.
389 return (UINT64_C(1) << (N - 1)) - 1;
390}
391
392/// Checks if an unsigned integer fits into the given (dynamic) bit width.
393inline bool isUIntN(unsigned N, uint64_t x) {
394 return N >= 64 || x <= maxUIntN(N);
395}
396
397/// Checks if an signed integer fits into the given (dynamic) bit width.
398inline bool isIntN(unsigned N, int64_t x) {
399 return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N));
400}
401
402/// Return true if the argument is a non-empty sequence of ones starting at the
403/// least significant bit with the remainder zero (32 bit version).
404/// Ex. isMask_32(0x0000FFFFU) == true.
405constexpr inline bool isMask_32(uint32_t Value) {
406 return Value && ((Value + 1) & Value) == 0;
407}
408
409/// Return true if the argument is a non-empty sequence of ones starting at the
410/// least significant bit with the remainder zero (64 bit version).
411constexpr inline bool isMask_64(uint64_t Value) {
412 return Value && ((Value + 1) & Value) == 0;
413}
414
415/// Return true if the argument contains a non-empty sequence of ones with the
416/// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true.
417constexpr inline bool isShiftedMask_32(uint32_t Value) {
418 return Value && isMask_32((Value - 1) | Value);
419}
420
421/// Return true if the argument contains a non-empty sequence of ones with the
422/// remainder zero (64 bit version.)
423constexpr inline bool isShiftedMask_64(uint64_t Value) {
424 return Value && isMask_64((Value - 1) | Value);
425}
426
427/// Return true if the argument is a power of two > 0.
428/// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
429constexpr inline bool isPowerOf2_32(uint32_t Value) {
430 return Value && !(Value & (Value - 1));
431}
432
433/// Return true if the argument is a power of two > 0 (64 bit edition.)
434constexpr inline bool isPowerOf2_64(uint64_t Value) {
435 return Value && !(Value & (Value - 1));
436}
437
438/// Return a byte-swapped representation of the 16-bit argument.
439inline uint16_t ByteSwap_16(uint16_t Value) {
440 return sys::SwapByteOrder_16(Value);
441}
442
443/// Return a byte-swapped representation of the 32-bit argument.
444inline uint32_t ByteSwap_32(uint32_t Value) {
445 return sys::SwapByteOrder_32(Value);
446}
447
448/// Return a byte-swapped representation of the 64-bit argument.
449inline uint64_t ByteSwap_64(uint64_t Value) {
450 return sys::SwapByteOrder_64(Value);
451}
452
453/// Count the number of ones from the most significant bit to the first
454/// zero bit.
455///
456/// Ex. countLeadingOnes(0xFF0FFF00) == 8.
457/// Only unsigned integral types are allowed.
458///
459/// \param ZB the behavior on an input of all ones. Only ZB_Width and
460/// ZB_Undefined are valid arguments.
461template <typename T>
462std::size_t countLeadingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
463 static_assert(std::numeric_limits<T>::is_integer &&
464 !std::numeric_limits<T>::is_signed,
465 "Only unsigned integral types are allowed.");
466 return countLeadingZeros<T>(~Value, ZB);
467}
468
469/// Count the number of ones from the least significant bit to the first
470/// zero bit.
471///
472/// Ex. countTrailingOnes(0x00FF00FF) == 8.
473/// Only unsigned integral types are allowed.
474///
475/// \param ZB the behavior on an input of all ones. Only ZB_Width and
476/// ZB_Undefined are valid arguments.
477template <typename T>
478std::size_t countTrailingOnes(T Value, ZeroBehavior ZB = ZB_Width) {
479 static_assert(std::numeric_limits<T>::is_integer &&
480 !std::numeric_limits<T>::is_signed,
481 "Only unsigned integral types are allowed.");
482 return countTrailingZeros<T>(~Value, ZB);
483}
484
485namespace detail {
486template <typename T, std::size_t SizeOfT> struct PopulationCounter {
487 static unsigned count(T Value) {
488 // Generic version, forward to 32 bits.
489 static_assert(SizeOfT <= 4, "Not implemented!");
490#if __GNUC__ >= 4
491 return __builtin_popcount(Value);
492#else
493 uint32_t v = Value;
494 v = v - ((v >> 1) & 0x55555555);
495 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
496 return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
497#endif
498 }
499};
500
501template <typename T> struct PopulationCounter<T, 8> {
502 static unsigned count(T Value) {
503#if __GNUC__ >= 4
504 return __builtin_popcountll(Value);
505#else
506 uint64_t v = Value;
507 v = v - ((v >> 1) & 0x5555555555555555ULL);
508 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
509 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
510 return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56);
511#endif
512 }
513};
514} // namespace detail
515
516/// Count the number of set bits in a value.
517/// Ex. countPopulation(0xF000F000) = 8
518/// Returns 0 if the word is zero.
519template <typename T>
520inline unsigned countPopulation(T Value) {
521 static_assert(std::numeric_limits<T>::is_integer &&
522 !std::numeric_limits<T>::is_signed,
523 "Only unsigned integral types are allowed.");
524 return detail::PopulationCounter<T, sizeof(T)>::count(Value);
525}
526
527/// Return the log base 2 of the specified value.
528inline double Log2(double Value) {
529#if defined(__ANDROID_API__) && __ANDROID_API__ < 18
530 return __builtin_log(Value) / __builtin_log(2.0);
531#else
532 return log2(Value);
533#endif
534}
535
536/// Return the floor log base 2 of the specified value, -1 if the value is zero.
537/// (32 bit edition.)
538/// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
539inline unsigned Log2_32(uint32_t Value) {
540 return 31 - countLeadingZeros(Value);
541}
542
543/// Return the floor log base 2 of the specified value, -1 if the value is zero.
544/// (64 bit edition.)
545inline unsigned Log2_64(uint64_t Value) {
546 return 63 - countLeadingZeros(Value);
547}
548
549/// Return the ceil log base 2 of the specified value, 32 if the value is zero.
550/// (32 bit edition).
551/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3
552inline unsigned Log2_32_Ceil(uint32_t Value) {
553 return 32 - countLeadingZeros(Value - 1);
554}
555
556/// Return the ceil log base 2 of the specified value, 64 if the value is zero.
557/// (64 bit edition.)
558inline unsigned Log2_64_Ceil(uint64_t Value) {
559 return 64 - countLeadingZeros(Value - 1);
560}
561
562/// Return the greatest common divisor of the values using Euclid's algorithm.
563inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) {
564 while (B) {
565 uint64_t T = B;
566 B = A % B;
567 A = T;
568 }
569 return A;
570}
571
572/// This function takes a 64-bit integer and returns the bit equivalent double.
573inline double BitsToDouble(uint64_t Bits) {
574 double D;
575 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
576 memcpy(&D, &Bits, sizeof(Bits));
577 return D;
578}
579
580/// This function takes a 32-bit integer and returns the bit equivalent float.
581inline float BitsToFloat(uint32_t Bits) {
582 float F;
583 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
584 memcpy(&F, &Bits, sizeof(Bits));
585 return F;
586}
587
588/// This function takes a double and returns the bit equivalent 64-bit integer.
589/// Note that copying doubles around changes the bits of NaNs on some hosts,
590/// notably x86, so this routine cannot be used if these bits are needed.
591inline uint64_t DoubleToBits(double Double) {
592 uint64_t Bits;
593 static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes");
594 memcpy(&Bits, &Double, sizeof(Double));
595 return Bits;
596}
597
598/// This function takes a float and returns the bit equivalent 32-bit integer.
599/// Note that copying floats around changes the bits of NaNs on some hosts,
600/// notably x86, so this routine cannot be used if these bits are needed.
601inline uint32_t FloatToBits(float Float) {
602 uint32_t Bits;
603 static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes");
604 memcpy(&Bits, &Float, sizeof(Float));
605 return Bits;
606}
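
Round-trip sketch for the four bit-cast helpers above (editor's addition); 0x3F800000 and 0x4000000000000000 are the IEEE-754 encodings of 1.0f and 2.0 respectively.

#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  assert(llvm::FloatToBits(1.0f) == 0x3F800000u);
  assert(llvm::BitsToFloat(0x3F800000u) == 1.0f);
  assert(llvm::DoubleToBits(llvm::BitsToDouble(0x4000000000000000ULL))
         == 0x4000000000000000ULL);   // 2.0 round-trips exactly
}
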
607
608/// A and B are either alignments or offsets. Return the minimum alignment that
609/// may be assumed after adding the two together.
610constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) {
611 // The largest power of 2 that divides both A and B.
612 //
613 // Replace "-Value" by "1+~Value" in the following commented code to avoid
614 // MSVC warning C4146
615 // return (A | B) & -(A | B);
616 return (A | B) & (1 + ~(A | B));
617}
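
Editor's sketch: MinAlign yields the largest power of two dividing both arguments, and since it is constexpr the result can also be checked at compile time.

#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  assert(llvm::MinAlign(16, 24) == 8);   // 16 = 2^4, 24 = 2^3 * 3
  assert(llvm::MinAlign(8, 12) == 4);
  static_assert(llvm::MinAlign(64, 64) == 64, "constexpr-evaluable");
}
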
618
619/// Aligns \c Addr to \c Alignment bytes, rounding up.
620///
621/// Alignment should be a power of two. This method rounds up, so
622/// alignAddr(7, 4) == 8 and alignAddr(8, 4) == 8.
623inline uintptr_t alignAddr(const void *Addr, size_t Alignment) {
624 assert(Alignment && isPowerOf2_64((uint64_t)Alignment) &&
625        "Alignment is not a power of two!");
626
627 assert((uintptr_t)Addr + Alignment - 1 >= (uintptr_t)Addr);
628
629 return (((uintptr_t)Addr + Alignment - 1) & ~(uintptr_t)(Alignment - 1));
630}
631
632/// Returns the necessary adjustment for aligning \c Ptr to \c Alignment
633/// bytes, rounding up.
634inline size_t alignmentAdjustment(const void *Ptr, size_t Alignment) {
635 return alignAddr(Ptr, Alignment) - (uintptr_t)Ptr;
636}
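
An editor's sketch of alignAddr and alignmentAdjustment on an arbitrary buffer address (not part of the analyzed sources; same include assumptions as above).

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

int main() {
  char Buf[32];
  uintptr_t P = (uintptr_t)Buf;
  uintptr_t Aligned = llvm::alignAddr(Buf, 16);
  assert(Aligned >= P && Aligned % 16 == 0);
  assert(Aligned - P == llvm::alignmentAdjustment(Buf, 16));
}
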
637
638/// Returns the next power of two (in 64-bits) that is strictly greater than A.
639/// Returns zero on overflow.
640inline uint64_t NextPowerOf2(uint64_t A) {
641 A |= (A >> 1);
642 A |= (A >> 2);
643 A |= (A >> 4);
644 A |= (A >> 8);
645 A |= (A >> 16);
646 A |= (A >> 32);
647 return A + 1;
648}
649
650/// Returns the power of two which is less than or equal to the given value.
651/// Essentially, it is a floor operation across the domain of powers of two.
652inline uint64_t PowerOf2Floor(uint64_t A) {
653 if (!A) return 0;
654 return 1ull << (63 - countLeadingZeros(A, ZB_Undefined));
655}
656
657/// Returns the power of two which is greater than or equal to the given value.
658/// Essentially, it is a ceil operation across the domain of powers of two.
659inline uint64_t PowerOf2Ceil(uint64_t A) {
660 if (!A)
661 return 0;
662 return NextPowerOf2(A - 1);
663}
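
A short check of the power-of-two helpers (editor's addition): NextPowerOf2 is strictly greater than its input, while PowerOf2Floor/PowerOf2Ceil are the floor and ceiling over powers of two.

#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  assert(llvm::NextPowerOf2(5) == 8);
  assert(llvm::NextPowerOf2(8) == 16);   // strictly greater than the input
  assert(llvm::PowerOf2Floor(6) == 4);
  assert(llvm::PowerOf2Ceil(6) == 8 && llvm::PowerOf2Ceil(8) == 8);
}
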
664
665/// Returns the next integer (mod 2**64) that is greater than or equal to
666/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
667///
668/// If non-zero \p Skew is specified, the return value will be a minimal
669/// integer that is greater than or equal to \p Value and equal to
670/// \p Align * N + \p Skew for some integer N. If \p Skew is larger than
671/// \p Align, its value is adjusted to '\p Skew mod \p Align'.
672///
673/// Examples:
674/// \code
675/// alignTo(5, 8) = 8
676/// alignTo(17, 8) = 24
677/// alignTo(~0LL, 8) = 0
678/// alignTo(321, 255) = 510
679///
680/// alignTo(5, 8, 7) = 7
681/// alignTo(17, 8, 1) = 17
682/// alignTo(~0LL, 8, 3) = 3
683/// alignTo(321, 255, 42) = 552
684/// \endcode
685inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
686 assert(Align != 0u && "Align can't be 0.");
687 Skew %= Align;
688 return (Value + Align - 1 - Skew) / Align * Align + Skew;
689}
690
691/// Returns the next integer (mod 2**64) that is greater than or equal to
692/// \p Value and is a multiple of \c Align. \c Align must be non-zero.
693template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) {
694 static_assert(Align != 0u, "Align must be non-zero");
695 return (Value + Align - 1) / Align * Align;
696}
697
698/// Returns the integer ceil(Numerator / Denominator).
699inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) {
700 return alignTo(Numerator, Denominator) / Denominator;
701}
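
An editor's sketch exercising alignTo (runtime and template forms) and divideCeil, reusing the values from the doc-comment examples above.

#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  assert(llvm::alignTo(17, 8) == 24);
  assert(llvm::alignTo(17, 8, 1) == 17);   // skewed: 8*2 + 1
  assert(llvm::alignTo<8>(17) == 24);      // compile-time Align
  assert(llvm::divideCeil(10, 3) == 4);
}
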
702
703/// \c alignTo for contexts where a constant expression is required.
704/// \sa alignTo
705///
706/// \todo FIXME: remove when \c constexpr becomes really \c constexpr
707template <uint64_t Align>
708struct AlignTo {
709 static_assert(Align != 0u, "Align must be non-zero");
710 template <uint64_t Value>
711 struct from_value {
712 static const uint64_t value = (Value + Align - 1) / Align * Align;
713 };
714};
715
716/// Returns the largest uint64_t that is less than or equal to \p Value and is
717/// \p Skew mod \p Align. \p Align must be non-zero.
718inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
719 assert(Align != 0u && "Align can't be 0.");
720 Skew %= Align;
721 return (Value - Skew) / Align * Align + Skew;
722}
723
724/// Returns the offset to the next integer (mod 2**64) that is greater than
725/// or equal to \p Value and is a multiple of \p Align. \p Align must be
726/// non-zero.
727inline uint64_t OffsetToAlignment(uint64_t Value, uint64_t Align) {
728 return alignTo(Value, Align) - Value;
729}
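
Companion sketch (editor's addition) for alignDown and OffsetToAlignment, the downward and offset counterparts of alignTo above.

#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  assert(llvm::alignDown(17, 8) == 16);
  assert(llvm::alignDown(17, 8, 1) == 17);       // already 8*2 + 1
  assert(llvm::OffsetToAlignment(5, 8) == 3);    // 5 + 3 == 8
  assert(llvm::OffsetToAlignment(16, 8) == 0);
}
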
730
731/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
732/// Requires 0 < B <= 32.
733template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) {
734 static_assert(B > 0, "Bit width can't be 0.");
735 static_assert(B <= 32, "Bit width out of range.");
736 return int32_t(X << (32 - B)) >> (32 - B);
737}
738
739/// Sign-extend the number in the bottom B bits of X to a 32-bit integer.
740/// Requires 0 < B <= 32.
741inline int32_t SignExtend32(uint32_t X, unsigned B) {
742 assert(B > 0 && "Bit width can't be 0.");
743 assert(B <= 32 && "Bit width out of range.");
744 return int32_t(X << (32 - B)) >> (32 - B);
745}
746
747/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
748/// Requires 0 < B <= 64.
749template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) {
750 static_assert(B > 0, "Bit width can't be 0.");
751 static_assert(B <= 64, "Bit width out of range.");
752 return int64_t(x << (64 - B)) >> (64 - B);
753}
754
755/// Sign-extend the number in the bottom B bits of X to a 64-bit integer.
756/// Requires 0 < B <= 64.
757inline int64_t SignExtend64(uint64_t X, unsigned B) {
758 assert(B > 0 && "Bit width can't be 0.");
759 assert(B <= 64 && "Bit width out of range.");
760 return int64_t(X << (64 - B)) >> (64 - B);
761}
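
An editor's sketch of the sign-extension helpers. Note the asserted precondition B > 0: B == 0 would shift a 32-bit (or 64-bit) value by its full width, which is undefined behaviour in C++.

#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  assert(llvm::SignExtend32<8>(0xFFu) == -1);
  assert(llvm::SignExtend32(0x80u, 8) == -128);
  assert(llvm::SignExtend64<16>(0x8000ull) == -32768);
  assert(llvm::SignExtend64(0x7FFFull, 16) == 32767);   // sign bit clear
}
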
762
763/// Subtract two unsigned integers, X and Y, of type T and return the absolute
764/// value of the result.
765template <typename T>
766typename std::enable_if<std::is_unsigned<T>::value, T>::type
767AbsoluteDifference(T X, T Y) {
768 return std::max(X, Y) - std::min(X, Y);
769}
770
771/// Add two unsigned integers, X and Y, of type T. Clamp the result to the
772/// maximum representable value of T on overflow. ResultOverflowed indicates if
773/// the result is larger than the maximum representable value of type T.
774template <typename T>
775typename std::enable_if<std::is_unsigned<T>::value, T>::type
776SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) {
777 bool Dummy;
778 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
779 // Hacker's Delight, p. 29
780 T Z = X + Y;
781 Overflowed = (Z < X || Z < Y);
782 if (Overflowed)
783 return std::numeric_limits<T>::max();
784 else
785 return Z;
786}
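
A quick check of SaturatingAdd's clamping behaviour (editor's sketch, not analyzed source): on wraparound it pins the result to the type's maximum and reports the overflow through the optional flag.

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
#include <limits>

int main() {
  bool Overflowed = false;
  assert(llvm::SaturatingAdd<uint32_t>(1, 2, &Overflowed) == 3 && !Overflowed);
  assert(llvm::SaturatingAdd<uint32_t>(4000000000u, 1000000000u, &Overflowed)
         == std::numeric_limits<uint32_t>::max() && Overflowed);
}
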
787
788/// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the
789/// maximum representable value of T on overflow. ResultOverflowed indicates if
790/// the result is larger than the maximum representable value of type T.
791template <typename T>
792typename std::enable_if<std::is_unsigned<T>::value, T>::type
793SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) {
794 bool Dummy;
795 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
796
797 // Hacker's Delight, p. 30 has a different algorithm, but we don't use that
798 // because it fails for uint16_t (where multiplication can have undefined
799 // behavior due to promotion to int), and requires a division in addition
800 // to the multiplication.
801
802 Overflowed = false;
803
804 // Log2(Z) would be either Log2Z or Log2Z + 1.
805 // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z
806 // will necessarily be less than Log2Max as desired.
807 int Log2Z = Log2_64(X) + Log2_64(Y);
808 const T Max = std::numeric_limits<T>::max();
809 int Log2Max = Log2_64(Max);
810 if (Log2Z < Log2Max) {
811 return X * Y;
812 }
813 if (Log2Z > Log2Max) {
814 Overflowed = true;
815 return Max;
816 }
817
818 // We're going to use the top bit, and maybe overflow one
819 // bit past it. Multiply all but the bottom bit then add
820 // that on at the end.
821 T Z = (X >> 1) * Y;
822 if (Z & ~(Max >> 1)) {
823 Overflowed = true;
824 return Max;
825 }
826 Z <<= 1;
827 if (X & 1)
828 return SaturatingAdd(Z, Y, ResultOverflowed);
829
830 return Z;
831}
832
833/// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to
834/// the product. Clamp the result to the maximum representable value of T on
835/// overflow. ResultOverflowed indicates if the result is larger than the
836/// maximum representable value of type T.
837template <typename T>
838typename std::enable_if<std::is_unsigned<T>::value, T>::type
839SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) {
840 bool Dummy;
841 bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy;
842
843 T Product = SaturatingMultiply(X, Y, &Overflowed);
844 if (Overflowed)
845 return Product;
846
847 return SaturatingAdd(A, Product, &Overflowed);
848}
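
Finally, an editor's sketch of SaturatingMultiply and SaturatingMultiplyAdd: a product that fits is returned exactly, while one whose log2 estimate exceeds the type's width saturates to the maximum.

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
#include <limits>

int main() {
  bool Overflowed = false;
  assert(llvm::SaturatingMultiply<uint32_t>(65536u, 65535u, &Overflowed)
         == 4294901760u && !Overflowed);
  assert(llvm::SaturatingMultiply<uint32_t>(100000u, 100000u, &Overflowed)
         == std::numeric_limits<uint32_t>::max() && Overflowed);
  assert(llvm::SaturatingMultiplyAdd<uint32_t>(0u, 7u, 42u) == 42u);  // 0*7 + 42
}
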
849
850/// Use this rather than HUGE_VALF; the latter causes warnings on MSVC.
851extern const float huge_valf;
852} // End llvm namespace
853
854#endif