1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
30
31using namespace llvm;
32
33#include "AMDGPUGenCallingConv.inc"
34
36 "amdgpu-bypass-slow-div",
37 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
38 cl::init(true));
39
40// Find a larger type to do a load / store of a vector with.
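// For illustration (derived from the rules below, not part of the original
// source): a 32-bit vector such as v2i16 maps to i32, a 64-bit vector such as
// v4i16 maps to v2i32, and an oddly sized vector such as v3i16 (48 bits) is
// left unchanged.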
41EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
42 unsigned StoreSize = VT.getStoreSizeInBits();
43 if (StoreSize <= 32)
44 return EVT::getIntegerVT(Ctx, StoreSize);
45
46 if (StoreSize % 32 == 0)
47 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
48
49 return VT;
50}
51
56unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
57 // In order for this to be a signed 24-bit value, bit 23 must
58 // be a sign bit.
59 return DAG.ComputeMaxSignificantBits(Op);
60}
61
62AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
63 const TargetSubtargetInfo &STI,
64 const AMDGPUSubtarget &AMDGPUSTI)
65 : TargetLowering(TM, STI), Subtarget(&AMDGPUSTI) {
66 // Always lower memset, memcpy, and memmove intrinsics to load/store
67 // instructions, rather than generating calls to memset, memcpy, or memmove.
71
72 // Enable ganging up loads and stores in the memcpy DAG lowering.
73 MaxGluedStoresPerMemcpy = 16;
74
75 // Lower floating point store/load to integer store/load to reduce the number
76 // of patterns in tablegen.
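 // For example, a load of f32 is handled through the i32 load patterns; the
 // Promote action bitcasts the loaded integer back to f32, so no separate f32
 // load patterns are needed.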
77 setOperationAction(ISD::LOAD, MVT::f32, Promote);
78 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
79
80 setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
81 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
82
83 setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
84 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
85
86 setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
87 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
88
89 setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
90 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
91
92 setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
93 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
94
95 setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
96 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
97
98 setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
99 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
100
101 setOperationAction(ISD::LOAD, MVT::v9f32, Promote);
102 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
103
104 setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
105 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
106
107 setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
108 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
109
110 setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
111 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
112
113 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
114 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
115
116 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
117 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
118
119 setOperationAction(ISD::LOAD, MVT::i64, Promote);
120 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
121
122 setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
123 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
124
125 setOperationAction(ISD::LOAD, MVT::f64, Promote);
126 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
127
128 setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
129 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
130
131 setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
132 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
133
134 setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
135 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
136
137 setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
138 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
139
140 setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
141 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
142
143 setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
144 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
145
146 setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
147 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
148
149 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
150 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
151
152 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
153 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
154
155 setOperationAction(ISD::LOAD, MVT::i128, Promote);
156 AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
157
158 // TODO: Would be better to consume as directly legal
159 setOperationAction(ISD::ATOMIC_LOAD, MVT::f32, Promote);
160 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
161
162 setOperationAction(ISD::ATOMIC_LOAD, MVT::f64, Promote);
163 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
164
165 setOperationAction(ISD::ATOMIC_LOAD, MVT::f16, Promote);
166 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
167
168 setOperationAction(ISD::ATOMIC_LOAD, MVT::bf16, Promote);
169 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);
170
171 setOperationAction(ISD::ATOMIC_STORE, MVT::f32, Promote);
172 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);
173
174 setOperationAction(ISD::ATOMIC_STORE, MVT::f64, Promote);
175 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);
176
177 setOperationAction(ISD::ATOMIC_STORE, MVT::f16, Promote);
178 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);
179
180 setOperationAction(ISD::ATOMIC_STORE, MVT::bf16, Promote);
181 AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);
182
183 // There are no 64-bit extloads. These should be done as a 32-bit extload and
184 // an extension to 64-bit.
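 // For example, a sextload of i8 into i64 becomes a 32-bit sextload from i8
 // followed by a sign extension from i32 to i64.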
185 for (MVT VT : MVT::integer_valuetypes())
186 setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, MVT::i64, VT,
187 Expand);
188
189 for (MVT VT : MVT::integer_valuetypes()) {
190 if (VT == MVT::i64)
191 continue;
192
193 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
194 setLoadExtAction(Op, VT, MVT::i1, Promote);
195 setLoadExtAction(Op, VT, MVT::i8, Legal);
196 setLoadExtAction(Op, VT, MVT::i16, Legal);
197 setLoadExtAction(Op, VT, MVT::i32, Expand);
198 }
199 }
200
202 for (auto MemVT :
203 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
205 Expand);
206
207 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
208 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
209 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
210 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
211 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
212 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
213 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
214 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
215 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
216 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
217 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
218 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
219 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
220 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);
221
222 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
223 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
224 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
225 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
226 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
227 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
228
229 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
230 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
231 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
232 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
233 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
234 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
235 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
236 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
237 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
238 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
239 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
240 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);
241
242 setOperationAction(ISD::STORE, MVT::f32, Promote);
243 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
244
245 setOperationAction(ISD::STORE, MVT::v2f32, Promote);
246 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
247
248 setOperationAction(ISD::STORE, MVT::v3f32, Promote);
249 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
250
251 setOperationAction(ISD::STORE, MVT::v4f32, Promote);
252 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
253
254 setOperationAction(ISD::STORE, MVT::v5f32, Promote);
255 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
256
257 setOperationAction(ISD::STORE, MVT::v6f32, Promote);
258 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
259
260 setOperationAction(ISD::STORE, MVT::v7f32, Promote);
261 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
262
263 setOperationAction(ISD::STORE, MVT::v8f32, Promote);
264 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
265
266 setOperationAction(ISD::STORE, MVT::v9f32, Promote);
267 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
268
269 setOperationAction(ISD::STORE, MVT::v10f32, Promote);
270 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
271
272 setOperationAction(ISD::STORE, MVT::v11f32, Promote);
273 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
274
275 setOperationAction(ISD::STORE, MVT::v12f32, Promote);
276 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
277
278 setOperationAction(ISD::STORE, MVT::v16f32, Promote);
279 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
280
281 setOperationAction(ISD::STORE, MVT::v32f32, Promote);
282 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
283
284 setOperationAction(ISD::STORE, MVT::i64, Promote);
285 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
286
287 setOperationAction(ISD::STORE, MVT::v2i64, Promote);
288 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
289
290 setOperationAction(ISD::STORE, MVT::f64, Promote);
291 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
292
293 setOperationAction(ISD::STORE, MVT::v2f64, Promote);
294 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
295
296 setOperationAction(ISD::STORE, MVT::v3i64, Promote);
297 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
298
299 setOperationAction(ISD::STORE, MVT::v3f64, Promote);
300 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
301
302 setOperationAction(ISD::STORE, MVT::v4i64, Promote);
303 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
304
305 setOperationAction(ISD::STORE, MVT::v4f64, Promote);
306 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
307
308 setOperationAction(ISD::STORE, MVT::v8i64, Promote);
309 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
310
311 setOperationAction(ISD::STORE, MVT::v8f64, Promote);
312 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
313
314 setOperationAction(ISD::STORE, MVT::v16i64, Promote);
315 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
316
317 setOperationAction(ISD::STORE, MVT::v16f64, Promote);
318 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
319
320 setOperationAction(ISD::STORE, MVT::i128, Promote);
321 AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);
322
323 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
324 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
325 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
326 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
327
328 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
329 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
330 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
331 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
332
333 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
334 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
335 setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
336 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
337 setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
338 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
339 setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
340 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
341 setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
342 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
343 setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
344 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
345 setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
346 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
347
348 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
349 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
350 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
351
352 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
353 setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
354 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
355
356 setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);
357
358 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
359 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
360 setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
361 setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
362 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
363 setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
364 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
365
366 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
367 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
368 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
369 setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
370 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
371
372 setTruncStoreAction(MVT::v5i32, MVT::v5i1, Expand);
373 setTruncStoreAction(MVT::v5i32, MVT::v5i8, Expand);
374 setTruncStoreAction(MVT::v5i32, MVT::v5i16, Expand);
375
376 setTruncStoreAction(MVT::v6i32, MVT::v6i1, Expand);
377 setTruncStoreAction(MVT::v6i32, MVT::v6i8, Expand);
378 setTruncStoreAction(MVT::v6i32, MVT::v6i16, Expand);
379
380 setTruncStoreAction(MVT::v7i32, MVT::v7i1, Expand);
381 setTruncStoreAction(MVT::v7i32, MVT::v7i8, Expand);
382 setTruncStoreAction(MVT::v7i32, MVT::v7i16, Expand);
383
384 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
385 setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
386 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
387
388 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
389 setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
390 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
391 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
392 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
393 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
394 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
395
396 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
397 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
398
399 setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand);
400
401 // For R600, this is totally unsupported, just custom lower to produce an
402 // error.
403 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
404
405 // Library functions. These default to Expand, but we have instructions
406 // for them.
407 setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
408 ISD::FROUNDEVEN, ISD::FTRUNC},
409 {MVT::f16, MVT::f32}, Legal);
410 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, MVT::f32, Legal);
411
412 setOperationAction(ISD::FLOG2, MVT::f32, Custom);
413 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
414 setOperationAction({ISD::LROUND, ISD::LLROUND},
415 {MVT::f16, MVT::f32, MVT::f64}, Expand);
416
417 setOperationAction(
418 {ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, MVT::f32,
419 Custom);
420
421 setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
422
423 setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
424
425 setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
426 Expand);
427
428 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Expand);
429
430 if (Subtarget->has16BitInsts()) {
431 setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
432 setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Legal);
433 } else {
434 setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
435 setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);
436 }
437
438 setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, MVT::f16,
439 Custom);
440
441 setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
442 if (Subtarget->has16BitInsts()) {
443 setOperationAction(ISD::FCANONICALIZE, MVT::f16, Legal);
444 }
445
446 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
447 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
448 // default unless marked custom/legal.
449 setOperationAction(ISD::IS_FPCLASS,
450 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
451 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
452 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
453 MVT::v16f64},
454 Custom);
455
456 if (isTypeLegal(MVT::f16))
457 setOperationAction(ISD::IS_FPCLASS,
458 {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16},
459 Custom);
460
461 // Expand to fneg + fadd.
462 setOperationAction(ISD::FSUB, MVT::f64, Expand);
463
464 setOperationAction(ISD::CONCAT_VECTORS,
465 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
466 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
467 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
468 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
469 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
470 Custom);
471
472 setOperationAction(
473 ISD::EXTRACT_SUBVECTOR,
474 {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
475 MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
476 MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
477 MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
478 MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
479 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
480 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
481 Custom);
482
483 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
484 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
485
486 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
487 for (MVT VT : ScalarIntVTs) {
488 // These should use [SU]DIVREM, so set them to expand
489 setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
490 Expand);
491
492 // GPU does not have divrem function for signed or unsigned.
493 setOperationAction({ISD::SDIVREM, ISD::UDIVREM}, VT, Custom);
494
495 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
496 setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Expand);
497
499
500 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
501 setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal);
502 }
503
504 // The hardware supports 32-bit FSHR, but not FSHL.
505 setOperationAction(ISD::FSHR, MVT::i32, Legal);
506
507 // The hardware supports 32-bit ROTR, but not ROTL.
508 setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
510
512
516 MVT::i64, Custom);
518
520 Legal);
521
524 MVT::i64, Custom);
525
526 for (auto VT : {MVT::i8, MVT::i16})
528
529 static const MVT::SimpleValueType VectorIntTypes[] = {
530 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
531 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
532
533 for (MVT VT : VectorIntTypes) {
534 // Expand the following operations for the current type by default.
546 ISD::SETCC, ISD::ADDRSPACECAST},
547 VT, Expand);
548 }
549
550 static const MVT::SimpleValueType FloatVectorTypes[] = {
551 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
552 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
553
554 for (MVT VT : FloatVectorTypes) {
555 setOperationAction(
556 {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM,
557 ISD::FADD, ISD::FCEIL, ISD::FCOS,
558 ISD::FDIV, ISD::FEXP2, ISD::FEXP,
559 ISD::FEXP10, ISD::FLOG2, ISD::FREM,
560 ISD::FLOG, ISD::FLOG10, ISD::FPOW,
561 ISD::FFLOOR, ISD::FTRUNC, ISD::FMUL,
562 ISD::FMA, ISD::FRINT, ISD::FNEARBYINT,
563 ISD::FSQRT, ISD::FSIN, ISD::FSUB,
564 ISD::FNEG, ISD::VSELECT, ISD::SELECT_CC,
566 ISD::FCANONICALIZE, ISD::FROUNDEVEN},
567 VT, Expand);
568 }
569
570 // This causes an unrolled select operation to be used rather than expansion
571 // with bit operations. This is in general better, but the alternative using
572 // BFI instructions may be better if the select sources are SGPRs.
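 // For example, a select of v2f32 is promoted to v2i32 and unrolled into two
 // 32-bit selects (v_cndmask_b32) instead of being expanded into and/or/xor
 // bit operations.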
573 setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
574 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
575
576 setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
577 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
578
579 setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
580 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
581
582 setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
583 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
584
585 setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
586 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
587
588 setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
589 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
590
591 setOperationAction(ISD::SELECT, MVT::v9f32, Promote);
592 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
593
594 setOperationAction(ISD::SELECT, MVT::v10f32, Promote);
595 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
596
597 setOperationAction(ISD::SELECT, MVT::v11f32, Promote);
598 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
599
600 setOperationAction(ISD::SELECT, MVT::v12f32, Promote);
601 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
602
604 setJumpIsExpensive(true);
605
608
610
611 // We want to find all load dependencies for long chains of stores to enable
612 // merging into very wide vectors. The problem is with vectors with > 4
613 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
614 // vectors are a legal type, even though we have to split the loads
615 // usually. When we can more precisely specify load legality per address
616 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
617 // smarter so that they can figure out what to do in 2 iterations without all
618 // N > 4 stores on the same chain.
620
621 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
622 // about these during lowering.
623 MaxStoresPerMemcpy = 0xffffffff;
624 MaxStoresPerMemmove = 0xffffffff;
625 MaxStoresPerMemset = 0xffffffff;
626
627 // The expansion for 64-bit division is enormous.
629 addBypassSlowDiv(64, 32);
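 // With this bypass, a 64-bit division is guarded by a runtime check: when
 // both operands happen to fit in 32 bits, the much cheaper 32-bit division is
 // used instead of the full 64-bit expansion.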
630
631 setTargetDAGCombine({ISD::BITCAST, ISD::SHL,
637 ISD::STORE, ISD::FADD,
638 ISD::FSUB, ISD::FNEG,
639 ISD::FABS, ISD::AssertZext,
641
645}
646
647bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
648 if (getTargetMachine().Options.NoSignedZerosFPMath)
649 return true;
650
651 const auto Flags = Op.getNode()->getFlags();
652 if (Flags.hasNoSignedZeros())
653 return true;
654
655 return false;
656}
657
658//===----------------------------------------------------------------------===//
659// Target Information
660//===----------------------------------------------------------------------===//
661
663static bool fnegFoldsIntoOpcode(unsigned Opc) {
664 switch (Opc) {
665 case ISD::FADD:
666 case ISD::FSUB:
667 case ISD::FMUL:
668 case ISD::FMA:
669 case ISD::FMAD:
670 case ISD::FMINNUM:
671 case ISD::FMAXNUM:
672 case ISD::FMINNUM_IEEE:
673 case ISD::FMAXNUM_IEEE:
674 case ISD::FMINIMUM:
675 case ISD::FMAXIMUM:
676 case ISD::FMINIMUMNUM:
677 case ISD::FMAXIMUMNUM:
678 case ISD::SELECT:
679 case ISD::FSIN:
680 case ISD::FTRUNC:
681 case ISD::FRINT:
682 case ISD::FNEARBYINT:
683 case ISD::FROUNDEVEN:
685 case AMDGPUISD::RCP:
686 case AMDGPUISD::RCP_LEGACY:
687 case AMDGPUISD::RCP_IFLAG:
688 case AMDGPUISD::SIN_HW:
689 case AMDGPUISD::FMUL_LEGACY:
690 case AMDGPUISD::FMIN_LEGACY:
691 case AMDGPUISD::FMAX_LEGACY:
692 case AMDGPUISD::FMED3:
693 // TODO: handle llvm.amdgcn.fma.legacy
694 return true;
695 case ISD::BITCAST:
696 llvm_unreachable("bitcast is special cased");
697 default:
698 return false;
699 }
700}
701
702static bool fnegFoldsIntoOp(const SDNode *N) {
703 unsigned Opc = N->getOpcode();
704 if (Opc == ISD::BITCAST) {
705 // TODO: Is there a benefit to checking the conditions performFNegCombine
706 // does? We don't for the other cases.
707 SDValue BCSrc = N->getOperand(0);
708 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
709 return BCSrc.getNumOperands() == 2 &&
710 BCSrc.getOperand(1).getValueSizeInBits() == 32;
711 }
712
713 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
714 }
715
716 return fnegFoldsIntoOpcode(Opc);
717}
718
719/// \returns true if the operation will definitely need to use a 64-bit
720/// encoding, and thus will use a VOP3 encoding regardless of the source
721/// modifiers.
723static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
724 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
725 VT == MVT::f64;
726}
727
728/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the
729/// type for ISD::SELECT.
731static bool selectSupportsSourceMods(const SDNode *N) {
732 // TODO: Only applies if select will be vector
733 return N->getValueType(0) == MVT::f32;
734}
735
736// Most FP instructions support source modifiers, but this could be refined
737// slightly.
739static bool hasSourceMods(const SDNode *N) {
740 if (isa<MemSDNode>(N))
741 return false;
742
743 switch (N->getOpcode()) {
744 case ISD::CopyToReg:
745 case ISD::FDIV:
746 case ISD::FREM:
747 case ISD::INLINEASM:
748 case ISD::INLINEASM_BR:
749 case AMDGPUISD::DIV_SCALE:
750 return false;
751
752 // TODO: Should really be looking at the users of the bitcast. These are
753 // problematic because bitcasts are used to legalize all stores to integer
754 // types.
755 case ISD::BITCAST:
756 return false;
757 case ISD::INTRINSIC_WO_CHAIN: {
758 switch (N->getConstantOperandVal(0)) {
759 case Intrinsic::amdgcn_interp_p1:
760 case Intrinsic::amdgcn_interp_p2:
761 case Intrinsic::amdgcn_interp_mov:
762 case Intrinsic::amdgcn_interp_p1_f16:
763 case Intrinsic::amdgcn_interp_p2_f16:
764 return false;
765 default:
766 return true;
767 }
768 }
769 case ISD::SELECT:
770 return selectSupportsSourceMods(N);
771 default:
772 return true;
773 }
774}
775
776static bool allUsesHaveSourceMods(const SDNode *N,
777 unsigned CostThreshold) {
778 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
779 // a source modifier is truly free for them. If there are multiple users, and
780 // using a source modifier forces each of them into a VOP3 encoding, there
781 // will be a code size increase. Try to avoid increasing code size unless we
782 // know it will save on the instruction count.
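 // For example, folding an fneg into a VOP2 user such as v_add_f32 forces that
 // instruction into the 64-bit VOP3 encoding to carry the source modifier,
 // while a 3-operand user such as v_fma_f32 is already VOP3 and takes the
 // modifier for free.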
783 unsigned NumMayIncreaseSize = 0;
784 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
785
786 assert(!N->use_empty());
787
788 // XXX - Should this limit number of uses to check?
789 for (const SDNode *U : N->users()) {
790 if (!hasSourceMods(U))
791 return false;
792
793 if (!opMustUseVOP3Encoding(U, VT)) {
794 if (++NumMayIncreaseSize > CostThreshold)
795 return false;
796 }
797 }
798
799 return true;
800}
801
802EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
803 ISD::NodeType ExtendKind) const {
804 assert(!VT.isVector() && "only scalar expected");
805
806 // Round to the next multiple of 32-bits.
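 // For example, an i40 return value is widened to i64, while i96 is already a
 // multiple of 32 bits and is left as i96.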
807 unsigned Size = VT.getSizeInBits();
808 if (Size <= 32)
809 return MVT::i32;
810 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
811}
812
814 return 32;
815}
816
818 return true;
819}
820
821// The backend supports 32 and 64 bit floating point immediates.
822// FIXME: Why are we reporting vectors of FP immediates as legal?
823bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
824 bool ForCodeSize) const {
825 EVT ScalarVT = VT.getScalarType();
826 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
827 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
828}
829
830// We don't want to shrink f64 / f32 constants.
831bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
832 EVT ScalarVT = VT.getScalarType();
833 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
834}
835
836bool AMDGPUTargetLowering::shouldReduceLoadWidth(
837 SDNode *N, ISD::LoadExtType ExtTy, EVT NewVT,
838 std::optional<unsigned> ByteOffset) const {
839 // TODO: This may be worth removing. Check regression tests for diffs.
840 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT, ByteOffset))
841 return false;
842
843 unsigned NewSize = NewVT.getStoreSizeInBits();
844
845 // If we are reducing to a 32-bit load or a smaller multi-dword load,
846 // this is always better.
847 if (NewSize >= 32)
848 return true;
849
850 EVT OldVT = N->getValueType(0);
851 unsigned OldSize = OldVT.getStoreSizeInBits();
852
853 const MemSDNode *MN = cast<MemSDNode>(N);
854 unsigned AS = MN->getAddressSpace();
855 // Do not shrink an aligned scalar load to sub-dword.
856 // Scalar engine cannot do sub-dword loads.
857 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
858 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
862 MN->isInvariant())) &&
864 return false;
865
866 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
867 // extloads, so doing one requires using a buffer_load. In cases where we
868 // still couldn't use a scalar load, using the wider load shouldn't really
869 // hurt anything.
870
871 // If the old size already had to be an extload, there's no harm in continuing
872 // to reduce the width.
873 return (OldSize < 32);
874}
875
876bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
877 const SelectionDAG &DAG,
878 const MachineMemOperand &MMO) const {
879
880 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
881
882 if (LoadTy.getScalarType() == MVT::i32)
883 return false;
884
885 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
886 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
887
888 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
889 return false;
890
891 unsigned Fast = 0;
893 CastTy, MMO, &Fast) &&
894 Fast;
895}
896
897// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
898// profitable with the expansion for 64-bit since it's generally good to
899// speculate things.
900bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
901 return true;
902}
903
904bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
905 return true;
906}
907
908bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
909 switch (N->getOpcode()) {
910 case ISD::EntryToken:
911 case ISD::TokenFactor:
912 return true;
913 case ISD::INTRINSIC_WO_CHAIN: {
914 unsigned IntrID = N->getConstantOperandVal(0);
915 return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
916 }
917 case ISD::INTRINSIC_W_CHAIN: {
918 unsigned IntrID = N->getConstantOperandVal(1);
919 return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
920 }
921 case ISD::LOAD:
922 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
924 return true;
925 return false;
926 case AMDGPUISD::SETCC: // ballot-style instruction
927 return true;
928 }
929 return false;
930}
931
932SDValue AMDGPUTargetLowering::getNegatedExpression(
933 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
934 NegatibleCost &Cost, unsigned Depth) const {
935
936 switch (Op.getOpcode()) {
937 case ISD::FMA:
938 case ISD::FMAD: {
939 // Negating a fma is not free if it has users without source mods.
940 if (!allUsesHaveSourceMods(Op.getNode()))
941 return SDValue();
942 break;
943 }
944 case AMDGPUISD::RCP: {
945 SDValue Src = Op.getOperand(0);
946 EVT VT = Op.getValueType();
947 SDLoc SL(Op);
948
949 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
950 ForCodeSize, Cost, Depth + 1);
951 if (NegSrc)
952 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
953 return SDValue();
954 }
955 default:
956 break;
957 }
958
959 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
960 ForCodeSize, Cost, Depth);
961}
962
963//===---------------------------------------------------------------------===//
964// Target Properties
965//===---------------------------------------------------------------------===//
966
967bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
968 assert(VT.isFloatingPoint());
969
970 // Packed operations do not have a fabs modifier.
971 return VT == MVT::f32 || VT == MVT::f64 ||
972 (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16));
973}
974
976bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
977 // Report this based on the end legalized type.
978 VT = VT.getScalarType();
979 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
980}
981
983 unsigned NumElem,
984 unsigned AS) const {
985 return true;
986}
987
989 // There are few operations which truly have vector input operands. Any vector
990 // operation is going to involve operations on each component, and a
991 // build_vector will be a copy per element, so it always makes sense to use a
992 // build_vector input in place of the extracted element to avoid a copy into a
993 // super register.
994 //
995 // We should probably only do this if all users are extracts only, but this
996 // should be the common case.
997 return true;
998}
999
1000bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
1001 // Truncate is just accessing a subregister.
1002
1003 unsigned SrcSize = Source.getSizeInBits();
1004 unsigned DestSize = Dest.getSizeInBits();
1005
1006 return DestSize < SrcSize && DestSize % 32 == 0;
1007}
1008
1009bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
1010 // Truncate is just accessing a subregister.
1011
1012 unsigned SrcSize = Source->getScalarSizeInBits();
1013 unsigned DestSize = Dest->getScalarSizeInBits();
1014
1015 if (DestSize == 16 && Subtarget->has16BitInsts())
1016 return SrcSize >= 32;
1017
1018 return DestSize < SrcSize && DestSize % 32 == 0;
1019}
1020
1021bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
1022 unsigned SrcSize = Src->getScalarSizeInBits();
1023 unsigned DestSize = Dest->getScalarSizeInBits();
1024
1025 if (SrcSize == 16 && Subtarget->has16BitInsts())
1026 return DestSize >= 32;
1027
1028 return SrcSize == 32 && DestSize == 64;
1029}
1030
1031bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
1032 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
1033 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
1034 // this will enable reducing 64-bit operations to 32-bit, which is always
1035 // good.
1036
1037 if (Src == MVT::i16)
1038 return Dest == MVT::i32 || Dest == MVT::i64;
1039
1040 return Src == MVT::i32 && Dest == MVT::i64;
1041}
1042
1043bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
1044 EVT DestVT) const {
1045 switch (N->getOpcode()) {
1046 case ISD::ADD:
1047 case ISD::SUB:
1048 case ISD::SHL:
1049 case ISD::SRL:
1050 case ISD::SRA:
1051 case ISD::AND:
1052 case ISD::OR:
1053 case ISD::XOR:
1054 case ISD::MUL:
1055 case ISD::SETCC:
1056 case ISD::SELECT:
1057 case ISD::SMIN:
1058 case ISD::SMAX:
1059 case ISD::UMIN:
1060 case ISD::UMAX:
1061 if (Subtarget->has16BitInsts() &&
1062 (!DestVT.isVector() || !Subtarget->hasVOP3PInsts())) {
1063 // Don't narrow back down to i16 if promoted to i32 already.
1064 if (!N->isDivergent() && DestVT.isInteger() &&
1065 DestVT.getScalarSizeInBits() > 1 &&
1066 DestVT.getScalarSizeInBits() <= 16 &&
1067 SrcVT.getScalarSizeInBits() > 16) {
1068 return false;
1069 }
1070 }
1071 return true;
1072 default:
1073 break;
1074 }
1075
1076 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
1077 // limited number of native 64-bit operations. Shrinking an operation to fit
1078 // in a single 32-bit register should always be helpful. As currently used,
1079 // this is much less general than the name suggests, and is only used in
1080 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
1081 // not profitable, and may actually be harmful.
1082 if (isa<LoadSDNode>(N))
1083 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
1084
1085 return true;
1086}
1087
1088bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
1089 const SDNode *N, CombineLevel Level) const {
1090 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
1091 N->getOpcode() == ISD::SRL) &&
1092 "Expected shift op");
1093
1094 SDValue ShiftLHS = N->getOperand(0);
1095 if (!ShiftLHS->hasOneUse())
1096 return false;
1097
1098 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
1099 !ShiftLHS.getOperand(0)->hasOneUse())
1100 return false;
1101
1102 // Always commute pre-type legalization and right shifts.
1103 // We're looking for shl(or(x,y),z) patterns.
1105 N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
1106 return true;
1107
1108 // If only user is a i32 right-shift, then don't destroy a BFE pattern.
1109 if (N->getValueType(0) == MVT::i32 && N->hasOneUse() &&
1110 (N->user_begin()->getOpcode() == ISD::SRA ||
1111 N->user_begin()->getOpcode() == ISD::SRL))
1112 return false;
1113
1114 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
1115 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
1116 if (LHS.getOpcode() != ISD::SHL)
1117 return false;
1118 auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
1119 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
1120 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
1121 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
1122 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
1123 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
1124 };
1125 SDValue LHS = N->getOperand(0).getOperand(0);
1126 SDValue RHS = N->getOperand(0).getOperand(1);
1127 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
1128}
1129
1130//===---------------------------------------------------------------------===//
1131// TargetLowering Callbacks
1132//===---------------------------------------------------------------------===//
1133
1135 bool IsVarArg) {
1136 switch (CC) {
1144 return CC_AMDGPU;
1147 return CC_AMDGPU_CS_CHAIN;
1148 case CallingConv::C:
1149 case CallingConv::Fast:
1150 case CallingConv::Cold:
1151 return CC_AMDGPU_Func;
1154 return CC_SI_Gfx;
1157 default:
1158 reportFatalUsageError("unsupported calling convention for call");
1159 }
1160}
1161
1163 bool IsVarArg) {
1164 switch (CC) {
1167 llvm_unreachable("kernels should not be handled here");
1177 return RetCC_SI_Shader;
1180 return RetCC_SI_Gfx;
1181 case CallingConv::C:
1182 case CallingConv::Fast:
1183 case CallingConv::Cold:
1184 return RetCC_AMDGPU_Func;
1185 default:
1186 reportFatalUsageError("unsupported calling convention");
1187 }
1188}
1189
1190/// The SelectionDAGBuilder will automatically promote function arguments
1191/// with illegal types. However, this does not work for the AMDGPU targets
1192/// since the function arguments are stored in memory as these illegal types.
1193/// In order to handle this properly we need to get the original type sizes
1194/// from the LLVM IR Function and fix up the ISD::InputArg values before
1195/// passing them to AnalyzeFormalArguments()
1196
1197/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1198/// input values across multiple registers. Each item in the Ins array
1199/// represents a single value that will be stored in registers. Ins[x].VT is
1200/// the value type of the value that will be stored in the register, so
1201/// whatever SDNode we lower the argument to needs to be this type.
1202///
1203/// In order to correctly lower the arguments we need to know the size of each
1204/// argument. Since Ins[x].VT gives us the size of the register that will
1205/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1206/// for the original function argument so that we can deduce the correct memory
1207/// type to use for Ins[x]. In most cases the correct memory type will be
1208/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1209/// we have a kernel argument of type v8i8, this argument will be split into
1210/// 8 parts and each part will be represented by its own item in the Ins array.
1211/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1212/// the argument before it was split. From this, we deduce that the memory type
1213/// for each individual part is i8. We pass the memory type as LocVT to the
1214/// calling convention analysis function and the register type (Ins[x].VT) as
1215/// the ValVT.
1216void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
1217 CCState &State,
1218 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1219 const MachineFunction &MF = State.getMachineFunction();
1220 const Function &Fn = MF.getFunction();
1221 LLVMContext &Ctx = Fn.getContext();
1222 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
1223 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
1224 CallingConv::ID CC = Fn.getCallingConv();
1225
1226 Align MaxAlign = Align(1);
1227 uint64_t ExplicitArgOffset = 0;
1228 const DataLayout &DL = Fn.getDataLayout();
1229
1230 unsigned InIndex = 0;
1231
1232 for (const Argument &Arg : Fn.args()) {
1233 const bool IsByRef = Arg.hasByRefAttr();
1234 Type *BaseArgTy = Arg.getType();
1235 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1236 Align Alignment = DL.getValueOrABITypeAlignment(
1237 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1238 MaxAlign = std::max(Alignment, MaxAlign);
1239 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1240
1241 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1242 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1243
1244 // We're basically throwing away everything passed into us and starting over
1245 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1246 // to us as computed in Ins.
1247 //
1248 // We also need to figure out what type legalization is trying to do to get
1249 // the correct memory offsets.
1250
1251 SmallVector<EVT, 16> ValueVTs;
1252 SmallVector<uint64_t, 16> Offsets;
1253 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, /*MemVTs=*/nullptr,
1254 &Offsets, ArgOffset);
1255
1256 for (unsigned Value = 0, NumValues = ValueVTs.size();
1257 Value != NumValues; ++Value) {
1258 uint64_t BasePartOffset = Offsets[Value];
1259
1260 EVT ArgVT = ValueVTs[Value];
1261 EVT MemVT = ArgVT;
1262 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1263 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1264
1265 if (NumRegs == 1) {
1266 // This argument is not split, so the IR type is the memory type.
1267 if (ArgVT.isExtended()) {
1268 // We have an extended type, like i24, so we should just use the
1269 // register type.
1270 MemVT = RegisterVT;
1271 } else {
1272 MemVT = ArgVT;
1273 }
1274 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1275 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1276 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1277 // We have a vector value which has been split into a vector with
1278 // the same scalar type, but fewer elements. This should handle
1279 // all the floating-point vector types.
1280 MemVT = RegisterVT;
1281 } else if (ArgVT.isVector() &&
1282 ArgVT.getVectorNumElements() == NumRegs) {
1283 // This arg has been split so that each element is stored in a separate
1284 // register.
1285 MemVT = ArgVT.getScalarType();
1286 } else if (ArgVT.isExtended()) {
1287 // We have an extended type, like i65.
1288 MemVT = RegisterVT;
1289 } else {
1290 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1291 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1292 if (RegisterVT.isInteger()) {
1293 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1294 } else if (RegisterVT.isVector()) {
1295 assert(!RegisterVT.getScalarType().isFloatingPoint());
1296 unsigned NumElements = RegisterVT.getVectorNumElements();
1297 assert(MemoryBits % NumElements == 0);
1298 // This vector type has been split into another vector type with
1299 // a different element size.
1300 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1301 MemoryBits / NumElements);
1302 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1303 } else {
1304 llvm_unreachable("cannot deduce memory type.");
1305 }
1306 }
1307
1308 // Convert one element vectors to scalar.
1309 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1310 MemVT = MemVT.getScalarType();
1311
1312 // Round up vec3/vec5 argument.
1313 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1314 MemVT = MemVT.getPow2VectorType(State.getContext());
1315 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1316 MemVT = MemVT.getRoundIntegerType(State.getContext());
1317 }
1318
1319 unsigned PartOffset = 0;
1320 for (unsigned i = 0; i != NumRegs; ++i) {
1321 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1322 BasePartOffset + PartOffset,
1323 MemVT.getSimpleVT(),
1324 CCValAssign::Full));
1325 PartOffset += MemVT.getStoreSize();
1326 }
1327 }
1328 }
1329}
1330
1331SDValue AMDGPUTargetLowering::LowerReturn(
1332 SDValue Chain, CallingConv::ID CallConv,
1333 bool isVarArg,
1334 const SmallVectorImpl<ISD::OutputArg> &Outs,
1335 const SmallVectorImpl<SDValue> &OutVals,
1336 const SDLoc &DL, SelectionDAG &DAG) const {
1337 // FIXME: Fails for r600 tests
1338 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1339 // "wave terminate should not have return values");
1340 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1341}
1342
1343//===---------------------------------------------------------------------===//
1344// Target specific lowering
1345//===---------------------------------------------------------------------===//
1346
1347/// Selects the correct CCAssignFn for a given CallingConvention value.
1352
1357
1359 SelectionDAG &DAG,
1360 MachineFrameInfo &MFI,
1361 int ClobberedFI) const {
1362 SmallVector<SDValue, 8> ArgChains;
1363 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1364 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1365
1366 // Include the original chain at the beginning of the list. When this is
1367 // used by target LowerCall hooks, this helps legalize find the
1368 // CALLSEQ_BEGIN node.
1369 ArgChains.push_back(Chain);
1370
1371 // Add a chain value for each stack argument that overlaps the clobbered
 // frame index.
1372 for (SDNode *U : DAG.getEntryNode().getNode()->users()) {
1373 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1374 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1375 if (FI->getIndex() < 0) {
1376 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1377 int64_t InLastByte = InFirstByte;
1378 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1379
1380 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1381 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1382 ArgChains.push_back(SDValue(L, 1));
1383 }
1384 }
1385 }
1386 }
1387
1388 // Build a tokenfactor for all the chains.
1389 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1390}
1391
1392SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
1393 SmallVectorImpl<SDValue> &InVals,
1394 StringRef Reason) const {
1395 SDValue Callee = CLI.Callee;
1396 SelectionDAG &DAG = CLI.DAG;
1397
1398 const Function &Fn = DAG.getMachineFunction().getFunction();
1399
1400 StringRef FuncName("<unknown>");
1401
1402 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1403 FuncName = G->getSymbol();
1404 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1405 FuncName = G->getGlobal()->getName();
1406
1407 DAG.getContext()->diagnose(
1408 DiagnosticInfoUnsupported(Fn, Reason + FuncName, CLI.DL.getDebugLoc()));
1409
1410 if (!CLI.IsTailCall) {
1411 for (ISD::InputArg &Arg : CLI.Ins)
1412 InVals.push_back(DAG.getPOISON(Arg.VT));
1413 }
1414
1415 return DAG.getEntryNode();
1416}
1417
1418SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
1419 SmallVectorImpl<SDValue> &InVals) const {
1420 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1421}
1422
1423SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1424 SelectionDAG &DAG) const {
1425 const Function &Fn = DAG.getMachineFunction().getFunction();
1426
1428 Fn, "unsupported dynamic alloca", SDLoc(Op).getDebugLoc()));
1429 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1430 return DAG.getMergeValues(Ops, SDLoc());
1431}
1432
1433SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
1434 SelectionDAG &DAG) const {
1435 switch (Op.getOpcode()) {
1436 default:
1437 Op->print(errs(), &DAG);
1438 llvm_unreachable("Custom lowering code for this "
1439 "instruction is not implemented yet!");
1440 break;
1442 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1444 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1445 case ISD::SDIVREM:
1446 return LowerSDIVREM(Op, DAG);
1447 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1448 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1449 case ISD::FRINT: return LowerFRINT(Op, DAG);
1450 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1451 case ISD::FROUNDEVEN:
1452 return LowerFROUNDEVEN(Op, DAG);
1453 case ISD::FROUND: return LowerFROUND(Op, DAG);
1454 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1455 case ISD::FLOG2:
1456 return LowerFLOG2(Op, DAG);
1457 case ISD::FLOG:
1458 case ISD::FLOG10:
1459 return LowerFLOGCommon(Op, DAG);
1460 case ISD::FEXP:
1461 case ISD::FEXP10:
1462 return lowerFEXP(Op, DAG);
1463 case ISD::FEXP2:
1464 return lowerFEXP2(Op, DAG);
1465 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1466 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1467 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1468 case ISD::FP_TO_SINT:
1469 case ISD::FP_TO_UINT:
1470 return LowerFP_TO_INT(Op, DAG);
1471 case ISD::CTTZ:
1472 case ISD::CTTZ_ZERO_UNDEF:
1473 case ISD::CTLZ:
1474 case ISD::CTLZ_ZERO_UNDEF:
1475 return LowerCTLZ_CTTZ(Op, DAG);
1476 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1477 }
1478 return Op;
1479}
1480
1481void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
1482 SmallVectorImpl<SDValue> &Results,
1483 SelectionDAG &DAG) const {
1484 switch (N->getOpcode()) {
1485 case ISD::SIGN_EXTEND_INREG:
1486 // Different parts of legalization seem to interpret which type of
1487 // sign_extend_inreg is the one to check for custom lowering. The extended
1488 // from type is what really matters, but some places check for custom
1489 // lowering of the result type. This results in trying to use
1490 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1491 // nothing here and let the illegal result integer be handled normally.
1492 return;
1493 case ISD::FLOG2:
1494 if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
1495 Results.push_back(Lowered);
1496 return;
1497 case ISD::FLOG:
1498 case ISD::FLOG10:
1499 if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
1500 Results.push_back(Lowered);
1501 return;
1502 case ISD::FEXP2:
1503 if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
1504 Results.push_back(Lowered);
1505 return;
1506 case ISD::FEXP:
1507 case ISD::FEXP10:
1508 if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
1509 Results.push_back(Lowered);
1510 return;
1511 case ISD::CTLZ:
1512 case ISD::CTLZ_ZERO_UNDEF:
1513 if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
1514 Results.push_back(Lowered);
1515 return;
1516 default:
1517 return;
1518 }
1519}
1520
1521SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
1522 SDValue Op,
1523 SelectionDAG &DAG) const {
1524
1525 const DataLayout &DL = DAG.getDataLayout();
1526 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1527 const GlobalValue *GV = G->getGlobal();
1528
1529 if (!MFI->isModuleEntryFunction()) {
1530 auto IsNamedBarrier = AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV));
1531 if (std::optional<uint32_t> Address =
1533 if (IsNamedBarrier) {
1534 unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
1535 MFI->recordNumNamedBarriers(Address.value(), BarCnt);
1536 }
1537 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1538 } else if (IsNamedBarrier) {
1539 llvm_unreachable("named barrier should have an assigned address");
1540 }
1541 }
1542
1543 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1544 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1545 if (!MFI->isModuleEntryFunction() &&
1546 GV->getName() != "llvm.amdgcn.module.lds" &&
1548 SDLoc DL(Op);
1549 const Function &Fn = DAG.getMachineFunction().getFunction();
1551 Fn, "local memory global used by non-kernel function",
1552 DL.getDebugLoc(), DS_Warning));
1553
1554 // We currently don't have a way to correctly allocate LDS objects that
1555 // aren't directly associated with a kernel. We do force inlining of
1556 // functions that use local objects. However, if these dead functions are
1557 // not eliminated, we don't want a compile time error. Just emit a warning
1558 // and a trap, since there should be no callable path here.
1559 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1560 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1561 Trap, DAG.getRoot());
1562 DAG.setRoot(OutputChain);
1563 return DAG.getPOISON(Op.getValueType());
1564 }
1565
1566 // XXX: What does the value of G->getOffset() mean?
1567 assert(G->getOffset() == 0 &&
1568 "Do not know what to do with an non-zero offset");
1569
1570 // TODO: We could emit code to handle the initialization somewhere.
1571 // We ignore the initializer for now and legalize it to allow selection.
1572 // The initializer will anyway get errored out during assembly emission.
1573 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1574 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1575 }
1576 return SDValue();
1577}
1578
1579SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1580 SelectionDAG &DAG) const {
1581 SmallVector<SDValue, 8> Args;
1582 SDLoc SL(Op);
1583
1584 EVT VT = Op.getValueType();
1585 if (VT.getVectorElementType().getSizeInBits() < 32) {
1586 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1587 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1588 unsigned NewNumElt = OpBitSize / 32;
1589 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1591 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NewNumElt);
1592 for (const SDUse &U : Op->ops()) {
1593 SDValue In = U.get();
1594 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
1595 if (NewNumElt > 1)
1596 DAG.ExtractVectorElements(NewIn, Args);
1597 else
1598 Args.push_back(NewIn);
1599 }
1600
1601 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1602 NewNumElt * Op.getNumOperands());
1603 SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1604 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1605 }
1606 }
1607
1608 for (const SDUse &U : Op->ops())
1609 DAG.ExtractVectorElements(U.get(), Args);
1610
1611 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1612}
1613
1614SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1615 SelectionDAG &DAG) const {
1616 SDLoc SL(Op);
1617 SmallVector<SDValue, 8> Args;
1618 unsigned Start = Op.getConstantOperandVal(1);
1619 EVT VT = Op.getValueType();
1620 EVT SrcVT = Op.getOperand(0).getValueType();
1621
1622 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1623 unsigned NumElt = VT.getVectorNumElements();
1624 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1625 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1626
1627 // Extract 32-bit registers at a time.
1628 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1629 EVT NewVT = NumElt == 2
1630 ? MVT::i32
1631 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1632 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1633
1634 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
1635 if (NumElt == 2)
1636 Tmp = Args[0];
1637 else
1638 Tmp = DAG.getBuildVector(NewVT, SL, Args);
1639
1640 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1641 }
1642
1643 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1644 VT.getVectorNumElements());
1645
1646 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1647}
1648
1649// TODO: Handle fabs too
1650static SDValue peekFNeg(SDValue Val) {
1651 if (Val.getOpcode() == ISD::FNEG)
1652 return Val.getOperand(0);
1653
1654 return Val;
1655}
1656
1658 if (Val.getOpcode() == ISD::FNEG)
1659 Val = Val.getOperand(0);
1660 if (Val.getOpcode() == ISD::FABS)
1661 Val = Val.getOperand(0);
1662 if (Val.getOpcode() == ISD::FCOPYSIGN)
1663 Val = Val.getOperand(0);
1664 return Val;
1665}
1666
1667SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl(
1668 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1669 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1670 SelectionDAG &DAG = DCI.DAG;
1671 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1672 switch (CCOpcode) {
1673 case ISD::SETOEQ:
1674 case ISD::SETONE:
1675 case ISD::SETUNE:
1676 case ISD::SETNE:
1677 case ISD::SETUEQ:
1678 case ISD::SETEQ:
1679 case ISD::SETFALSE:
1680 case ISD::SETFALSE2:
1681 case ISD::SETTRUE:
1682 case ISD::SETTRUE2:
1683 case ISD::SETUO:
1684 case ISD::SETO:
1685 break;
1686 case ISD::SETULE:
1687 case ISD::SETULT: {
1688 if (LHS == True)
1689 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1690 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1691 }
1692 case ISD::SETOLE:
1693 case ISD::SETOLT:
1694 case ISD::SETLE:
1695 case ISD::SETLT: {
1696 // Ordered. Assume ordered for undefined.
1697
1698 // Only do this after legalization to avoid interfering with other combines
1699 // which might occur.
1701 !DCI.isCalledByLegalizer())
1702 return SDValue();
1703
1704 // We need to permute the operands to get the correct NaN behavior. The
1705 // selected operand is the second one based on the failing compare with NaN,
1706 // so permute it based on the compare type the hardware uses.
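 // For example, select (setcc olt x, y), x, y becomes fmin_legacy(x, y), while
 // the swapped form select (setcc olt x, y), y, x becomes fmax_legacy(y, x).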
1707 if (LHS == True)
1708 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1709 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1710 }
1711 case ISD::SETUGE:
1712 case ISD::SETUGT: {
1713 if (LHS == True)
1714 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1715 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1716 }
1717 case ISD::SETGT:
1718 case ISD::SETGE:
1719 case ISD::SETOGE:
1720 case ISD::SETOGT: {
1722 !DCI.isCalledByLegalizer())
1723 return SDValue();
1724
1725 if (LHS == True)
1726 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1727 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1728 }
1729 case ISD::SETCC_INVALID:
1730 llvm_unreachable("Invalid setcc condcode!");
1731 }
1732 return SDValue();
1733}
1734
1735/// Generate Min/Max node
1737 SDValue LHS, SDValue RHS,
1738 SDValue True, SDValue False,
1739 SDValue CC,
1740 DAGCombinerInfo &DCI) const {
1741 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1742 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1743
1744 SelectionDAG &DAG = DCI.DAG;
1745
1746 // If we can't directly match this, try to see if we can fold an fneg to
1747 // match.
1748
1751 SDValue NegTrue = peekFNeg(True);
1752
1753 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1754 // fmin/fmax.
1755 //
1756 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1757 // -> fneg (fmin_legacy lhs, K)
1758 //
1759 // TODO: Use getNegatedExpression
1760 if (LHS == NegTrue && CFalse && CRHS) {
1761 APFloat NegRHS = neg(CRHS->getValueAPF());
1762 if (NegRHS == CFalse->getValueAPF()) {
1763 SDValue Combined =
1764 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1765 if (Combined)
1766 return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1767 return SDValue();
1768 }
1769 }
1770
1771 return SDValue();
1772}
1773
1774std::pair<SDValue, SDValue>
1775 AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1776 SDLoc SL(Op);
1777
1778 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1779
1780 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1781 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1782
1783 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1784 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1785
1786 return std::pair(Lo, Hi);
1787}
1788
1789 SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1790 SDLoc SL(Op);
1791
1792 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1793 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1794 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1795}
1796
1797 SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1798 SDLoc SL(Op);
1799
1800 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1801 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1802 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1803}
1804
1805// Split a vector type into two parts. The first part is a power of two vector.
1806// The second part is whatever is left over, and is a scalar if it would
1807// otherwise be a 1-vector.
1808std::pair<EVT, EVT>
1809 AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1810 EVT LoVT, HiVT;
1811 EVT EltVT = VT.getVectorElementType();
1812 unsigned NumElts = VT.getVectorNumElements();
1813 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1814 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1815 HiVT = NumElts - LoNumElts == 1
1816 ? EltVT
1817 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1818 return std::pair(LoVT, HiVT);
1819}
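
As a worked example of this split rule (illustrative sketch only; powerOf2Ceil and the printed table below are stand-ins, not code from this file), the Lo part takes PowerOf2Ceil((N + 1) / 2) elements and the Hi part takes whatever remains, degenerating to a scalar when only one element is left:

#include <cstdio>

// Stand-in for llvm::PowerOf2Ceil: smallest power of two >= x (for x > 0).
static unsigned powerOf2Ceil(unsigned x) {
  unsigned p = 1;
  while (p < x)
    p <<= 1;
  return p;
}

int main() {
  for (unsigned n : {3u, 5u, 7u, 9u, 12u}) {
    unsigned lo = powerOf2Ceil((n + 1) / 2);
    std::printf("v%u -> v%u + %u\n", n, lo, n - lo);
  }
  // v3 -> v2 + 1 (scalar), v5 -> v4 + 1, v7 -> v4 + 3, v9 -> v8 + 1, v12 -> v8 + 4.
  return 0;
}
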
1820
1821// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1822// scalar.
1823std::pair<SDValue, SDValue>
1824 AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1825 const EVT &LoVT, const EVT &HiVT,
1826 SelectionDAG &DAG) const {
1827 EVT VT = N.getValueType();
1828 assert(LoVT.getVectorNumElements() +
1829 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1830 VT.getVectorNumElements() &&
1831 "More vector elements requested than available!");
1832 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
1833 DAG.getVectorIdxConstant(0, DL));
1834
1835 unsigned LoNumElts = LoVT.getVectorNumElements();
1836
1837 if (HiVT.isVector()) {
1838 unsigned HiNumElts = HiVT.getVectorNumElements();
1839 if ((VT.getVectorNumElements() % HiNumElts) == 0) {
1840 // Avoid creating an extract_subvector with an index that isn't a multiple
1841 // of the result type.
1842 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, N,
1843 DAG.getConstant(LoNumElts, DL, MVT::i32));
1844 return {Lo, Hi};
1845 }
1846
1848 DAG.ExtractVectorElements(N, Elts, /*Start=*/LoNumElts,
1849 /*Count=*/HiNumElts);
1850 SDValue Hi = DAG.getBuildVector(HiVT, DL, Elts);
1851 return {Lo, Hi};
1852 }
1853
1854 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, HiVT, N,
1855 DAG.getVectorIdxConstant(LoNumElts, DL));
1856 return {Lo, Hi};
1857}
1858
1859 SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1860 SelectionDAG &DAG) const {
1861 LoadSDNode *Load = cast<LoadSDNode>(Op);
1862 EVT VT = Op.getValueType();
1863 SDLoc SL(Op);
1864
1865
1866 // If this is a 2 element vector, we really want to scalarize and not create
1867 // weird 1 element vectors.
1868 if (VT.getVectorNumElements() == 2) {
1869 SDValue Ops[2];
1870 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1871 return DAG.getMergeValues(Ops, SL);
1872 }
1873
1874 SDValue BasePtr = Load->getBasePtr();
1875 EVT MemVT = Load->getMemoryVT();
1876
1877 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1878
1879 EVT LoVT, HiVT;
1880 EVT LoMemVT, HiMemVT;
1881 SDValue Lo, Hi;
1882
1883 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1884 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1885 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1886
1887 unsigned Size = LoMemVT.getStoreSize();
1888 Align BaseAlign = Load->getAlign();
1889 Align HiAlign = commonAlignment(BaseAlign, Size);
1890
1891 SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1892 Load->getChain(), BasePtr, SrcValue, LoMemVT,
1893 BaseAlign, Load->getMemOperand()->getFlags());
1894 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
1895 SDValue HiLoad =
1896 DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1897 HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1898 HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1899
1900 SDValue Join;
1901 if (LoVT == HiVT) {
1902 // This is the case that the vector is power of two so was evenly split.
1903 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1904 } else {
1905 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getPOISON(VT), LoLoad,
1906 DAG.getVectorIdxConstant(0, SL));
1907 Join = DAG.getNode(
1908 ISD::INSERT_SUBVECTOR, SL,
1909 VT, Join, HiLoad,
1910 DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
1911 }
1912
1913 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1914 LoLoad.getValue(1), HiLoad.getValue(1))};
1915
1916 return DAG.getMergeValues(Ops, SL);
1917}
1918
1919 SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
1920 SelectionDAG &DAG) const {
1921 LoadSDNode *Load = cast<LoadSDNode>(Op);
1922 EVT VT = Op.getValueType();
1923 SDValue BasePtr = Load->getBasePtr();
1924 EVT MemVT = Load->getMemoryVT();
1925 SDLoc SL(Op);
1926 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1927 Align BaseAlign = Load->getAlign();
1928 unsigned NumElements = MemVT.getVectorNumElements();
1929
1930 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1931 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1932 if (NumElements != 3 ||
1933 (BaseAlign < Align(8) &&
1934 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1935 return SplitVectorLoad(Op, DAG);
1936
1937 assert(NumElements == 3);
1938
1939 EVT WideVT =
1940 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
1941 EVT WideMemVT =
1942 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
1943 SDValue WideLoad = DAG.getExtLoad(
1944 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1945 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1946 return DAG.getMergeValues(
1947 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1948 DAG.getVectorIdxConstant(0, SL)),
1949 WideLoad.getValue(1)},
1950 SL);
1951}
1952
1953 SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1954 SelectionDAG &DAG) const {
1955 StoreSDNode *Store = cast<StoreSDNode>(Op);
1956 SDValue Val = Store->getValue();
1957 EVT VT = Val.getValueType();
1958
1959 // If this is a 2 element vector, we really want to scalarize and not create
1960 // weird 1 element vectors.
1961 if (VT.getVectorNumElements() == 2)
1962 return scalarizeVectorStore(Store, DAG);
1963
1964 EVT MemVT = Store->getMemoryVT();
1965 SDValue Chain = Store->getChain();
1966 SDValue BasePtr = Store->getBasePtr();
1967 SDLoc SL(Op);
1968
1969 EVT LoVT, HiVT;
1970 EVT LoMemVT, HiMemVT;
1971 SDValue Lo, Hi;
1972
1973 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1974 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1975 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1976
1977 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1978
1979 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1980 Align BaseAlign = Store->getAlign();
1981 unsigned Size = LoMemVT.getStoreSize();
1982 Align HiAlign = commonAlignment(BaseAlign, Size);
1983
1984 SDValue LoStore =
1985 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1986 Store->getMemOperand()->getFlags());
1987 SDValue HiStore =
1988 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1989 HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1990
1991 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1992}
1993
1994// This is a shortcut for integer division because we have fast i32<->f32
1995// conversions, and fast f32 reciprocal instructions. The fractional part of a
1996// float is enough to accurately represent up to a 24-bit signed integer.
1997 SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1998 bool Sign) const {
1999 SDLoc DL(Op);
2000 EVT VT = Op.getValueType();
2001 SDValue LHS = Op.getOperand(0);
2002 SDValue RHS = Op.getOperand(1);
2003 MVT IntVT = MVT::i32;
2004 MVT FltVT = MVT::f32;
2005
2006 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
2007 if (LHSSignBits < 9)
2008 return SDValue();
2009
2010 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
2011 if (RHSSignBits < 9)
2012 return SDValue();
2013
2014 unsigned BitSize = VT.getSizeInBits();
2015 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
2016 unsigned DivBits = BitSize - SignBits;
2017 if (Sign)
2018 ++DivBits;
2019
2022
2023 SDValue jq = DAG.getConstant(1, DL, IntVT);
2024
2025 if (Sign) {
2026 // char|short jq = ia ^ ib;
2027 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
2028
2029 // jq = jq >> (bitsize - 2)
2030 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
2031 DAG.getConstant(BitSize - 2, DL, VT));
2032
2033 // jq = jq | 0x1
2034 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
2035 }
2036
2037 // int ia = (int)LHS;
2038 SDValue ia = LHS;
2039
2040 // int ib = (int)RHS;
2041 SDValue ib = RHS;
2042
2043 // float fa = (float)ia;
2044 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
2045
2046 // float fb = (float)ib;
2047 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
2048
2049 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
2050 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
2051
2052 // fq = trunc(fq);
2053 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
2054
2055 // float fqneg = -fq;
2056 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
2057
2059
2060 bool UseFmadFtz = false;
2061 if (Subtarget->isGCN()) {
2063 UseFmadFtz =
2065 }
2066
2067 // float fr = mad(fqneg, fb, fa);
2068 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2069 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
2070 : (unsigned)ISD::FMAD;
2071 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
2072
2073 // int iq = (int)fq;
2074 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
2075
2076 // fr = fabs(fr);
2077 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
2078
2079 // fb = fabs(fb);
2080 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
2081
2082 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2083
2084 // int cv = fr >= fb;
2085 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
2086
2087 // jq = (cv ? jq : 0);
2088 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
2089
2090 // dst = iq + jq;
2091 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
2092
2093 // Rem needs compensation, it's easier to recompute it
2094 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
2095 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
2096
2097 // Truncate to number of bits this divide really is.
2098 if (Sign) {
2099 SDValue InRegSize
2100 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
2101 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
2102 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
2103 } else {
2104 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
2105 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
2106 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
2107 }
2108
2109 return DAG.getMergeValues({ Div, Rem }, DL);
2110}
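
A standalone C++ sketch of the same 24-bit division trick (illustrative only; divrem24 and its exact float division are assumptions standing in for the hardware RCP, whose small error is the reason the code above re-checks fabs(fr) >= fabs(fb)):

#include <cmath>
#include <cstdint>
#include <utility>

// Signed variant. Preconditions mirror the sign-bit checks above: both
// operands must fit in 24 bits, so they convert to float exactly.
static std::pair<int32_t, int32_t> divrem24(int32_t a, int32_t b) {
  int32_t jq = ((a ^ b) >> 30) | 1;           // +1 or -1: sign of the quotient
  float fa = static_cast<float>(a);
  float fb = static_cast<float>(b);
  float fq = std::trunc(fa / fb);             // quotient estimate (fa * rcp(fb))
  float fr = std::fabs(fa - fq * fb);         // |remainder| of that estimate
  int32_t div = static_cast<int32_t>(fq) + (fr >= std::fabs(fb) ? jq : 0);
  int32_t rem = a - div * b;                  // remainder is simply recomputed
  return {div, rem};
}
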
2111
2112 void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
2113 SelectionDAG &DAG,
2114 SmallVectorImpl<SDValue> &Results) const {
2115 SDLoc DL(Op);
2116 EVT VT = Op.getValueType();
2117
2118 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2119
2120 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2121
2122 SDValue One = DAG.getConstant(1, DL, HalfVT);
2123 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
2124
2125 //HiLo split
2126 SDValue LHS_Lo, LHS_Hi;
2127 SDValue LHS = Op.getOperand(0);
2128 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
2129
2130 SDValue RHS_Lo, RHS_Hi;
2131 SDValue RHS = Op.getOperand(1);
2132 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2133
2134 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2135 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
2136
2137 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2138 LHS_Lo, RHS_Lo);
2139
2140 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2141 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2142
2143 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2144 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2145 return;
2146 }
2147
2148 if (isTypeLegal(MVT::i64)) {
2149 // The algorithm here is based on ideas from "Software Integer Division",
2150 // Tom Rodeheffer, August 2008.
2151
2154
2155 // Compute denominator reciprocal.
2156 unsigned FMAD =
2157 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2160 : (unsigned)AMDGPUISD::FMAD_FTZ;
2161
2162 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2163 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2164 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2165 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2166 Cvt_Lo);
2167 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2168 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2169 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2170 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2171 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2172 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2173 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2174 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2175 Mul1);
2176 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2177 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2178 SDValue Rcp64 = DAG.getBitcast(VT,
2179 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2180
2181 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2182 SDValue One64 = DAG.getConstant(1, DL, VT);
2183 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2184 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2185
2186 // First round of UNR (Unsigned integer Newton-Raphson).
2187 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2188 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2189 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2190 SDValue Mulhi1_Lo, Mulhi1_Hi;
2191 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2192 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2193 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2194 Mulhi1_Lo, Zero1);
2195 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2196 Mulhi1_Hi, Add1_Lo.getValue(1));
2197 SDValue Add1 = DAG.getBitcast(VT,
2198 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2199
2200 // Second round of UNR.
2201 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2202 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2203 SDValue Mulhi2_Lo, Mulhi2_Hi;
2204 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2205 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2206 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2207 Mulhi2_Lo, Zero1);
2208 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2209 Mulhi2_Hi, Add2_Lo.getValue(1));
2210 SDValue Add2 = DAG.getBitcast(VT,
2211 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2212
2213 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2214
2215 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2216
2217 SDValue Mul3_Lo, Mul3_Hi;
2218 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2219 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2220 Mul3_Lo, Zero1);
2221 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2222 Mul3_Hi, Sub1_Lo.getValue(1));
2223 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2224 SDValue Sub1 = DAG.getBitcast(VT,
2225 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2226
2227 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2228 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2229 ISD::SETUGE);
2230 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2231 ISD::SETUGE);
2232 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2233
2234 // TODO: Here and below portions of the code can be enclosed into if/endif.
2235 // Currently control flow is unconditional and we have 4 selects after
2236 // potential endif to substitute PHIs.
2237
2238 // if C3 != 0 ...
2239 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2240 RHS_Lo, Zero1);
2241 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2242 RHS_Hi, Sub1_Lo.getValue(1));
2243 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2244 Zero, Sub2_Lo.getValue(1));
2245 SDValue Sub2 = DAG.getBitcast(VT,
2246 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2247
2248 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2249
2250 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2251 ISD::SETUGE);
2252 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2253 ISD::SETUGE);
2254 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2255
2256 // if (C6 != 0)
2257 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2258
2259 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2260 RHS_Lo, Zero1);
2261 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2262 RHS_Hi, Sub2_Lo.getValue(1));
2263 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2264 Zero, Sub3_Lo.getValue(1));
2265 SDValue Sub3 = DAG.getBitcast(VT,
2266 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2267
2268 // endif C6
2269 // endif C3
2270
2271 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2272 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2273
2274 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2275 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2276
2277 Results.push_back(Div);
2278 Results.push_back(Rem);
2279
2280 return;
2281 }
2282
2283 // r600 expansion.
2284 // Get Speculative values
2285 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2286 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2287
2288 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2289 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2290 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2291
2292 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2293 SDValue DIV_Lo = Zero;
2294
2295 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2296
2297 for (unsigned i = 0; i < halfBitWidth; ++i) {
2298 const unsigned bitPos = halfBitWidth - i - 1;
2299 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2300 // Get value of high bit
2301 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2302 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2303 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2304
2305 // Shift
2306 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2307 // Add LHS high bit
2308 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2309
2310 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2311 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2312
2313 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2314
2315 // Update REM
2316 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2317 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2318 }
2319
2320 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2321 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2322 Results.push_back(DIV);
2323 Results.push_back(REM);
2324}
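
For reference, the restoring long division performed by the r600 expansion above, as a standalone C++ sketch (illustrative; the names are made up): the high half is divided speculatively when the divisor fits in 32 bits, then the low 32 dividend bits are shifted into the remainder one at a time.

#include <cstdint>
#include <utility>

static std::pair<uint64_t, uint64_t> udivrem64_bitwise(uint64_t lhs, uint64_t rhs) {
  uint32_t lhs_lo = static_cast<uint32_t>(lhs);
  uint64_t lhs_hi = lhs >> 32;
  // Speculative high-half results; only used when rhs fits in 32 bits.
  uint64_t div_hi = (rhs >> 32) ? 0 : lhs_hi / rhs;
  uint64_t rem = (rhs >> 32) ? lhs_hi : lhs_hi % rhs;
  uint32_t div_lo = 0;
  for (int bit = 31; bit >= 0; --bit) {
    rem = (rem << 1) | ((lhs_lo >> bit) & 1); // shift in the next dividend bit
    if (rem >= rhs) {                         // restoring step
      div_lo |= 1u << bit;
      rem -= rhs;
    }
  }
  return {(div_hi << 32) | div_lo, rem};
}
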
2325
2326 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2327 SelectionDAG &DAG) const {
2328 SDLoc DL(Op);
2329 EVT VT = Op.getValueType();
2330
2331 if (VT == MVT::i64) {
2332 SmallVector<SDValue, 2> Results;
2333 LowerUDIVREM64(Op, DAG, Results);
2334 return DAG.getMergeValues(Results, DL);
2335 }
2336
2337 if (VT == MVT::i32) {
2338 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2339 return Res;
2340 }
2341
2342 SDValue X = Op.getOperand(0);
2343 SDValue Y = Op.getOperand(1);
2344
2345 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2346 // algorithm used here.
2347
2348 // Initial estimate of inv(y).
2349 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2350
2351 // One round of UNR.
2352 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2353 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2354 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2355 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2356
2357 // Quotient/remainder estimate.
2358 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2359 SDValue R =
2360 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2361
2362 // First quotient/remainder refinement.
2363 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2364 SDValue One = DAG.getConstant(1, DL, VT);
2365 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2366 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2367 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2368 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2369 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2370
2371 // Second quotient/remainder refinement.
2372 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2373 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2374 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2375 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2376 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2377
2378 return DAG.getMergeValues({Q, R}, DL);
2379}
2380
2381 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2382 SelectionDAG &DAG) const {
2383 SDLoc DL(Op);
2384 EVT VT = Op.getValueType();
2385
2386 SDValue LHS = Op.getOperand(0);
2387 SDValue RHS = Op.getOperand(1);
2388
2389 SDValue Zero = DAG.getConstant(0, DL, VT);
2390 SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
2391
2392 if (VT == MVT::i32) {
2393 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2394 return Res;
2395 }
2396
2397 if (VT == MVT::i64 &&
2398 DAG.ComputeNumSignBits(LHS) > 32 &&
2399 DAG.ComputeNumSignBits(RHS) > 32) {
2400 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2401
2402 //HiLo split
2403 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2404 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2405 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2406 LHS_Lo, RHS_Lo);
2407 SDValue Res[2] = {
2408 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2409 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2410 };
2411 return DAG.getMergeValues(Res, DL);
2412 }
2413
2414 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2415 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2416 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2417 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2418
2419 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2420 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2421
2422 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2423 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2424
2425 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2426 SDValue Rem = Div.getValue(1);
2427
2428 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2429 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2430
2431 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2432 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2433
2434 SDValue Res[2] = {
2435 Div,
2436 Rem
2437 };
2438 return DAG.getMergeValues(Res, DL);
2439}
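
A minimal C++ sketch of the sign fixup used above (illustrative; sdivrem32 is not a function from this file): divide the magnitudes unsigned, then restore the quotient sign from sign(LHS) ^ sign(RHS) and the remainder sign from sign(LHS), using the two's-complement identity -x == (x ^ m) - m with m equal to 0 or all-ones.

#include <cstdint>
#include <utility>

static std::pair<int32_t, int32_t> sdivrem32(int32_t lhs, int32_t rhs) {
  uint32_t lsign = static_cast<uint32_t>(lhs >> 31);             // 0 or 0xffffffff
  uint32_t rsign = static_cast<uint32_t>(rhs >> 31);
  uint32_t dsign = lsign ^ rsign;                                // quotient sign mask
  uint32_t ulhs = (static_cast<uint32_t>(lhs) + lsign) ^ lsign;  // |lhs|
  uint32_t urhs = (static_cast<uint32_t>(rhs) + rsign) ^ rsign;  // |rhs|
  uint32_t q = ulhs / urhs;
  uint32_t r = ulhs % urhs;
  int32_t div = static_cast<int32_t>((q ^ dsign) - dsign);
  int32_t rem = static_cast<int32_t>((r ^ lsign) - lsign);
  return {div, rem};
}
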
2440
2441 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2442 SDLoc SL(Op);
2443 SDValue Src = Op.getOperand(0);
2444
2445 // result = trunc(src)
2446 // if (src > 0.0 && src != result)
2447 // result += 1.0
2448
2449 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2450
2451 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2452 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2453
2454 EVT SetCCVT =
2455 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2456
2457 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2458 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2459 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2460
2461 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2462 // TODO: Should this propagate fast-math-flags?
2463 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2464}
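
The same ceil-from-trunc recipe as a short C++ sketch (illustrative): add 1.0 only when the input is positive and not already an integer; NaNs fall through because the ordered x > 0.0 test fails for them.

#include <cmath>

static double ceil_via_trunc(double x) {
  double t = std::trunc(x);
  return t + ((x > 0.0 && x != t) ? 1.0 : 0.0);
}
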
2465
2466 static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2467 SelectionDAG &DAG) {
2468 const unsigned FractBits = 52;
2469 const unsigned ExpBits = 11;
2470
2471 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2472 Hi,
2473 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2474 DAG.getConstant(ExpBits, SL, MVT::i32));
2475 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2476 DAG.getConstant(1023, SL, MVT::i32));
2477
2478 return Exp;
2479}
2480
2481 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2482 SDLoc SL(Op);
2483 SDValue Src = Op.getOperand(0);
2484
2485 assert(Op.getValueType() == MVT::f64);
2486
2487 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2488
2489 // Extract the upper half, since this is where we will find the sign and
2490 // exponent.
2491 SDValue Hi = getHiHalf64(Src, DAG);
2492
2493 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2494
2495 const unsigned FractBits = 52;
2496
2497 // Extract the sign bit.
2498 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2499 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2500
2501 // Extend back to 64-bits.
2502 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2503 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2504
2505 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2506 const SDValue FractMask
2507 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2508
2509 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2510 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2511 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2512
2513 EVT SetCCVT =
2514 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2515
2516 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2517
2518 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2519 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2520
2521 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2522 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2523
2524 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2525}
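
A bit-level C++ sketch of the same f64 trunc (illustrative): mask away the fraction bits that encode the sub-integer part, keyed off the unbiased exponent extracted the same way as above.

#include <cstdint>
#include <cstring>

static double trunc_via_bits(double x) {
  uint64_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  int exp = static_cast<int>((bits >> 52) & 0x7ff) - 1023;   // unbiased exponent
  if (exp < 0)
    bits &= UINT64_C(1) << 63;                   // |x| < 1: keep only the sign
  else if (exp < 52)
    bits &= ~((UINT64_C(1) << (52 - exp)) - 1);  // clear the fractional bits
  // exp >= 52: already integral (or inf/nan); leave unchanged.
  double r;
  std::memcpy(&r, &bits, sizeof(r));
  return r;
}
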
2526
2528 SelectionDAG &DAG) const {
2529 SDLoc SL(Op);
2530 SDValue Src = Op.getOperand(0);
2531
2532 assert(Op.getValueType() == MVT::f64);
2533
2534 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2535 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2536 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2537
2538 // TODO: Should this propagate fast-math-flags?
2539
2540 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2541 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2542
2543 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2544
2545 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2546 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2547
2548 EVT SetCCVT =
2549 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2550 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2551
2552 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2553}
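
A C++ sketch of the add-and-subtract-2^52 trick used above (illustrative; it must be compiled without fast-math reassociation): for |x| below 2^52 the addition forces the fractional bits to be rounded off in the current nearest-even mode, and subtracting the constant back leaves the rounded value.

#include <cmath>

static double round_to_integral_via_magic(double x) {
  double s = std::copysign(0x1.0p+52, x);
  double r = (x + s) - s;
  // Inputs with |x| > 0x1.fffffffffffffp+51 are already integral; keep them.
  return std::fabs(x) > 0x1.fffffffffffffp+51 ? x : r;
}
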
2554
2555 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
2556 SelectionDAG &DAG) const {
2557 // FNEARBYINT and FRINT are the same, except in their handling of FP
2558 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2559 // rint, so just treat them as equivalent.
2560 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2561 Op.getOperand(0));
2562}
2563
2565 auto VT = Op.getValueType();
2566 auto Arg = Op.getOperand(0u);
2567 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2568}
2569
2570// XXX - May require not supporting f32 denormals?
2571
2572// Don't handle v2f16. The extra instructions to scalarize and repack around the
2573// compare and vselect end up producing worse code than scalarizing the whole
2574// operation.
2575 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2576 SDLoc SL(Op);
2577 SDValue X = Op.getOperand(0);
2578 EVT VT = Op.getValueType();
2579
2580 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2581
2582 // TODO: Should this propagate fast-math-flags?
2583
2584 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2585
2586 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2587
2588 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2589 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2590
2591 EVT SetCCVT =
2592 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2593
2594 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2595 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2596 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2597
2598 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2599 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2600}
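
The same round-half-away-from-zero expansion as a short C++ sketch (illustrative): take trunc(x) and add copysign(1.0, x) whenever the discarded fraction is at least one half.

#include <cmath>

static float round_via_trunc(float x) {
  float t = std::trunc(x);
  float step = (std::fabs(x - t) >= 0.5f) ? 1.0f : 0.0f;
  return t + std::copysign(step, x);
}
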
2601
2602 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2603 SDLoc SL(Op);
2604 SDValue Src = Op.getOperand(0);
2605
2606 // result = trunc(src);
2607 // if (src < 0.0 && src != result)
2608 // result += -1.0.
2609
2610 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2611
2612 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2613 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2614
2615 EVT SetCCVT =
2616 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2617
2618 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2619 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2620 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2621
2622 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2623 // TODO: Should this propagate fast-math-flags?
2624 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2625}
2626
2627/// Return true if it's known that \p Src can never be an f32 denormal value.
2628 static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2629 switch (Src.getOpcode()) {
2630 case ISD::FP_EXTEND:
2631 return Src.getOperand(0).getValueType() == MVT::f16;
2632 case ISD::FP16_TO_FP:
2633 case ISD::FFREXP:
2634 return true;
2635 case ISD::INTRINSIC_WO_CHAIN: {
2636 unsigned IntrinsicID = Src.getConstantOperandVal(0);
2637 switch (IntrinsicID) {
2638 case Intrinsic::amdgcn_frexp_mant:
2639 return true;
2640 default:
2641 return false;
2642 }
2643 }
2644 default:
2645 return false;
2646 }
2647
2648 llvm_unreachable("covered opcode switch");
2649}
2650
2651 static bool allowApproxFunc(const SelectionDAG &DAG,
2652 SDNodeFlags Flags) {
2653 return Flags.hasApproximateFuncs();
2654}
2655
2664
2665 SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2666 SDValue Src,
2667 SDNodeFlags Flags) const {
2668 SDLoc SL(Src);
2669 EVT VT = Src.getValueType();
2670 const fltSemantics &Semantics = VT.getFltSemantics();
2671 SDValue SmallestNormal =
2672 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2673
2674 // Want to scale denormals up, but negatives and 0 work just as well on the
2675 // scaled path.
2676 SDValue IsLtSmallestNormal = DAG.getSetCC(
2677 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2678 SmallestNormal, ISD::SETOLT);
2679
2680 return IsLtSmallestNormal;
2681}
2682
2683 SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
2684 SDNodeFlags Flags) const {
2685 SDLoc SL(Src);
2686 EVT VT = Src.getValueType();
2687 const fltSemantics &Semantics = VT.getFltSemantics();
2688 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2689
2690 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2691 SDValue IsFinite = DAG.getSetCC(
2692 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2693 Inf, ISD::SETOLT);
2694 return IsFinite;
2695}
2696
2697/// If denormal handling is required return the scaled input to FLOG2, and the
2698/// check for denormal range. Otherwise, return null values.
2699std::pair<SDValue, SDValue>
2700 AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc SL,
2701 SDValue Src, SDNodeFlags Flags) const {
2702 if (!needsDenormHandlingF32(DAG, Src, Flags))
2703 return {};
2704
2705 MVT VT = MVT::f32;
2706 const fltSemantics &Semantics = APFloat::IEEEsingle();
2707 SDValue SmallestNormal =
2708 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2709
2710 SDValue IsLtSmallestNormal = DAG.getSetCC(
2711 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2712 SmallestNormal, ISD::SETOLT);
2713
2714 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2715 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2716 SDValue ScaleFactor =
2717 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2718
2719 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2720 return {ScaledInput, IsLtSmallestNormal};
2721}
2722
2723 SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
2724 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2725 // If we have to handle denormals, scale up the input and adjust the result.
2726
2727 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2728 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
2729
2730 SDLoc SL(Op);
2731 EVT VT = Op.getValueType();
2732 SDValue Src = Op.getOperand(0);
2733 SDNodeFlags Flags = Op->getFlags();
2734
2735 if (VT == MVT::f16) {
2736 // Nothing in half is a denormal when promoted to f32.
2737 assert(!Subtarget->has16BitInsts());
2738 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2739 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2740 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2741 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2742 }
2743
2744 auto [ScaledInput, IsLtSmallestNormal] =
2745 getScaledLogInput(DAG, SL, Src, Flags);
2746 if (!ScaledInput)
2747 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2748
2749 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2750
2751 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2752 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2753 SDValue ResultOffset =
2754 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2755 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2756}
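
A C++ sketch of the scaling identity this relies on (illustrative; std::log2 stands in for the hardware log instruction, which is the part that mishandles denormals): log2(x) == log2(x * 2^32) - 32, so denormal inputs are first scaled into the normal range.

#include <cmath>

static float log2_with_denorm_scaling(float x) {
  bool scaled = x < 0x1.0p-126f;            // denormal (negatives and zero take
  float in = scaled ? x * 0x1.0p+32f : x;   // the scaled path harmlessly)
  float l = std::log2(in);
  return scaled ? l - 32.0f : l;
}
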
2757
2758static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2759 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2760 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2761 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2762}
2763
2764 SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
2765 SelectionDAG &DAG) const {
2766 SDValue X = Op.getOperand(0);
2767 EVT VT = Op.getValueType();
2768 SDNodeFlags Flags = Op->getFlags();
2769 SDLoc DL(Op);
2770
2771 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2772 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2773
2774 const auto &Options = getTargetMachine().Options;
2775 if (VT == MVT::f16 || Flags.hasApproximateFuncs()) {
2776
2777 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2778 // Log and multiply in f32 is good enough for f16.
2779 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2780 }
2781
2782 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2783 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2784 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2785 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2786 }
2787
2788 return Lowered;
2789 }
2790
2791 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
2792 if (ScaledInput)
2793 X = ScaledInput;
2794
2795 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2796
2797 SDValue R;
2798 if (Subtarget->hasFastFMAF32()) {
2799 // c+cc are ln(2)/ln(10) to more than 49 bits
2800 const float c_log10 = 0x1.344134p-2f;
2801 const float cc_log10 = 0x1.09f79ep-26f;
2802
2803 // c + cc is ln(2) to more than 49 bits
2804 const float c_log = 0x1.62e42ep-1f;
2805 const float cc_log = 0x1.efa39ep-25f;
2806
2807 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2808 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2809
2810 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2811 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2812 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2813 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2814 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2815 } else {
2816 // ch+ct is ln(2)/ln(10) to more than 36 bits
2817 const float ch_log10 = 0x1.344000p-2f;
2818 const float ct_log10 = 0x1.3509f6p-18f;
2819
2820 // ch + ct is ln(2) to more than 36 bits
2821 const float ch_log = 0x1.62e000p-1f;
2822 const float ct_log = 0x1.0bfbe8p-15f;
2823
2824 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2825 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2826
2827 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2828 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2829 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2830 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2831 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2832
2833 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2834 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2835 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
2836 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2837 }
2838
2839 const bool IsFiniteOnly =
2840 (Flags.hasNoNaNs() || Options.NoNaNsFPMath) && Flags.hasNoInfs();
2841
2842 // TODO: Check if known finite from source value.
2843 if (!IsFiniteOnly) {
2844 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2845 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2846 }
2847
2848 if (IsScaled) {
2849 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2850 SDValue ShiftK =
2851 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2852 SDValue Shift =
2853 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2854 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2855 }
2856
2857 return R;
2858}
2859
2863
2864// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a
2865 // promoted f16 operation.
2866 SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
2867 SelectionDAG &DAG, bool IsLog10,
2868 SDNodeFlags Flags) const {
2869 EVT VT = Src.getValueType();
2870 unsigned LogOp =
2871 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2872
2873 double Log2BaseInverted =
2874 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
2875
2876 if (VT == MVT::f32) {
2877 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2878 if (ScaledInput) {
2879 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2880 SDValue ScaledResultOffset =
2881 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2882
2883 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2884
2885 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2886 ScaledResultOffset, Zero, Flags);
2887
2888 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2889
2890 if (Subtarget->hasFastFMAF32())
2891 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2892 Flags);
2893 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
2894 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2895 }
2896 }
2897
2898 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2899 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2900
2901 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2902 Flags);
2903}
2904
2905 SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
2906 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2907 // If we have to handle denormals, scale up the input and adjust the result.
2908
2909 SDLoc SL(Op);
2910 EVT VT = Op.getValueType();
2911 SDValue Src = Op.getOperand(0);
2912 SDNodeFlags Flags = Op->getFlags();
2913
2914 if (VT == MVT::f16) {
2915 // Nothing in half is a denormal when promoted to f32.
2916 assert(!Subtarget->has16BitInsts());
2917 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2918 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
2919 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2920 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2921 }
2922
2923 assert(VT == MVT::f32);
2924
2925 if (!needsDenormHandlingF32(DAG, Src, Flags))
2926 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
2927
2928 // bool needs_scaling = x < -0x1.f80000p+6f;
2929 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
2930
2931 // -nextafter(128.0, -1)
2932 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
2933
2934 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2935
2936 SDValue NeedsScaling =
2937 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
2938
2939 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2940 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2941
2942 SDValue AddOffset =
2943 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
2944
2945 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
2946 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
2947
2948 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
2949 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2950 SDValue ResultScale =
2951 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
2952
2953 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
2954}
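
The corresponding C++ sketch for the exp2 side (illustrative; std::exp2 stands in for the hardware instruction): inputs below -126 would produce a denormal result, so they are shifted up by 64 first and the result is scaled back down by 2^-64.

#include <cmath>

static float exp2_with_denorm_scaling(float x) {
  bool scale = x < -0x1.f80000p+6f;          // x < -126
  float e = std::exp2(scale ? x + 64.0f : x);
  return scale ? e * 0x1.0p-64f : e;
}
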
2955
2956 SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
2957 SelectionDAG &DAG,
2958 SDNodeFlags Flags) const {
2959 EVT VT = X.getValueType();
2960 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
2961
2962 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2963 // exp2(M_LOG2E_F * f);
2964 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags);
2965 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2966 : (unsigned)ISD::FEXP2,
2967 SL, VT, Mul, Flags);
2968 }
2969
2970 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2971
2972 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
2973 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2974
2975 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2976
2977 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2978
2979 SDValue AdjustedX =
2980 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2981
2982 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
2983
2984 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
2985
2986 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
2987 SDValue AdjustedResult =
2988 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
2989
2990 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
2991 Flags);
2992}
2993
2994/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
2995/// handled correctly.
2996 SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
2997 SelectionDAG &DAG,
2998 SDNodeFlags Flags) const {
2999 const EVT VT = X.getValueType();
2999 const EVT VT = X.getValueType();
3000 const unsigned Exp2Op = VT == MVT::f32 ? static_cast<unsigned>(AMDGPUISD::EXP)
3001 : static_cast<unsigned>(ISD::FEXP2);
3002
3003 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
3004 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3005 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3006 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3007
3008 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
3009 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3010 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
3011 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3012 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
3013 }
3014
3015 // bool s = x < -0x1.2f7030p+5f;
3016 // x += s ? 0x1.0p+5f : 0.0f;
3017 // exp10 = exp2(x * 0x1.a92000p+1f) *
3018 // exp2(x * 0x1.4f0978p-11f) *
3019 // (s ? 0x1.9f623ep-107f : 1.0f);
3020
3021 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3022
3023 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
3024 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
3025
3026 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
3027 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
3028 SDValue AdjustedX =
3029 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
3030
3031 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3032 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3033
3034 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
3035 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3036 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
3037 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3038
3039 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
3040
3041 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
3042 SDValue AdjustedResult =
3043 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
3044
3045 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
3046 Flags);
3047}
3048
3049 SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
3050 EVT VT = Op.getValueType();
3051 SDLoc SL(Op);
3052 SDValue X = Op.getOperand(0);
3053 SDNodeFlags Flags = Op->getFlags();
3054 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
3055
3056 if (VT.getScalarType() == MVT::f16) {
3057 // v_exp_f16 (fmul x, log2e)
3058 if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
3059 return lowerFEXPUnsafe(X, SL, DAG, Flags);
3060
3061 if (VT.isVector())
3062 return SDValue();
3063
3064 // exp(f16 x) ->
3065 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3066
3067 // Nothing in half is a denormal when promoted to f32.
3068 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
3069 SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags);
3070 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
3071 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
3072 }
3073
3074 assert(VT == MVT::f32);
3075
3076 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3077 // library behavior. Also, is known-not-daz source sufficient?
3078 if (allowApproxFunc(DAG, Flags)) {
3079 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
3080 : lowerFEXPUnsafe(X, SL, DAG, Flags);
3081 }
3082
3083 // Algorithm:
3084 //
3085 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3086 //
3087 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3088 // n = 64*m + j, 0 <= j < 64
3089 //
3090 // e^x = 2^((64*m + j + f)/64)
3091 // = (2^m) * (2^(j/64)) * 2^(f/64)
3092 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3093 //
3094 // f = x*(64/ln(2)) - n
3095 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3096 //
3097 // e^x = (2^m) * (2^(j/64)) * e^r
3098 //
3099 // (2^(j/64)) is precomputed
3100 //
3101 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3102 // e^r = 1 + q
3103 //
3104 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3105 //
3106 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3107 SDNodeFlags FlagsNoContract = Flags;
3108 FlagsNoContract.setAllowContract(false);
3109
3110 SDValue PH, PL;
3111 if (Subtarget->hasFastFMAF32()) {
3112 const float c_exp = numbers::log2ef;
3113 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3114 const float c_exp10 = 0x1.a934f0p+1f;
3115 const float cc_exp10 = 0x1.2f346ep-24f;
3116
3117 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3118 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3119
3120 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3121 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
3122 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3123 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3124 } else {
3125 const float ch_exp = 0x1.714000p+0f;
3126 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3127
3128 const float ch_exp10 = 0x1.a92000p+1f;
3129 const float cl_exp10 = 0x1.4f0978p-11f;
3130
3131 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3132 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3133
3134 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3135 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3136 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3137 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3138 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3139
3140 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3141
3142 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3143 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3144 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3145 }
3146
3147 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3148
3149 // It is unsafe to contract this fsub into the PH multiply.
3150 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3151
3152 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3153 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3154 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3155
3156 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3157
3158 SDValue UnderflowCheckConst =
3159 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
3160
3161 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3162 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3163 SDValue Underflow =
3164 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3165
3166 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3167
3168 if (!Flags.hasNoInfs()) {
3169 SDValue OverflowCheckConst =
3170 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
3171 SDValue Overflow =
3172 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
3173 SDValue Inf =
3174 DAG.getConstantFP(APFloat::getInf(APFloat::IEEEsingle()), SL, VT);
3175 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3176 }
3177
3178 return R;
3179}
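
A compact C++ sketch of the exp path above using the non-FMA constants (illustrative; it keeps the structure but, unlike the real lowering, does not also split x into high/low halves, and it leans on std::exp2/std::ldexp as stand-ins): multiply by log2(e) carried as a two-part constant, peel off the integral part, exponentiate the small residue, and scale with ldexp; the leading checks mirror the underflow/overflow selects at the end.

#include <cmath>

static float exp_via_exp2(float x) {
  if (std::isnan(x))
    return x;
  if (x < -0x1.9d1da0p+6f)                  // certain underflow
    return 0.0f;
  if (x > 0x1.62e430p+6f)                   // certain overflow
    return INFINITY;
  const float c_hi = 0x1.714000p+0f;        // leading bits of log2(e)
  const float c_lo = 0x1.47652ap-12f;       // remaining bits of log2(e)
  float ph = x * c_hi;
  float pl = x * c_lo;
  float e = std::nearbyint(ph);             // integral part (nearest, ties even)
  float a = (ph - e) + pl;                  // small residue; must not contract
  return std::ldexp(std::exp2(a), static_cast<int>(e));   // 2^a * 2^e
}
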
3180
3181static bool isCtlzOpc(unsigned Opc) {
3182 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3183}
3184
3185static bool isCttzOpc(unsigned Opc) {
3186 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3187}
3188
3189 SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
3190 SelectionDAG &DAG) const {
3191 auto SL = SDLoc(Op);
3192 auto Opc = Op.getOpcode();
3193 auto Arg = Op.getOperand(0u);
3194 auto ResultVT = Op.getValueType();
3195
3196 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3197 return {};
3198
3200 assert(ResultVT == Arg.getValueType());
3201
3202 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3203 SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
3204 SDValue NewOp;
3205
3206 if (Opc == ISD::CTLZ_ZERO_UNDEF) {
3207 NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
3208 NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
3209 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3210 } else {
3211 NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3212 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3213 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
3214 }
3215
3216 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3217}
3218
3219 SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
3220 SDLoc SL(Op);
3221 SDValue Src = Op.getOperand(0);
3222
3223 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3224 bool Ctlz = isCtlzOpc(Op.getOpcode());
3225 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3226
3227 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3228 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3229 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3230
3231 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3232 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3233 // (cttz hi:lo) -> (umin (ffbl src), 32)
3234 // (ctlz_zero_undef src) -> (ffbh src)
3235 // (cttz_zero_undef src) -> (ffbl src)
3236
3237 // The 64-bit scalar version produces a 32-bit result
3238 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3239 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3240 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3241 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3242 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3243 if (!ZeroUndef) {
3244 const SDValue ConstVal = DAG.getConstant(
3245 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3246 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3247 }
3248 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3249 }
3250
3251 SDValue Lo, Hi;
3252 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3253
3254 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3255 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3256
3257 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3258 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3259 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3260 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
3261
3262 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3263 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3264 if (Ctlz)
3265 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3266 else
3267 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3268
3269 SDValue NewOpr;
3270 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3271 if (!ZeroUndef) {
3272 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3273 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3274 }
3275
3276 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3277}
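
A C++ sketch of the 64-bit composition above (illustrative; ffbh32 mimics the FFBH_U32 node by returning all-ones for a zero input, and uaddsat32 plays the role of the saturating add):

#include <algorithm>
#include <cstdint>

static uint32_t ffbh32(uint32_t v) {
  if (v == 0)
    return ~0u;                              // like v_ffbh_u32 on zero
  uint32_t n = 0;
  for (uint32_t bit = 1u << 31; (v & bit) == 0; bit >>= 1)
    ++n;
  return n;
}

static uint32_t uaddsat32(uint32_t a, uint32_t b) {
  uint32_t s = a + b;
  return s < a ? ~0u : s;                    // saturate instead of wrapping
}

// (ctlz hi:lo) -> umin3(ffbh(hi), uaddsat(ffbh(lo), 32), 64)
static uint64_t ctlz64(uint64_t v) {
  uint32_t lo = static_cast<uint32_t>(v);
  uint32_t hi = static_cast<uint32_t>(v >> 32);
  return std::min(std::min(ffbh32(hi), uaddsat32(ffbh32(lo), 32u)), 64u);
}
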
3278
3279 SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
3280 bool Signed) const {
3281 // The regular method converting a 64-bit integer to float roughly consists of
3282 // 2 steps: normalization and rounding. In fact, after normalization, the
3283 // conversion from a 64-bit integer to a float is essentially the same as the
3284 // one from a 32-bit integer. The only difference is that it has more
3285 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3286 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3287 // converted into the correct float number. The basic steps for the unsigned
3288 // conversion are illustrated in the following pseudo code:
3289 //
3290 // f32 uitofp(i64 u) {
3291 // i32 hi, lo = split(u);
3292 // // Only count the leading zeros in hi as we have native support of the
3293 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3294 // // reduced to a 32-bit one automatically.
3295 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3296 // u <<= shamt;
3297 // hi, lo = split(u);
3298 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3299 // // convert it as a 32-bit integer and scale the result back.
3300 // return uitofp(hi) * 2^(32 - shamt);
3301 // }
3302 //
3303 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3304 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
3305 // converted instead followed by negation based its sign bit.
3306
3307 SDLoc SL(Op);
3308 SDValue Src = Op.getOperand(0);
3309
3310 SDValue Lo, Hi;
3311 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3312 SDValue Sign;
3313 SDValue ShAmt;
3314 if (Signed && Subtarget->isGCN()) {
3315 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3316 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3317 // account. That is, the maximal shift is
3318 // - 32 if Lo and Hi have opposite signs;
3319 // - 33 if Lo and Hi have the same sign.
3320 //
3321 // Or, MaxShAmt = 33 + OppositeSign, where
3322 //
3323 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3324 // - -1 if Lo and Hi have opposite signs; and
3325 // - 0 otherwise.
3326 //
3327 // All in all, ShAmt is calculated as
3328 //
3329 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3330 //
3331 // or
3332 //
3333 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3334 //
3335 // to reduce the critical path.
3336 SDValue OppositeSign = DAG.getNode(
3337 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3338 DAG.getConstant(31, SL, MVT::i32));
3339 SDValue MaxShAmt =
3340 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3341 OppositeSign);
3342 // Count the leading sign bits.
3343 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3344 // Unlike the unsigned conversion, the shift should be one bit less to
3345 // preserve the sign bit.
3346 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3347 DAG.getConstant(1, SL, MVT::i32));
3348 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3349 } else {
3350 if (Signed) {
3351 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3352 // absolute value first.
3353 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3354 DAG.getConstant(63, SL, MVT::i64));
3355 SDValue Abs =
3356 DAG.getNode(ISD::XOR, SL, MVT::i64,
3357 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3358 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3359 }
3360 // Count the leading zeros.
3361 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3362 // The shift amount for signed integers is [0, 32].
3363 }
3364 // Normalize the given 64-bit integer.
3365 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3366 // Split it again.
3367 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3368 // Calculate the adjust bit for rounding.
3369 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3370 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3371 DAG.getConstant(1, SL, MVT::i32), Lo);
3372 // Get the 32-bit normalized integer.
3373 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3374 // Convert the normalized 32-bit integer into f32.
3375 unsigned Opc =
3376 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3377 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3378
3379 // Finally, the converted floating-point value needs to be scaled back, since
3380 // the original 64-bit integer was converted as a 32-bit one.
3381 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3382 ShAmt);
3383 // On GCN, use LDEXP directly.
3384 if (Subtarget->isGCN())
3385 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3386
3387 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3388 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3389 // exponent is enough to avoid overflowing into the sign bit.
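// For illustration (not from the original source): with FVal = 1.0f
// (bits 0x3f800000) and ShAmt = 8, adding 8 << 23 = 0x04000000 to the bits
// yields 0x43800000 == 256.0f, i.e. 1.0f * 2^8, which is the ldexp result
// for the non-overflowing cases handled here.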
3390 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3391 DAG.getConstant(23, SL, MVT::i32));
3392 SDValue IVal =
3393 DAG.getNode(ISD::ADD, SL, MVT::i32,
3394 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3395 if (Signed) {
3396 // Set the sign bit.
3397 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3398 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3399 DAG.getConstant(31, SL, MVT::i32));
3400 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3401 }
3402 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3403}
3404
3405SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3406 bool Signed) const {
3407 SDLoc SL(Op);
3408 SDValue Src = Op.getOperand(0);
3409
3410 SDValue Lo, Hi;
3411 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3412
3413 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3414 SL, MVT::f64, Hi);
3415
3416 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3417
3418 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3419 DAG.getConstant(32, SL, MVT::i32));
3420 // TODO: Should this propagate fast-math-flags?
3421 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3422}
3423
3424SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3425 SelectionDAG &DAG) const {
3426 // TODO: Factor out code common with LowerSINT_TO_FP.
3427 EVT DestVT = Op.getValueType();
3428 SDValue Src = Op.getOperand(0);
3429 EVT SrcVT = Src.getValueType();
3430
3431 if (SrcVT == MVT::i16) {
3432 if (DestVT == MVT::f16)
3433 return Op;
3434 SDLoc DL(Op);
3435
3436 // Promote src to i32
3437 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3438 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3439 }
3440
3441 if (DestVT == MVT::bf16) {
3442 SDLoc SL(Op);
3443 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3444 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3445 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3446 }
3447
3448 if (SrcVT != MVT::i64)
3449 return Op;
3450
3451 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3452 SDLoc DL(Op);
3453
3454 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3455 SDValue FPRoundFlag =
3456 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3457 SDValue FPRound =
3458 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3459
3460 return FPRound;
3461 }
3462
3463 if (DestVT == MVT::f32)
3464 return LowerINT_TO_FP32(Op, DAG, false);
3465
3466 assert(DestVT == MVT::f64);
3467 return LowerINT_TO_FP64(Op, DAG, false);
3468}
3469
3470SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3471 SelectionDAG &DAG) const {
3472 EVT DestVT = Op.getValueType();
3473
3474 SDValue Src = Op.getOperand(0);
3475 EVT SrcVT = Src.getValueType();
3476
3477 if (SrcVT == MVT::i16) {
3478 if (DestVT == MVT::f16)
3479 return Op;
3480
3481 SDLoc DL(Op);
3482 // Promote src to i32
3483 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3484 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3485 }
3486
3487 if (DestVT == MVT::bf16) {
3488 SDLoc SL(Op);
3489 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3490 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3491 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3492 }
3493
3494 if (SrcVT != MVT::i64)
3495 return Op;
3496
3497 // TODO: Factor out code common with LowerUINT_TO_FP.
3498
3499 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3500 SDLoc DL(Op);
3501 SDValue Src = Op.getOperand(0);
3502
3503 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3504 SDValue FPRoundFlag =
3505 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3506 SDValue FPRound =
3507 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3508
3509 return FPRound;
3510 }
3511
3512 if (DestVT == MVT::f32)
3513 return LowerINT_TO_FP32(Op, DAG, true);
3514
3515 assert(DestVT == MVT::f64);
3516 return LowerINT_TO_FP64(Op, DAG, true);
3517}
3518
3519SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3520 bool Signed) const {
3521 SDLoc SL(Op);
3522
3523 SDValue Src = Op.getOperand(0);
3524 EVT SrcVT = Src.getValueType();
3525
3526 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3527
3528 // The basic idea of converting a floating point number into a pair of 32-bit
3529 // integers is illustrated as follows:
3530 //
3531 // tf := trunc(val);
3532 // hif := floor(tf * 2^-32);
3533 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3534 // hi := fptoi(hif);
3535 // lo := fptoi(lof);
3536 //
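// Worked example (illustrative, not part of the original comment) for an
// f64 input val = 4294967301.0 (2^32 + 5):
//   tf  = 4294967301.0
//   hif = floor(tf * 2^-32) = 1.0
//   lof = tf - hif * 2^32   = 5.0
//   hi  = 1, lo = 5  ->  result = 0x0000'0001'0000'0005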
3537 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3538 SDValue Sign;
3539 if (Signed && SrcVT == MVT::f32) {
3540 // However, a 32-bit floating-point number has only a 23-bit mantissa,
3541 // which is not enough to hold all the significant bits of `lof` if val is
3542 // negative. To avoid the loss of precision, we take the absolute value
3543 // after truncating and flip the result back based on the original
3544 // sign.
3545 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3546 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3547 DAG.getConstant(31, SL, MVT::i32));
3548 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3549 }
3550
3551 SDValue K0, K1;
3552 if (SrcVT == MVT::f64) {
3553 K0 = DAG.getConstantFP(
3554 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3555 SrcVT);
3556 K1 = DAG.getConstantFP(
3557 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3558 SrcVT);
3559 } else {
3560 K0 = DAG.getConstantFP(
3561 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3562 K1 = DAG.getConstantFP(
3563 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3564 }
3565 // TODO: Should this propagate fast-math-flags?
3566 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3567
3568 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3569
3570 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3571
3572 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3573 : ISD::FP_TO_UINT,
3574 SL, MVT::i32, FloorMul);
3575 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3576
3577 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3578 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3579
3580 if (Signed && SrcVT == MVT::f32) {
3581 assert(Sign);
3582 // Flip the result based on the signedness, which is either all 0s or 1s.
3583 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3584 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3585 // r := xor(r, sign) - sign;
3586 Result =
3587 DAG.getNode(ISD::SUB, SL, MVT::i64,
3588 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3589 }
3590
3591 return Result;
3592}
3593
3594SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3595 SDLoc DL(Op);
3596 SDValue N0 = Op.getOperand(0);
3597
3598 // Convert to target node to get known bits
3599 if (N0.getValueType() == MVT::f32)
3600 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3601
3602 if (Op->getFlags().hasApproximateFuncs()) {
3603 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3604 return SDValue();
3605 }
3606
3607 return LowerF64ToF16Safe(N0, DL, DAG);
3608}
3609
3610// return node in i32
3611SDValue AMDGPUTargetLowering::LowerF64ToF16Safe(SDValue Src, const SDLoc &DL,
3612 SelectionDAG &DAG) const {
3613 assert(Src.getSimpleValueType() == MVT::f64);
3614
3615 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3616 // TODO: We can generate better code for True16.
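// Illustrative trace, not from the original source, for Src = 1.5
// (f64 bits 0x3ff8000000000000): UH = 0x3ff80000, so E = 1023 - 1023 + 15 = 15,
// M = 0x800 and N = M | (E << 12) = 0xf800. The rounding steps below leave
// V = 0x3e00 and the sign bit is 0, so the returned i32 holds 0x3e00, the
// f16 encoding of 1.5.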
3617 const unsigned ExpMask = 0x7ff;
3618 const unsigned ExpBiasf64 = 1023;
3619 const unsigned ExpBiasf16 = 15;
3620 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3621 SDValue One = DAG.getConstant(1, DL, MVT::i32);
3622 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Src);
3623 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3624 DAG.getConstant(32, DL, MVT::i64));
3625 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3626 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3627 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3628 DAG.getConstant(20, DL, MVT::i64));
3629 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3630 DAG.getConstant(ExpMask, DL, MVT::i32));
3631 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3632 // add the f16 bias (15) to get the biased exponent for the f16 format.
3633 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3634 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
3635
3636 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3637 DAG.getConstant(8, DL, MVT::i32));
3638 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3639 DAG.getConstant(0xffe, DL, MVT::i32));
3640
3641 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3642 DAG.getConstant(0x1ff, DL, MVT::i32));
3643 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3644
3645 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3646 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3647
3648 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3649 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3650 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3651 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
3652
3653 // N = M | (E << 12);
3654 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3655 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3656 DAG.getConstant(12, DL, MVT::i32)));
3657
3658 // B = clamp(1-E, 0, 13);
3659 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3660 One, E);
3661 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3662 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3663 DAG.getConstant(13, DL, MVT::i32));
3664
3665 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3666 DAG.getConstant(0x1000, DL, MVT::i32));
3667
3668 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3669 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3670 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3671 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3672
3673 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3674 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3675 DAG.getConstant(0x7, DL, MVT::i32));
3676 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3677 DAG.getConstant(2, DL, MVT::i32));
3678 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3679 One, Zero, ISD::SETEQ);
3680 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3681 One, Zero, ISD::SETGT);
3682 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3683 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3684
3685 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3686 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3687 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3688 I, V, ISD::SETEQ);
3689
3690 // Extract the sign bit.
3691 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3692 DAG.getConstant(16, DL, MVT::i32));
3693 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3694 DAG.getConstant(0x8000, DL, MVT::i32));
3695
3696 return DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3697}
3698
3699SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
3700 SelectionDAG &DAG) const {
3701 SDValue Src = Op.getOperand(0);
3702 unsigned OpOpcode = Op.getOpcode();
3703 EVT SrcVT = Src.getValueType();
3704 EVT DestVT = Op.getValueType();
3705
3706 // Will be selected natively
3707 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3708 return Op;
3709
3710 if (SrcVT == MVT::bf16) {
3711 SDLoc DL(Op);
3712 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3713 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
3714 }
3715
3716 // Promote i16 to i32
3717 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3718 SDLoc DL(Op);
3719
3720 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3721 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3722 }
3723
3724 if (DestVT != MVT::i64)
3725 return Op;
3726
3727 if (SrcVT == MVT::f16 ||
3728 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3729 SDLoc DL(Op);
3730
3731 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3732 unsigned Ext =
3733 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3734 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3735 }
3736
3737 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3738 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3739
3740 return SDValue();
3741}
3742
3743SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3744 SelectionDAG &DAG) const {
3745 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3746 MVT VT = Op.getSimpleValueType();
3747 MVT ScalarVT = VT.getScalarType();
3748
3749 assert(VT.isVector());
3750
3751 SDValue Src = Op.getOperand(0);
3752 SDLoc DL(Op);
3753
3754 // TODO: Don't scalarize on Evergreen?
3755 unsigned NElts = VT.getVectorNumElements();
3756 SmallVector<SDValue, 8> Args;
3757 DAG.ExtractVectorElements(Src, Args, 0, NElts);
3758
3759 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3760 for (unsigned I = 0; I < NElts; ++I)
3761 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
3762
3763 return DAG.getBuildVector(VT, DL, Args);
3764}
3765
3766//===----------------------------------------------------------------------===//
3767// Custom DAG optimizations
3768//===----------------------------------------------------------------------===//
3769
3770static bool isU24(SDValue Op, SelectionDAG &DAG) {
3771 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3772}
3773
3774static bool isI24(SDValue Op, SelectionDAG &DAG) {
3775 EVT VT = Op.getValueType();
3776 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3777 // as unsigned 24-bit values.
3778 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3779}
3780
3781SDValue AMDGPUTargetLowering::simplifyMul24(SDNode *Node24,
3782 TargetLowering::DAGCombinerInfo &DCI) const {
3783 SelectionDAG &DAG = DCI.DAG;
3784 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3785 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3786
3787 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
3788 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
3789 unsigned NewOpcode = Node24->getOpcode();
3790 if (IsIntrin) {
3791 unsigned IID = Node24->getConstantOperandVal(0);
3792 switch (IID) {
3793 case Intrinsic::amdgcn_mul_i24:
3794 NewOpcode = AMDGPUISD::MUL_I24;
3795 break;
3796 case Intrinsic::amdgcn_mul_u24:
3797 NewOpcode = AMDGPUISD::MUL_U24;
3798 break;
3799 case Intrinsic::amdgcn_mulhi_i24:
3800 NewOpcode = AMDGPUISD::MULHI_I24;
3801 break;
3802 case Intrinsic::amdgcn_mulhi_u24:
3803 NewOpcode = AMDGPUISD::MULHI_U24;
3804 break;
3805 default:
3806 llvm_unreachable("Expected 24-bit mul intrinsic");
3807 }
3808 }
3809
3810 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
3811
3812 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3813 // the operands to have other uses, but will only perform simplifications that
3814 // involve bypassing some nodes for this user.
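// For example (illustrative, not from the original source): in
// (mul_u24 (and x, 0xffffff), y) only the low 24 bits of the first operand
// are demanded, so the AND can be bypassed and the node rebuilt as
// (mul_u24 x, y) even while the AND keeps its other uses.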
3815 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
3816 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
3817 if (DemandedLHS || DemandedRHS)
3818 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
3819 DemandedLHS ? DemandedLHS : LHS,
3820 DemandedRHS ? DemandedRHS : RHS);
3821
3822 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3823 // operands if this node is the only user.
3824 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
3825 return SDValue(Node24, 0);
3826 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
3827 return SDValue(Node24, 0);
3828
3829 return SDValue();
3830}
3831
3832template <typename IntTy>
3833static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3834 uint32_t Width, const SDLoc &DL) {
3835 if (Width + Offset < 32) {
3836 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3837 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3838 if constexpr (std::is_signed_v<IntTy>) {
3839 return DAG.getSignedConstant(Result, DL, MVT::i32);
3840 } else {
3841 return DAG.getConstant(Result, DL, MVT::i32);
3842 }
3843 }
3844
3845 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3846}
3847
3848static bool hasVolatileUser(SDNode *Val) {
3849 for (SDNode *U : Val->users()) {
3850 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
3851 if (M->isVolatile())
3852 return true;
3853 }
3854 }
3855
3856 return false;
3857}
3858
3860 // i32 vectors are the canonical memory type.
3861 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3862 return false;
3863
3864 if (!VT.isByteSized())
3865 return false;
3866
3867 unsigned Size = VT.getStoreSize();
3868
3869 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3870 return false;
3871
3872 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3873 return false;
3874
3875 return true;
3876}
3877
3878// Replace load of an illegal type with a bitcast from a load of a friendlier
3879// type.
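// For example (illustrative): via getEquivalentMemType, an i128 load becomes
// a v4i32 load followed by a bitcast back to i128, and a v2i8 load becomes
// an i16 load plus a bitcast.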
3880SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3881 DAGCombinerInfo &DCI) const {
3882 if (!DCI.isBeforeLegalize())
3883 return SDValue();
3884
3885 LoadSDNode *LN = cast<LoadSDNode>(N);
3886 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
3887 return SDValue();
3888
3889 SDLoc SL(N);
3890 SelectionDAG &DAG = DCI.DAG;
3891 EVT VT = LN->getMemoryVT();
3892
3893 unsigned Size = VT.getStoreSize();
3894 Align Alignment = LN->getAlign();
3895 if (Alignment < Size && isTypeLegal(VT)) {
3896 unsigned IsFast;
3897 unsigned AS = LN->getAddressSpace();
3898
3899 // Expand unaligned loads earlier than legalization. Due to visitation order
3900 // problems during legalization, the emitted instructions to pack and unpack
3901 // the bytes again are not eliminated in the case of an unaligned copy.
3902 if (!allowsMisalignedMemoryAccesses(
3903 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3904 if (VT.isVector())
3905 return SplitVectorLoad(SDValue(LN, 0), DAG);
3906
3907 SDValue Ops[2];
3908 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3909
3910 return DAG.getMergeValues(Ops, SDLoc(N));
3911 }
3912
3913 if (!IsFast)
3914 return SDValue();
3915 }
3916
3917 if (!shouldCombineMemoryType(VT))
3918 return SDValue();
3919
3920 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3921
3922 SDValue NewLoad
3923 = DAG.getLoad(NewVT, SL, LN->getChain(),
3924 LN->getBasePtr(), LN->getMemOperand());
3925
3926 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3927 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3928 return SDValue(N, 0);
3929}
3930
3931// Replace store of an illegal type with a store of a bitcast to a friendlier
3932// type.
3933SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3934 DAGCombinerInfo &DCI) const {
3935 if (!DCI.isBeforeLegalize())
3936 return SDValue();
3937
3938 StoreSDNode *SN = cast<StoreSDNode>(N);
3939 if (!SN->isSimple() || !ISD::isNormalStore(SN))
3940 return SDValue();
3941
3942 EVT VT = SN->getMemoryVT();
3943 unsigned Size = VT.getStoreSize();
3944
3945 SDLoc SL(N);
3946 SelectionDAG &DAG = DCI.DAG;
3947 Align Alignment = SN->getAlign();
3948 if (Alignment < Size && isTypeLegal(VT)) {
3949 unsigned IsFast;
3950 unsigned AS = SN->getAddressSpace();
3951
3952 // Expand unaligned stores earlier than legalization. Due to visitation
3953 // order problems during legalization, the emitted instructions to pack and
3954 // unpack the bytes again are not eliminated in the case of an unaligned
3955 // copy.
3956 if (!allowsMisalignedMemoryAccesses(
3957 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3958 if (VT.isVector())
3959 return SplitVectorStore(SDValue(SN, 0), DAG);
3960
3961 return expandUnalignedStore(SN, DAG);
3962 }
3963
3964 if (!IsFast)
3965 return SDValue();
3966 }
3967
3968 if (!shouldCombineMemoryType(VT))
3969 return SDValue();
3970
3971 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3972 SDValue Val = SN->getValue();
3973
3974 //DCI.AddToWorklist(Val.getNode());
3975
3976 bool OtherUses = !Val.hasOneUse();
3977 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3978 if (OtherUses) {
3979 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3980 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3981 }
3982
3983 return DAG.getStore(SN->getChain(), SL, CastVal,
3984 SN->getBasePtr(), SN->getMemOperand());
3985}
3986
3987// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3988// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3989// issues.
3990SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3991 DAGCombinerInfo &DCI) const {
3992 SelectionDAG &DAG = DCI.DAG;
3993 SDValue N0 = N->getOperand(0);
3994
3995 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3996 // (vt2 (truncate (assertzext vt0:x, vt1)))
3997 if (N0.getOpcode() == ISD::TRUNCATE) {
3998 SDValue N1 = N->getOperand(1);
3999 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
4000 SDLoc SL(N);
4001
4002 SDValue Src = N0.getOperand(0);
4003 EVT SrcVT = Src.getValueType();
4004 if (SrcVT.bitsGE(ExtVT)) {
4005 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
4006 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
4007 }
4008 }
4009
4010 return SDValue();
4011}
4012
4013SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
4014 SDNode *N, DAGCombinerInfo &DCI) const {
4015 unsigned IID = N->getConstantOperandVal(0);
4016 switch (IID) {
4017 case Intrinsic::amdgcn_mul_i24:
4018 case Intrinsic::amdgcn_mul_u24:
4019 case Intrinsic::amdgcn_mulhi_i24:
4020 case Intrinsic::amdgcn_mulhi_u24:
4021 return simplifyMul24(N, DCI);
4022 case Intrinsic::amdgcn_fract:
4023 case Intrinsic::amdgcn_rsq:
4024 case Intrinsic::amdgcn_rcp_legacy:
4025 case Intrinsic::amdgcn_rsq_legacy:
4026 case Intrinsic::amdgcn_rsq_clamp:
4027 case Intrinsic::amdgcn_tanh:
4028 case Intrinsic::amdgcn_prng_b32: {
4029 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
4030 SDValue Src = N->getOperand(1);
4031 return Src.isUndef() ? Src : SDValue();
4032 }
4033 case Intrinsic::amdgcn_frexp_exp: {
4034 // frexp_exp (fneg x) -> frexp_exp x
4035 // frexp_exp (fabs x) -> frexp_exp x
4036 // frexp_exp (fneg (fabs x)) -> frexp_exp x
4037 SDValue Src = N->getOperand(1);
4038 SDValue PeekSign = peekFPSignOps(Src);
4039 if (PeekSign == Src)
4040 return SDValue();
4041 return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
4042 0);
4043 }
4044 default:
4045 return SDValue();
4046 }
4047}
4048
4049/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
4050/// binary operation \p Opc to it with the corresponding constant operands.
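/// For illustration (not part of the original comment): splitting
/// (and i64:x, 0x0000ffff00000000) produces
/// (and lo_32(x), 0x00000000) and (and hi_32(x), 0x0000ffff), which are then
/// recombined with a build_vector and a bitcast back to i64.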
4051SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
4052 DAGCombinerInfo &DCI, const SDLoc &SL,
4053 unsigned Opc, SDValue LHS,
4054 uint32_t ValLo, uint32_t ValHi) const {
4055 SelectionDAG &DAG = DCI.DAG;
4056 SDValue Lo, Hi;
4057 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
4058
4059 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
4060 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
4061
4062 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
4063 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
4064
4065 // Re-visit the ands. It's possible we eliminated one of them and it could
4066 // simplify the vector.
4067 DCI.AddToWorklist(Lo.getNode());
4068 DCI.AddToWorklist(Hi.getNode());
4069
4070 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
4071 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4072}
4073
4074SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
4075 DAGCombinerInfo &DCI) const {
4076 EVT VT = N->getValueType(0);
4077 SDValue LHS = N->getOperand(0);
4078 SDValue RHS = N->getOperand(1);
4079 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4080 SDLoc SL(N);
4081 SelectionDAG &DAG = DCI.DAG;
4082
4083 unsigned RHSVal;
4084 if (CRHS) {
4085 RHSVal = CRHS->getZExtValue();
4086 if (!RHSVal)
4087 return LHS;
4088
4089 switch (LHS->getOpcode()) {
4090 default:
4091 break;
4092 case ISD::ZERO_EXTEND:
4093 case ISD::SIGN_EXTEND:
4094 case ISD::ANY_EXTEND: {
4095 SDValue X = LHS->getOperand(0);
4096
4097 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
4098 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
4099 // Prefer build_vector as the canonical form if packed types are legal.
4100 // (shl ([asz]ext i16:x), 16) -> (build_vector 0, x)
4101 SDValue Vec = DAG.getBuildVector(
4102 MVT::v2i16, SL,
4103 {DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0)});
4104 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
4105 }
4106
4107 // shl (ext x) => zext (shl x), if shift does not overflow int
4108 if (VT != MVT::i64)
4109 break;
4110 KnownBits Known = DAG.computeKnownBits(X);
4111 unsigned LZ = Known.countMinLeadingZeros();
4112 if (LZ < RHSVal)
4113 break;
4114 EVT XVT = X.getValueType();
4115 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(CRHS, 0));
4116 return DAG.getZExtOrTrunc(Shl, SL, VT);
4117 }
4118 }
4119 }
4120
4121 if (VT.getScalarType() != MVT::i64)
4122 return SDValue();
4123
4124 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4125 // common case, splitting this into a move and a 32-bit shift is faster and
4126 // the same code size.
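// e.g. (illustrative, mirroring the srl/sra comments below): for C >= 32,
// i64 (shl x, C) -> (build_pair 0, (shl lo_32(x), C - 32))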
4127 KnownBits Known = DAG.computeKnownBits(RHS);
4128
4129 EVT ElementType = VT.getScalarType();
4130 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4131 EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
4132 : TargetScalarType;
4133
4134 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4135 return SDValue();
4136 SDValue ShiftAmt;
4137
4138 if (CRHS) {
4139 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4140 TargetType);
4141 } else {
4142 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4143 const SDValue ShiftMask =
4144 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4145 // This AND instruction will clamp out of bounds shift values.
4146 // It will also be removed during later instruction selection.
4147 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4148 }
4149
4150 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, LHS);
4151 SDValue NewShift =
4152 DAG.getNode(ISD::SHL, SL, TargetType, Lo, ShiftAmt, N->getFlags());
4153
4154 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4155 SDValue Vec;
4156
4157 if (VT.isVector()) {
4158 EVT ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4159 unsigned NElts = TargetType.getVectorNumElements();
4160 SmallVector<SDValue, 8> HiOps;
4161 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4162
4163 DAG.ExtractVectorElements(NewShift, HiOps, 0, NElts);
4164 for (unsigned I = 0; I != NElts; ++I)
4165 HiAndLoOps[2 * I + 1] = HiOps[I];
4166 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4167 } else {
4168 EVT ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4169 Vec = DAG.getBuildVector(ConcatType, SL, {Zero, NewShift});
4170 }
4171 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4172}
4173
4174SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
4175 DAGCombinerInfo &DCI) const {
4176 SDValue RHS = N->getOperand(1);
4177 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4178 EVT VT = N->getValueType(0);
4179 SDValue LHS = N->getOperand(0);
4180 SelectionDAG &DAG = DCI.DAG;
4181 SDLoc SL(N);
4182
4183 if (VT.getScalarType() != MVT::i64)
4184 return SDValue();
4185
4186 // For C >= 32
4187 // i64 (sra x, C) -> (build_pair (sra hi_32(x), C - 32), (sra hi_32(x), 31))
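// For example (illustrative): (sra i64:x, 63) becomes
// (build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)), i.e. both halves end
// up holding the sign of x.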
4188
4189 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4190 // common case, splitting this into a move and a 32-bit shift is faster and
4191 // the same code size.
4192 KnownBits Known = DAG.computeKnownBits(RHS);
4193
4194 EVT ElementType = VT.getScalarType();
4195 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4196 EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
4197 : TargetScalarType;
4198
4199 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4200 return SDValue();
4201
4202 SDValue ShiftFullAmt =
4203 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4204 SDValue ShiftAmt;
4205 if (CRHS) {
4206 unsigned RHSVal = CRHS->getZExtValue();
4207 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4208 TargetType);
4209 } else if (Known.getMinValue().getZExtValue() ==
4210 (ElementType.getSizeInBits() - 1)) {
4211 ShiftAmt = ShiftFullAmt;
4212 } else {
4213 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4214 const SDValue ShiftMask =
4215 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4216 // This AND instruction will clamp out of bounds shift values.
4217 // It will also be removed during later instruction selection.
4218 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4219 }
4220
4221 EVT ConcatType;
4222 SDValue Hi;
4223 SDLoc LHSSL(LHS);
4224 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4225 if (VT.isVector()) {
4226 unsigned NElts = TargetType.getVectorNumElements();
4227 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4228 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4229 SmallVector<SDValue, 8> HiOps(NElts);
4230 SmallVector<SDValue, 16> HiAndLoOps;
4231
4232 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, 0, NElts * 2);
4233 for (unsigned I = 0; I != NElts; ++I) {
4234 HiOps[I] = HiAndLoOps[2 * I + 1];
4235 }
4236 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
4237 } else {
4238 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
4239 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4240 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4241 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
4242 }
4243
4244 KnownBits KnownLHS = DAG.computeKnownBits(LHS);
4245 SDValue HiShift;
4246 if (KnownLHS.isNegative()) {
4247 HiShift = DAG.getAllOnesConstant(SL, TargetType);
4248 } else {
4249 Hi = DAG.getFreeze(Hi);
4250 HiShift = DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftFullAmt);
4251 }
4252 SDValue NewShift =
4253 DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4254
4255 SDValue Vec;
4256 if (VT.isVector()) {
4257 unsigned NElts = TargetType.getVectorNumElements();
4258 SmallVector<SDValue, 8> HiOps;
4259 SmallVector<SDValue, 8> LoOps;
4260 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2);
4261
4262 DAG.ExtractVectorElements(HiShift, HiOps, 0, NElts);
4263 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
4264 for (unsigned I = 0; I != NElts; ++I) {
4265 HiAndLoOps[2 * I + 1] = HiOps[I];
4266 HiAndLoOps[2 * I] = LoOps[I];
4267 }
4268 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4269 } else {
4270 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, HiShift});
4271 }
4272 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4273}
4274
4275SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
4276 DAGCombinerInfo &DCI) const {
4277 SDValue RHS = N->getOperand(1);
4278 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4279 EVT VT = N->getValueType(0);
4280 SDValue LHS = N->getOperand(0);
4281 SelectionDAG &DAG = DCI.DAG;
4282 SDLoc SL(N);
4283 unsigned RHSVal;
4284
4285 if (CRHS) {
4286 RHSVal = CRHS->getZExtValue();
4287
4288 // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
4289 // this improves the ability to match BFE patterns in isel.
4290 if (LHS.getOpcode() == ISD::AND) {
4291 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
4292 unsigned MaskIdx, MaskLen;
4293 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4294 MaskIdx == RHSVal) {
4295 return DAG.getNode(ISD::AND, SL, VT,
4296 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0),
4297 N->getOperand(1)),
4298 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1),
4299 N->getOperand(1)));
4300 }
4301 }
4302 }
4303 }
4304
4305 if (VT.getScalarType() != MVT::i64)
4306 return SDValue();
4307
4308 // for C >= 32
4309 // i64 (srl x, C) -> (build_pair (srl hi_32(x), C - 32), 0)
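// For example (illustrative): (srl i64:x, 40) becomes
// (build_pair (srl hi_32(x), 8), 0), so only a 32-bit shift of the high half
// is required.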
4310
4311 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4312 // common case, splitting this into a move and a 32-bit shift is faster and
4313 // the same code size.
4314 KnownBits Known = DAG.computeKnownBits(RHS);
4315
4316 EVT ElementType = VT.getScalarType();
4317 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4318 EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
4319 : TargetScalarType;
4320
4321 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4322 return SDValue();
4323
4324 SDValue ShiftAmt;
4325 if (CRHS) {
4326 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4327 TargetType);
4328 } else {
4329 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4330 const SDValue ShiftMask =
4331 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4332 // This AND instruction will clamp out of bounds shift values.
4333 // It will also be removed during later instruction selection.
4334 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4335 }
4336
4337 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4338 EVT ConcatType;
4339 SDValue Hi;
4340 SDLoc LHSSL(LHS);
4341 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4342 if (VT.isVector()) {
4343 unsigned NElts = TargetType.getVectorNumElements();
4344 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4345 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4346 SmallVector<SDValue, 8> HiOps(NElts);
4347 SmallVector<SDValue, 16> HiAndLoOps;
4348
4349 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, /*Start=*/0, NElts * 2);
4350 for (unsigned I = 0; I != NElts; ++I)
4351 HiOps[I] = HiAndLoOps[2 * I + 1];
4352 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
4353 } else {
4354 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
4355 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4356 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4357 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
4358 }
4359
4360 SDValue NewShift =
4361 DAG.getNode(ISD::SRL, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4362
4363 SDValue Vec;
4364 if (VT.isVector()) {
4365 unsigned NElts = TargetType.getVectorNumElements();
4366 SmallVector<SDValue, 8> LoOps;
4367 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4368
4369 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
4370 for (unsigned I = 0; I != NElts; ++I)
4371 HiAndLoOps[2 * I] = LoOps[I];
4372 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4373 } else {
4374 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, Zero});
4375 }
4376 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4377}
4378
4379SDValue AMDGPUTargetLowering::performTruncateCombine(
4380 SDNode *N, DAGCombinerInfo &DCI) const {
4381 SDLoc SL(N);
4382 SelectionDAG &DAG = DCI.DAG;
4383 EVT VT = N->getValueType(0);
4384 SDValue Src = N->getOperand(0);
4385
4386 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4387 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4388 SDValue Vec = Src.getOperand(0);
4389 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4390 SDValue Elt0 = Vec.getOperand(0);
4391 EVT EltVT = Elt0.getValueType();
4392 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4393 if (EltVT.isFloatingPoint()) {
4394 Elt0 = DAG.getNode(ISD::BITCAST, SL,
4395 EltVT.changeTypeToInteger(), Elt0);
4396 }
4397
4398 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
4399 }
4400 }
4401 }
4402
4403 // Equivalent of above for accessing the high element of a vector as an
4404 // integer operation.
4405 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
4406 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4407 if (auto *K = isConstOrConstSplat(Src.getOperand(1))) {
4408 SDValue BV = stripBitcast(Src.getOperand(0));
4409 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
4410 EVT SrcEltVT = BV.getOperand(0).getValueType();
4411 unsigned SrcEltSize = SrcEltVT.getSizeInBits();
4412 unsigned BitIndex = K->getZExtValue();
4413 unsigned PartIndex = BitIndex / SrcEltSize;
4414
4415 if (PartIndex * SrcEltSize == BitIndex &&
4416 PartIndex < BV.getNumOperands()) {
4417 if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
4418 SDValue SrcElt =
4419 DAG.getNode(ISD::BITCAST, SL, SrcEltVT.changeTypeToInteger(),
4420 BV.getOperand(PartIndex));
4421 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
4422 }
4423 }
4424 }
4425 }
4426 }
4427
4428 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4429 //
4430 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4431 // i16 (trunc (srl (i32 (trunc x), K)))
4432 if (VT.getScalarSizeInBits() < 32) {
4433 EVT SrcVT = Src.getValueType();
4434 if (SrcVT.getScalarSizeInBits() > 32 &&
4435 (Src.getOpcode() == ISD::SRL ||
4436 Src.getOpcode() == ISD::SRA ||
4437 Src.getOpcode() == ISD::SHL)) {
4438 SDValue Amt = Src.getOperand(1);
4439 KnownBits Known = DAG.computeKnownBits(Amt);
4440
4441 // - For left shifts, do the transform as long as the shift
4442 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4443 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4444 // losing information stored in the high bits when truncating.
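// For example (illustrative): i16 (trunc (srl i64:x, 8)) can be shrunk to
// i16 (trunc (srl (i32 (trunc x)), 8)), since 8 <= 32 - 16 and the result
// only needs bits [8, 24) of x.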
4445 const unsigned MaxCstSize =
4446 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4447 if (Known.getMaxValue().ule(MaxCstSize)) {
4448 EVT MidVT = VT.isVector() ?
4449 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4450 VT.getVectorNumElements()) : MVT::i32;
4451
4452 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
4453 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
4454 Src.getOperand(0));
4455 DCI.AddToWorklist(Trunc.getNode());
4456
4457 if (Amt.getValueType() != NewShiftVT) {
4458 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
4459 DCI.AddToWorklist(Amt.getNode());
4460 }
4461
4462 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
4463 Trunc, Amt);
4464 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
4465 }
4466 }
4467 }
4468
4469 return SDValue();
4470}
4471
4472// We need to specifically handle i64 mul here to avoid unnecessary conversion
4473// instructions. If we only match on the legalized i64 mul expansion,
4474// SimplifyDemandedBits will be unable to remove them because there will be
4475// multiple uses due to the separate mul + mulh[su].
4476static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4477 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4478 if (Size <= 32) {
4479 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4480 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
4481 }
4482
4483 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4484 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4485
4486 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
4487 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
4488
4489 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
4490}
4491
4492/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4493/// return SDValue().
4494static SDValue getAddOneOp(const SDNode *V) {
4495 if (V->getOpcode() != ISD::ADD)
4496 return SDValue();
4497
4498 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
4499}
4500
4501SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
4502 DAGCombinerInfo &DCI) const {
4503 assert(N->getOpcode() == ISD::MUL);
4504 EVT VT = N->getValueType(0);
4505
4506 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4507 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4508 // unnecessarily). isDivergent() is used as an approximation of whether the
4509 // value is in an SGPR.
4510 if (!N->isDivergent())
4511 return SDValue();
4512
4513 unsigned Size = VT.getSizeInBits();
4514 if (VT.isVector() || Size > 64)
4515 return SDValue();
4516
4517 SelectionDAG &DAG = DCI.DAG;
4518 SDLoc DL(N);
4519
4520 SDValue N0 = N->getOperand(0);
4521 SDValue N1 = N->getOperand(1);
4522
4523 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
4524 // matching.
4525
4526 // mul x, (add y, 1) -> add (mul x, y), x
4527 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4528 SDValue AddOp = getAddOneOp(V.getNode());
4529 if (!AddOp)
4530 return SDValue();
4531
4532 if (V.hasOneUse() || all_of(V->users(), [](const SDNode *U) -> bool {
4533 return U->getOpcode() == ISD::MUL;
4534 }))
4535 return AddOp;
4536
4537 return SDValue();
4538 };
4539
4540 // FIXME: The selection pattern is not properly checking for commuted
4541 // operands, so we have to place the mul in the LHS
4542 if (SDValue MulOper = IsFoldableAdd(N0)) {
4543 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
4544 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
4545 }
4546
4547 if (SDValue MulOper = IsFoldableAdd(N1)) {
4548 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
4549 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
4550 }
4551
4552 // There are i16 integer mul/mad.
4553 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
4554 return SDValue();
4555
4556 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4557 // in the source into any_extends if the result of the mul is truncated. Since
4558 // we can assume the high bits are whatever we want, use the underlying value
4559 // to avoid the unknown high bits from interfering.
4560 if (N0.getOpcode() == ISD::ANY_EXTEND)
4561 N0 = N0.getOperand(0);
4562
4563 if (N1.getOpcode() == ISD::ANY_EXTEND)
4564 N1 = N1.getOperand(0);
4565
4566 SDValue Mul;
4567
4568 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4569 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4570 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4571 Mul = getMul24(DAG, DL, N0, N1, Size, false);
4572 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4573 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4574 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4575 Mul = getMul24(DAG, DL, N0, N1, Size, true);
4576 } else {
4577 return SDValue();
4578 }
4579
4580 // We need to use sext even for MUL_U24, because MUL_U24 is used
4581 // for signed multiply of 8 and 16-bit types.
4582 return DAG.getSExtOrTrunc(Mul, DL, VT);
4583}
4584
4585SDValue
4586AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
4587 DAGCombinerInfo &DCI) const {
4588 if (N->getValueType(0) != MVT::i32)
4589 return SDValue();
4590
4591 SelectionDAG &DAG = DCI.DAG;
4592 SDLoc DL(N);
4593
4594 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
4595 SDValue N0 = N->getOperand(0);
4596 SDValue N1 = N->getOperand(1);
4597
4598 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4599 // in the source into any_extends if the result of the mul is truncated. Since
4600 // we can assume the high bits are whatever we want, use the underlying value
4601 // to avoid the unknown high bits from interfering.
4602 if (N0.getOpcode() == ISD::ANY_EXTEND)
4603 N0 = N0.getOperand(0);
4604 if (N1.getOpcode() == ISD::ANY_EXTEND)
4605 N1 = N1.getOperand(0);
4606
4607 // Try to use two fast 24-bit multiplies (one for each half of the result)
4608 // instead of one slow extending multiply.
4609 unsigned LoOpcode = 0;
4610 unsigned HiOpcode = 0;
4611 if (Signed) {
4612 if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4613 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4614 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4615 LoOpcode = AMDGPUISD::MUL_I24;
4616 HiOpcode = AMDGPUISD::MULHI_I24;
4617 }
4618 } else {
4619 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4620 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4621 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4622 LoOpcode = AMDGPUISD::MUL_U24;
4623 HiOpcode = AMDGPUISD::MULHI_U24;
4624 }
4625 }
4626 if (!LoOpcode)
4627 return SDValue();
4628
4629 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
4630 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
4631 DCI.CombineTo(N, Lo, Hi);
4632 return SDValue(N, 0);
4633}
4634
4635SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
4636 DAGCombinerInfo &DCI) const {
4637 EVT VT = N->getValueType(0);
4638
4639 if (!Subtarget->hasMulI24() || VT.isVector())
4640 return SDValue();
4641
4642 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4643 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4644 // unnecessarily). isDivergent() is used as an approximation of whether the
4645 // value is in an SGPR.
4646 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4647 // valu op anyway)
4648 if (Subtarget->hasSMulHi() && !N->isDivergent())
4649 return SDValue();
4650
4651 SelectionDAG &DAG = DCI.DAG;
4652 SDLoc DL(N);
4653
4654 SDValue N0 = N->getOperand(0);
4655 SDValue N1 = N->getOperand(1);
4656
4657 if (!isI24(N0, DAG) || !isI24(N1, DAG))
4658 return SDValue();
4659
4660 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4661 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4662
4663 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
4664 DCI.AddToWorklist(Mulhi.getNode());
4665 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
4666}
4667
4668SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
4669 DAGCombinerInfo &DCI) const {
4670 EVT VT = N->getValueType(0);
4671
4672 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
4673 return SDValue();
4674
4675 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4676 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4677 // unnecessarily). isDivergent() is used as an approximation of whether the
4678 // value is in an SGPR.
4679 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4680 // valu op anyway)
4681 if (Subtarget->hasSMulHi() && !N->isDivergent())
4682 return SDValue();
4683
4684 SelectionDAG &DAG = DCI.DAG;
4685 SDLoc DL(N);
4686
4687 SDValue N0 = N->getOperand(0);
4688 SDValue N1 = N->getOperand(1);
4689
4690 if (!isU24(N0, DAG) || !isU24(N1, DAG))
4691 return SDValue();
4692
4693 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4694 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4695
4696 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
4697 DCI.AddToWorklist(Mulhi.getNode());
4698 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
4699}
4700
4701SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4702 SDValue Op,
4703 const SDLoc &DL,
4704 unsigned Opc) const {
4705 EVT VT = Op.getValueType();
4706 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
4707 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
4708 LegalVT != MVT::i16))
4709 return SDValue();
4710
4711 if (VT != MVT::i32)
4712 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
4713
4714 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
4715 if (VT != MVT::i32)
4716 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
4717
4718 return FFBX;
4719}
4720
4721// The native instructions return -1 on 0 input. Optimize out a select that
4722// produces -1 on 0.
4723//
4724// TODO: If zero is not undef, we could also do this if the output is compared
4725// against the bitwidth.
4726//
4727// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
4728SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
4729 SDValue LHS, SDValue RHS,
4730 DAGCombinerInfo &DCI) const {
4731 if (!isNullConstant(Cond.getOperand(1)))
4732 return SDValue();
4733
4734 SelectionDAG &DAG = DCI.DAG;
4735 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
4736 SDValue CmpLHS = Cond.getOperand(0);
4737
4738 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4739 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4740 if (CCOpcode == ISD::SETEQ &&
4741 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
4742 RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
4743 unsigned Opc =
4744 isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4745 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4746 }
4747
4748 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4749 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4750 if (CCOpcode == ISD::SETNE &&
4751 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
4752 LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
4753 unsigned Opc =
4754 isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4755
4756 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4757 }
4758
4759 return SDValue();
4760}
4761
4762static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
4763 unsigned Op,
4764 const SDLoc &SL,
4765 SDValue Cond,
4766 SDValue N1,
4767 SDValue N2) {
4768 SelectionDAG &DAG = DCI.DAG;
4769 EVT VT = N1.getValueType();
4770
4771 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
4772 N1.getOperand(0), N2.getOperand(0));
4773 DCI.AddToWorklist(NewSelect.getNode());
4774 return DAG.getNode(Op, SL, VT, NewSelect);
4775}
4776
4777// Pull a free FP operation out of a select so it may fold into uses.
4778//
4779// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
4780// select c, (fneg x), k -> fneg (select c, x, (fneg k))
4781//
4782// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
4783// select c, (fabs x), +k -> fabs (select c, x, k)
4784SDValue
4785AMDGPUTargetLowering::foldFreeOpFromSelect(DAGCombinerInfo &DCI,
4786 SDValue N) const {
4787 SelectionDAG &DAG = DCI.DAG;
4788 SDValue Cond = N.getOperand(0);
4789 SDValue LHS = N.getOperand(1);
4790 SDValue RHS = N.getOperand(2);
4791
4792 EVT VT = N.getValueType();
4793 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
4794 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
4795 if (!allUsesHaveSourceMods(N.getNode()))
4796 return SDValue();
4797
4798 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
4799 SDLoc(N), Cond, LHS, RHS);
4800 }
4801
4802 bool Inv = false;
4803 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
4804 std::swap(LHS, RHS);
4805 Inv = true;
4806 }
4807
4808 // TODO: Support vector constants.
4810 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
4811 !selectSupportsSourceMods(N.getNode())) {
4812 SDLoc SL(N);
4813 // If one side is an fneg/fabs and the other is a constant, we can push the
4814 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
4815 SDValue NewLHS = LHS.getOperand(0);
4816 SDValue NewRHS = RHS;
4817
4818 // Careful: if the neg can be folded up, don't try to pull it back down.
4819 bool ShouldFoldNeg = true;
4820
4821 if (NewLHS.hasOneUse()) {
4822 unsigned Opc = NewLHS.getOpcode();
4823 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
4824 ShouldFoldNeg = false;
4825 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
4826 ShouldFoldNeg = false;
4827 }
4828
4829 if (ShouldFoldNeg) {
4830 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
4831 return SDValue();
4832
4833 // We're going to be forced to use a source modifier anyway, there's no
4834 // point in pulling the negate out unless we can get a size reduction by
4835 // negating the constant.
4836 //
4837 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
4838 // about cheaper constants.
4839 if (NewLHS.getOpcode() == ISD::FABS &&
4840 getConstantNegateCost(CRHS) != NegatibleCost::Cheaper)
4841 return SDValue();
4842
4844 return SDValue();
4845
4846 if (LHS.getOpcode() == ISD::FNEG)
4847 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4848
4849 if (Inv)
4850 std::swap(NewLHS, NewRHS);
4851
4852 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
4853 Cond, NewLHS, NewRHS);
4854 DCI.AddToWorklist(NewSelect.getNode());
4855 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
4856 }
4857 }
4858
4859 return SDValue();
4860}
4861
4862SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
4863 DAGCombinerInfo &DCI) const {
4864 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
4865 return Folded;
4866
4867 SDValue Cond = N->getOperand(0);
4868 if (Cond.getOpcode() != ISD::SETCC)
4869 return SDValue();
4870
4871 EVT VT = N->getValueType(0);
4872 SDValue LHS = Cond.getOperand(0);
4873 SDValue RHS = Cond.getOperand(1);
4874 SDValue CC = Cond.getOperand(2);
4875
4876 SDValue True = N->getOperand(1);
4877 SDValue False = N->getOperand(2);
4878
4879 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
4880 SelectionDAG &DAG = DCI.DAG;
4881 if (DAG.isConstantValueOfAnyType(True) &&
4882 !DAG.isConstantValueOfAnyType(False)) {
4883 // Swap cmp + select pair to move constant to false input.
4884 // This will allow using VOPC cndmasks more often.
4885 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
4886
4887 SDLoc SL(N);
4888 ISD::CondCode NewCC =
4889 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
4890
4891 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
4892 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
4893 }
4894
4895 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
4896 SDValue MinMax
4897 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
4898 // Revisit this node so we can catch min3/max3/med3 patterns.
4899 //DCI.AddToWorklist(MinMax.getNode());
4900 return MinMax;
4901 }
4902 }
4903
4904 // There's no reason to not do this if the condition has other uses.
4905 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
4906}
4907
4908static bool isInv2Pi(const APFloat &APF) {
4909 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
4910 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
4911 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
4912
4913 return APF.bitwiseIsEqual(KF16) ||
4914 APF.bitwiseIsEqual(KF32) ||
4915 APF.bitwiseIsEqual(KF64);
4916}
4917
4918 // 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
4919// additional cost to negate them.
4920TargetLowering::NegatibleCost
4921AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
4922 if (C->isZero())
4923 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4924
4925 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
4926 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4927
4928 return NegatibleCost::Neutral;
4929}
4930
4931bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
4932 if (ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4933 return getConstantNegateCost(C) == NegatibleCost::Expensive;
4934 return false;
4935}
4936
4937bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
4938 if (ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4939 return getConstantNegateCost(C) == NegatibleCost::Cheaper;
4940 return false;
4941}
4942
4943static unsigned inverseMinMax(unsigned Opc) {
4944 switch (Opc) {
4945 case ISD::FMAXNUM:
4946 return ISD::FMINNUM;
4947 case ISD::FMINNUM:
4948 return ISD::FMAXNUM;
4949 case ISD::FMAXNUM_IEEE:
4950 return ISD::FMINNUM_IEEE;
4951 case ISD::FMINNUM_IEEE:
4952 return ISD::FMAXNUM_IEEE;
4953 case ISD::FMAXIMUM:
4954 return ISD::FMINIMUM;
4955 case ISD::FMINIMUM:
4956 return ISD::FMAXIMUM;
4957 case ISD::FMAXIMUMNUM:
4958 return ISD::FMINIMUMNUM;
4959 case ISD::FMINIMUMNUM:
4960 return ISD::FMAXIMUMNUM;
4961 case AMDGPUISD::FMAX_LEGACY:
4962 return AMDGPUISD::FMIN_LEGACY;
4963 case AMDGPUISD::FMIN_LEGACY:
4964 return AMDGPUISD::FMAX_LEGACY;
4965 default:
4966 llvm_unreachable("invalid min/max opcode");
4967 }
4968}
4969
4970/// \return true if it's profitable to try to push an fneg into its source
4971/// instruction.
4972static bool shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
4973 // If the input has multiple uses and we can either fold the negate down, or
4974 // the other uses cannot, give up. This both prevents unprofitable
4975 // transformations and infinite loops: we won't repeatedly try to fold around
4976 // a negate that has no 'good' form.
4977 if (N0.hasOneUse()) {
4978 // This may be able to fold into the source, but at a code size cost. Don't
4979 // fold if the fold into the user is free.
4980 if (allUsesHaveSourceMods(N, 0))
4981 return false;
4982 } else {
4983 if (fnegFoldsIntoOp(N0.getNode()) &&
4984 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
4985 return false;
4986 }
4987
4988 return true;
4989}
4990
4991SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
4992 DAGCombinerInfo &DCI) const {
4993 SelectionDAG &DAG = DCI.DAG;
4994 SDValue N0 = N->getOperand(0);
4995 EVT VT = N->getValueType(0);
4996
4997 unsigned Opc = N0.getOpcode();
4998
4999 if (!shouldFoldFNegIntoSrc(N, N0))
5000 return SDValue();
5001
5002 SDLoc SL(N);
5003 switch (Opc) {
5004 case ISD::FADD: {
5005 if (!mayIgnoreSignedZero(N0))
5006 return SDValue();
5007
5008 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
5009 SDValue LHS = N0.getOperand(0);
5010 SDValue RHS = N0.getOperand(1);
5011
5012 if (LHS.getOpcode() != ISD::FNEG)
5013 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
5014 else
5015 LHS = LHS.getOperand(0);
5016
5017 if (RHS.getOpcode() != ISD::FNEG)
5018 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5019 else
5020 RHS = RHS.getOperand(0);
5021
5022 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
5023 if (Res.getOpcode() != ISD::FADD)
5024 return SDValue(); // Op got folded away.
5025 if (!N0.hasOneUse())
5026 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5027 return Res;
5028 }
5029 case ISD::FMUL:
5030 case AMDGPUISD::FMUL_LEGACY: {
5031 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
5032 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
5033 SDValue LHS = N0.getOperand(0);
5034 SDValue RHS = N0.getOperand(1);
5035
5036 if (LHS.getOpcode() == ISD::FNEG)
5037 LHS = LHS.getOperand(0);
5038 else if (RHS.getOpcode() == ISD::FNEG)
5039 RHS = RHS.getOperand(0);
5040 else
5041 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5042
5043 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
5044 if (Res.getOpcode() != Opc)
5045 return SDValue(); // Op got folded away.
5046 if (!N0.hasOneUse())
5047 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5048 return Res;
5049 }
5050 case ISD::FMA:
5051 case ISD::FMAD: {
5052 // TODO: handle llvm.amdgcn.fma.legacy
5053 if (!mayIgnoreSignedZero(N0))
5054 return SDValue();
5055
5056 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
5057 SDValue LHS = N0.getOperand(0);
5058 SDValue MHS = N0.getOperand(1);
5059 SDValue RHS = N0.getOperand(2);
5060
5061 if (LHS.getOpcode() == ISD::FNEG)
5062 LHS = LHS.getOperand(0);
5063 else if (MHS.getOpcode() == ISD::FNEG)
5064 MHS = MHS.getOperand(0);
5065 else
5066 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
5067
5068 if (RHS.getOpcode() != ISD::FNEG)
5069 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5070 else
5071 RHS = RHS.getOperand(0);
5072
5073 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
5074 if (Res.getOpcode() != Opc)
5075 return SDValue(); // Op got folded away.
5076 if (!N0.hasOneUse())
5077 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5078 return Res;
5079 }
5080 case ISD::FMAXNUM:
5081 case ISD::FMINNUM:
5082 case ISD::FMAXNUM_IEEE:
5083 case ISD::FMINNUM_IEEE:
5084 case ISD::FMINIMUM:
5085 case ISD::FMAXIMUM:
5086 case ISD::FMINIMUMNUM:
5087 case ISD::FMAXIMUMNUM:
5088 case AMDGPUISD::FMAX_LEGACY:
5089 case AMDGPUISD::FMIN_LEGACY: {
5090 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
5091 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
5092 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
5093 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
5094
5095 SDValue LHS = N0.getOperand(0);
5096 SDValue RHS = N0.getOperand(1);
5097
5098 // 0 doesn't have a negated inline immediate.
5099 // TODO: This constant check should be generalized to other operations.
5100 if (isConstantCostlierToNegate(RHS))
5101 return SDValue();
5102
5103 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
5104 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5105 unsigned Opposite = inverseMinMax(Opc);
5106
5107 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
5108 if (Res.getOpcode() != Opposite)
5109 return SDValue(); // Op got folded away.
5110 if (!N0.hasOneUse())
5111 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5112 return Res;
5113 }
5114 case AMDGPUISD::FMED3: {
5115 SDValue Ops[3];
5116 for (unsigned I = 0; I < 3; ++I)
5117 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
5118
5119 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
5120 if (Res.getOpcode() != AMDGPUISD::FMED3)
5121 return SDValue(); // Op got folded away.
5122
5123 if (!N0.hasOneUse()) {
5124 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
5125 DAG.ReplaceAllUsesWith(N0, Neg);
5126
5127 for (SDNode *U : Neg->users())
5128 DCI.AddToWorklist(U);
5129 }
5130
5131 return Res;
5132 }
5133 case ISD::FP_EXTEND:
5134 case ISD::FTRUNC:
5135 case ISD::FRINT:
5136 case ISD::FNEARBYINT: // XXX - Should fround be handled?
5137 case ISD::FROUNDEVEN:
5138 case ISD::FSIN:
5139 case ISD::FCANONICALIZE:
5140 case AMDGPUISD::RCP:
5141 case AMDGPUISD::RCP_LEGACY:
5142 case AMDGPUISD::RCP_IFLAG:
5143 case AMDGPUISD::SIN_HW: {
5144 SDValue CvtSrc = N0.getOperand(0);
5145 if (CvtSrc.getOpcode() == ISD::FNEG) {
5146 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
5147 // (fneg (rcp (fneg x))) -> (rcp x)
5148 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
5149 }
5150
5151 if (!N0.hasOneUse())
5152 return SDValue();
5153
5154 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
5155 // (fneg (rcp x)) -> (rcp (fneg x))
5156 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
5157 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
5158 }
5159 case ISD::FP_ROUND: {
5160 SDValue CvtSrc = N0.getOperand(0);
5161
5162 if (CvtSrc.getOpcode() == ISD::FNEG) {
5163 // (fneg (fp_round (fneg x))) -> (fp_round x)
5164 return DAG.getNode(ISD::FP_ROUND, SL, VT,
5165 CvtSrc.getOperand(0), N0.getOperand(1));
5166 }
5167
5168 if (!N0.hasOneUse())
5169 return SDValue();
5170
5171 // (fneg (fp_round x)) -> (fp_round (fneg x))
5172 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
5173 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
5174 }
5175 case ISD::FP16_TO_FP: {
5176 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
5177 // f16, but legalization of f16 fneg ends up pulling it out of the source.
5178 // Put the fneg back as a legal source operation that can be matched later.
5179 SDLoc SL(N);
5180
5181 SDValue Src = N0.getOperand(0);
5182 EVT SrcVT = Src.getValueType();
5183
5184 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
5185 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
5186 DAG.getConstant(0x8000, SL, SrcVT));
5187 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
5188 }
5189 case ISD::SELECT: {
5190 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
5191 // TODO: Invert conditions of foldFreeOpFromSelect
5192 return SDValue();
5193 }
5194 case ISD::BITCAST: {
5195 SDLoc SL(N);
5196 SDValue BCSrc = N0.getOperand(0);
5197 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
5198 SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
5199 if (HighBits.getValueType().getSizeInBits() != 32 ||
5200 !fnegFoldsIntoOp(HighBits.getNode()))
5201 return SDValue();
5202
5203 // f64 fneg only really needs to operate on the high half of the
5204 // register, so try to force it to an f32 operation to help make use of
5205 // source modifiers.
5206 //
5207 //
5208 // fneg (f64 (bitcast (build_vector x, y))) ->
5209 // f64 (bitcast (build_vector (bitcast i32:x to f32),
5210 // (fneg (bitcast i32:y to f32)))
5211
5212 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
5213 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
5214 SDValue CastBack =
5215 DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);
5216
5217 SmallVector<SDValue, 8> Ops(BCSrc->ops());
5218 Ops.back() = CastBack;
5219 DCI.AddToWorklist(NegHi.getNode());
5220 SDValue Build =
5221 DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
5222 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);
5223
5224 if (!N0.hasOneUse())
5225 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
5226 return Result;
5227 }
5228
5229 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
5230 BCSrc.hasOneUse()) {
5231 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
5232 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
5233
5234 // TODO: Cast back result for multiple uses is beneficial in some cases.
5235
5236 SDValue LHS =
5237 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
5238 SDValue RHS =
5239 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
5240
5241 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
5242 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
5243
5244 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
5245 NegRHS);
5246 }
5247
5248 return SDValue();
5249 }
5250 default:
5251 return SDValue();
5252 }
5253}
5254
5255SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
5256 DAGCombinerInfo &DCI) const {
5257 SelectionDAG &DAG = DCI.DAG;
5258 SDValue N0 = N->getOperand(0);
5259
5260 if (!N0.hasOneUse())
5261 return SDValue();
5262
5263 switch (N0.getOpcode()) {
5264 case ISD::FP16_TO_FP: {
5265 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
5266 SDLoc SL(N);
5267 SDValue Src = N0.getOperand(0);
5268 EVT SrcVT = Src.getValueType();
5269
5270 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
5271 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
5272 DAG.getConstant(0x7fff, SL, SrcVT));
5273 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
5274 }
5275 default:
5276 return SDValue();
5277 }
5278}
5279
5280SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
5281 DAGCombinerInfo &DCI) const {
5282 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
5283 if (!CFP)
5284 return SDValue();
5285
5286 // XXX - Should this flush denormals?
5287 const APFloat &Val = CFP->getValueAPF();
5288 APFloat One(Val.getSemantics(), "1.0");
5289 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
5290}
5291
5292SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
5293 DAGCombinerInfo &DCI) const {
5294 SelectionDAG &DAG = DCI.DAG;
5295 SDLoc DL(N);
5296
5297 switch(N->getOpcode()) {
5298 default:
5299 break;
5300 case ISD::BITCAST: {
5301 EVT DestVT = N->getValueType(0);
5302
5303 // Push casts through vector builds. This helps avoid emitting a large
5304 // number of copies when materializing floating point vector constants.
5305 //
5306 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
5307 // vNt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5308 if (DestVT.isVector()) {
5309 SDValue Src = N->getOperand(0);
5310 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5311 (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
5312 isOperationLegal(ISD::BUILD_VECTOR, DestVT))) {
5313 EVT SrcVT = Src.getValueType();
5314 unsigned NElts = DestVT.getVectorNumElements();
5315
5316 if (SrcVT.getVectorNumElements() == NElts) {
5317 EVT DestEltVT = DestVT.getVectorElementType();
5318
5319 SmallVector<SDValue, 8> CastedElts;
5320 SDLoc SL(N);
5321 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5322 SDValue Elt = Src.getOperand(I);
5323 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
5324 }
5325
5326 return DAG.getBuildVector(DestVT, SL, CastedElts);
5327 }
5328 }
5329 }
5330
5331 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5332 break;
5333
5334 // Fold bitcasts of constants.
5335 //
5336 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5337 // TODO: Generalize and move to DAGCombiner
5338 SDValue Src = N->getOperand(0);
5339 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
5340 SDLoc SL(N);
5341 uint64_t CVal = C->getZExtValue();
5342 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5343 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5344 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5345 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
5346 }
5347
5348 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
5349 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5350 SDLoc SL(N);
5351 uint64_t CVal = Val.getZExtValue();
5352 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5353 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5354 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5355
5356 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
5357 }
5358
5359 break;
5360 }
5361 case ISD::SHL:
5362 case ISD::SRA:
5363 case ISD::SRL: {
5364 // Range metadata can be invalidated when loads are converted to legal types
5365 // (e.g. v2i64 -> v4i32).
5366 // Try to convert vector shl/sra/srl before type legalization so that range
5367 // metadata can be utilized.
5368 if (!(N->getValueType(0).isVector() &&
5371 break;
5372 if (N->getOpcode() == ISD::SHL)
5373 return performShlCombine(N, DCI);
5374 if (N->getOpcode() == ISD::SRA)
5375 return performSraCombine(N, DCI);
5376 return performSrlCombine(N, DCI);
5377 }
5378 case ISD::TRUNCATE:
5379 return performTruncateCombine(N, DCI);
5380 case ISD::MUL:
5381 return performMulCombine(N, DCI);
5382 case AMDGPUISD::MUL_U24:
5383 case AMDGPUISD::MUL_I24: {
5384 if (SDValue Simplified = simplifyMul24(N, DCI))
5385 return Simplified;
5386 break;
5387 }
5388 case AMDGPUISD::MULHI_I24:
5389 case AMDGPUISD::MULHI_U24:
5390 return simplifyMul24(N, DCI);
5391 case ISD::SMUL_LOHI:
5392 case ISD::UMUL_LOHI:
5393 return performMulLoHiCombine(N, DCI);
5394 case ISD::MULHS:
5395 return performMulhsCombine(N, DCI);
5396 case ISD::MULHU:
5397 return performMulhuCombine(N, DCI);
5398 case ISD::SELECT:
5399 return performSelectCombine(N, DCI);
5400 case ISD::FNEG:
5401 return performFNegCombine(N, DCI);
5402 case ISD::FABS:
5403 return performFAbsCombine(N, DCI);
5404 case AMDGPUISD::BFE_I32:
5405 case AMDGPUISD::BFE_U32: {
5406 assert(!N->getValueType(0).isVector() &&
5407 "Vector handling of BFE not implemented");
5408 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
5409 if (!Width)
5410 break;
5411
5412 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5413 if (WidthVal == 0)
5414 return DAG.getConstant(0, DL, MVT::i32);
5415
5416 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
5417 if (!Offset)
5418 break;
5419
5420 SDValue BitsFrom = N->getOperand(0);
5421 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5422
5423 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5424
5425 if (OffsetVal == 0) {
5426 // This is already sign / zero extended, so try to fold away extra BFEs.
5427 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
5428
5429 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
5430 if (OpSignBits >= SignBits)
5431 return BitsFrom;
5432
5433 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
5434 if (Signed) {
5435 // This is a sign_extend_inreg. Replace it to take advantage of existing
5436 // DAG Combines. If not eliminated, we will match back to BFE during
5437 // selection.
5438
5439 // TODO: The sext_inreg of extended types ends up here, although we could
5440 // handle them in a single BFE.
5441 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
5442 DAG.getValueType(SmallVT));
5443 }
5444
5445 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
5446 }
5447
5448 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
5449 if (Signed) {
5450 return constantFoldBFE<int32_t>(DAG,
5451 CVal->getSExtValue(),
5452 OffsetVal,
5453 WidthVal,
5454 DL);
5455 }
5456
5457 return constantFoldBFE<uint32_t>(DAG,
5458 CVal->getZExtValue(),
5459 OffsetVal,
5460 WidthVal,
5461 DL);
5462 }
5463
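// When the extracted field runs to bit 31 (or beyond), the BFE is just a
// shift right by the offset: arithmetic for BFE_I32, logical for BFE_U32,
// e.g. bfe_u32(x, 24, 8) -> srl x, 24. The BFE is kept only when SDWA can
// match the 16-bit upper-half extract directly.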
5464 if ((OffsetVal + WidthVal) >= 32 &&
5465 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
5466 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
5467 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
5468 BitsFrom, ShiftVal);
5469 }
5470
5471 if (BitsFrom.hasOneUse()) {
5472 APInt Demanded = APInt::getBitsSet(32,
5473 OffsetVal,
5474 OffsetVal + WidthVal);
5475
5476 KnownBits Known;
5477 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
5478 !DCI.isBeforeLegalizeOps());
5479 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5480 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
5481 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
5482 DCI.CommitTargetLoweringOpt(TLO);
5483 }
5484 }
5485
5486 break;
5487 }
5488 case ISD::LOAD:
5489 return performLoadCombine(N, DCI);
5490 case ISD::STORE:
5491 return performStoreCombine(N, DCI);
5492 case AMDGPUISD::RCP:
5493 case AMDGPUISD::RCP_IFLAG:
5494 return performRcpCombine(N, DCI);
5495 case ISD::AssertZext:
5496 case ISD::AssertSext:
5497 return performAssertSZExtCombine(N, DCI);
5498 case ISD::INTRINSIC_WO_CHAIN:
5499 return performIntrinsicWOChainCombine(N, DCI);
5500 case AMDGPUISD::FMAD_FTZ: {
5501 SDValue N0 = N->getOperand(0);
5502 SDValue N1 = N->getOperand(1);
5503 SDValue N2 = N->getOperand(2);
5504 EVT VT = N->getValueType(0);
5505
5506 // FMAD_FTZ is a FMAD + flush denormals to zero.
5507 // We flush the inputs, the intermediate step, and the output.
5508 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
5509 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
5510 ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
5511 if (N0CFP && N1CFP && N2CFP) {
5512 const auto FTZ = [](const APFloat &V) {
5513 if (V.isDenormal()) {
5514 APFloat Zero(V.getSemantics(), 0);
5515 return V.isNegative() ? -Zero : Zero;
5516 }
5517 return V;
5518 };
5519
5520 APFloat V0 = FTZ(N0CFP->getValueAPF());
5521 APFloat V1 = FTZ(N1CFP->getValueAPF());
5522 APFloat V2 = FTZ(N2CFP->getValueAPF());
5523 V0.multiply(V1, APFloat::rmNearestTiesToEven);
5524 V0 = FTZ(V0);
5525 V0.add(V2, APFloat::rmNearestTiesToEven);
5526 return DAG.getConstantFP(FTZ(V0), DL, VT);
5527 }
5528 break;
5529 }
5530 }
5531 return SDValue();
5532}
5533
5534//===----------------------------------------------------------------------===//
5535// Helper functions
5536//===----------------------------------------------------------------------===//
5537
5539 const TargetRegisterClass *RC,
5540 Register Reg, EVT VT,
5541 const SDLoc &SL,
5542 bool RawReg) const {
5543 MachineFunction &MF = DAG.getMachineFunction();
5544 MachineRegisterInfo &MRI = MF.getRegInfo();
5545 Register VReg;
5546
5547 if (!MRI.isLiveIn(Reg)) {
5548 VReg = MRI.createVirtualRegister(RC);
5549 MRI.addLiveIn(Reg, VReg);
5550 } else {
5551 VReg = MRI.getLiveInVirtReg(Reg);
5552 }
5553
5554 if (RawReg)
5555 return DAG.getRegister(VReg, VT);
5556
5557 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
5558}
5559
5560// This may be called multiple times, and nothing prevents creating multiple
5561// objects at the same offset. See if we already defined this object.
5563 int64_t Offset) {
5564 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5565 if (MFI.getObjectOffset(I) == Offset) {
5566 assert(MFI.getObjectSize(I) == Size);
5567 return I;
5568 }
5569 }
5570
5571 return MFI.CreateFixedObject(Size, Offset, true);
5572}
5573
5575 EVT VT,
5576 const SDLoc &SL,
5577 int64_t Offset) const {
5579 MachineFrameInfo &MFI = MF.getFrameInfo();
5580 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
5581
5582 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5583 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
5584
5585 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
5586 MachineMemOperand::MODereferenceable |
5587 MachineMemOperand::MOInvariant);
5588}
5589
5591 const SDLoc &SL,
5592 SDValue Chain,
5593 SDValue ArgVal,
5594 int64_t Offset) const {
5595 MachineFunction &MF = DAG.getMachineFunction();
5596 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
5597 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5598
5599 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
5600 // Stores to the argument stack area are relative to the stack pointer.
5601 SDValue SP =
5602 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
5603 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
5604 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
5605 MachineMemOperand::MODereferenceable);
5606 return Store;
5607}
5608
5610 const TargetRegisterClass *RC,
5611 EVT VT, const SDLoc &SL,
5612 const ArgDescriptor &Arg) const {
5613 assert(Arg && "Attempting to load missing argument");
5614
5615 SDValue V = Arg.isRegister() ?
5616 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
5617 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
5618
5619 if (!Arg.isMasked())
5620 return V;
5621
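// A masked argument is packed into a wider register: shift the field down to
// bit 0, then mask it with the field width (the mask is shifted down as well).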
5622 unsigned Mask = Arg.getMask();
5623 unsigned Shift = llvm::countr_zero<unsigned>(Mask);
5624 V = DAG.getNode(ISD::SRL, SL, VT, V,
5625 DAG.getShiftAmountConstant(Shift, VT, SL));
5626 return DAG.getNode(ISD::AND, SL, VT, V,
5627 DAG.getConstant(Mask >> Shift, SL, VT));
5628}
5629
5631 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5632 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5633 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5634 uint64_t ArgOffset =
5635 alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
5636 switch (Param) {
5637 case FIRST_IMPLICIT:
5638 return ArgOffset;
5639 case PRIVATE_BASE:
5640 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
5641 case SHARED_BASE:
5642 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5643 case QUEUE_PTR:
5644 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5645 }
5646 llvm_unreachable("unexpected implicit parameter type");
5647}
5648
5654
5656 SelectionDAG &DAG, int Enabled,
5657 int &RefinementSteps,
5658 bool &UseOneConstNR,
5659 bool Reciprocal) const {
5660 EVT VT = Operand.getValueType();
5661
5662 if (VT == MVT::f32) {
5663 RefinementSteps = 0;
5664 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
5665 }
5666
5667 // TODO: There is also an f64 rsq instruction, but the documentation is less
5668 // clear on its precision.
5669
5670 return SDValue();
5671}
5672
5674 SelectionDAG &DAG, int Enabled,
5675 int &RefinementSteps) const {
5676 EVT VT = Operand.getValueType();
5677
5678 if (VT == MVT::f32) {
5679 // Reciprocal, < 1 ulp error.
5680 //
5681 // This reciprocal approximation converges to < 0.5 ulp error with one
5682 // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
5683
5684 RefinementSteps = 0;
5685 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
5686 }
5687
5688 // TODO: There is also an f64 rcp instruction, but the documentation is less
5689 // clear on its precision.
5690
5691 return SDValue();
5692}
5693
5694static unsigned workitemIntrinsicDim(unsigned ID) {
5695 switch (ID) {
5696 case Intrinsic::amdgcn_workitem_id_x:
5697 return 0;
5698 case Intrinsic::amdgcn_workitem_id_y:
5699 return 1;
5700 case Intrinsic::amdgcn_workitem_id_z:
5701 return 2;
5702 default:
5703 llvm_unreachable("not a workitem intrinsic");
5704 }
5705}
5706
5708 const SDValue Op, KnownBits &Known,
5709 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
5710
5711 Known.resetAll(); // Don't know anything.
5712
5713 unsigned Opc = Op.getOpcode();
5714
5715 switch (Opc) {
5716 default:
5717 break;
5718 case AMDGPUISD::CARRY:
5719 case AMDGPUISD::BORROW: {
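// CARRY and BORROW produce only 0 or 1, so every bit above bit 0 is known zero.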
5720 Known.Zero = APInt::getHighBitsSet(32, 31);
5721 break;
5722 }
5723
5724 case AMDGPUISD::BFE_I32:
5725 case AMDGPUISD::BFE_U32: {
5726 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5727 if (!CWidth)
5728 return;
5729
5730 uint32_t Width = CWidth->getZExtValue() & 0x1f;
5731
5732 if (Opc == AMDGPUISD::BFE_U32)
5733 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
5734
5735 break;
5736 }
5737 case AMDGPUISD::FP_TO_FP16: {
5738 unsigned BitWidth = Known.getBitWidth();
5739
5740 // High bits are zero.
5741 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
5742 break;
5743 }
5744 case AMDGPUISD::MUL_U24:
5745 case AMDGPUISD::MUL_I24: {
5746 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5747 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5748 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
5749 RHSKnown.countMinTrailingZeros();
5750 Known.Zero.setLowBits(std::min(TrailZ, 32u));
5751 // Skip extra check if all bits are known zeros.
5752 if (TrailZ >= 32)
5753 break;
5754
5755 // Truncate to 24 bits.
5756 LHSKnown = LHSKnown.trunc(24);
5757 RHSKnown = RHSKnown.trunc(24);
5758
5759 if (Opc == AMDGPUISD::MUL_I24) {
5760 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
5761 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
5762 unsigned MaxValBits = LHSValBits + RHSValBits;
5763 if (MaxValBits > 32)
5764 break;
5765 unsigned SignBits = 32 - MaxValBits + 1;
5766 bool LHSNegative = LHSKnown.isNegative();
5767 bool LHSNonNegative = LHSKnown.isNonNegative();
5768 bool LHSPositive = LHSKnown.isStrictlyPositive();
5769 bool RHSNegative = RHSKnown.isNegative();
5770 bool RHSNonNegative = RHSKnown.isNonNegative();
5771 bool RHSPositive = RHSKnown.isStrictlyPositive();
5772
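// If both operands are known non-negative, or both known negative, the
// product is non-negative and its high SignBits are zero; if one side is
// known negative and the other strictly positive, the product is negative
// and the high bits are one.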
5773 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
5774 Known.Zero.setHighBits(SignBits);
5775 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
5776 Known.One.setHighBits(SignBits);
5777 } else {
5778 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
5779 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
5780 unsigned MaxValBits = LHSValBits + RHSValBits;
5781 if (MaxValBits >= 32)
5782 break;
5783 Known.Zero.setBitsFrom(MaxValBits);
5784 }
5785 break;
5786 }
5787 case AMDGPUISD::PERM: {
5788 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5789 if (!CMask)
5790 return;
5791
5792 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5793 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5794 unsigned Sel = CMask->getZExtValue();
5795
5796 for (unsigned I = 0; I < 32; I += 8) {
5797 unsigned SelBits = Sel & 0xff;
5798 if (SelBits < 4) {
5799 SelBits *= 8;
5800 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5801 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5802 } else if (SelBits < 7) {
5803 SelBits = (SelBits & 3) * 8;
5804 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5805 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5806 } else if (SelBits == 0x0c) {
5807 Known.Zero |= 0xFFull << I;
5808 } else if (SelBits > 0x0c) {
5809 Known.One |= 0xFFull << I;
5810 }
5811 Sel >>= 8;
5812 }
5813 break;
5814 }
5815 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
5816 Known.Zero.setHighBits(24);
5817 break;
5818 }
5819 case AMDGPUISD::BUFFER_LOAD_USHORT: {
5820 Known.Zero.setHighBits(16);
5821 break;
5822 }
5823 case AMDGPUISD::LDS: {
5824 auto *GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
5825 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
5826
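// The node produces a 16-bit LDS offset aligned to the global's alignment,
// so the high 16 bits and the low Log2(Alignment) bits are known zero.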
5827 Known.Zero.setHighBits(16);
5828 Known.Zero.setLowBits(Log2(Alignment));
5829 break;
5830 }
5831 case AMDGPUISD::SMIN3:
5832 case AMDGPUISD::SMAX3:
5833 case AMDGPUISD::SMED3:
5834 case AMDGPUISD::UMIN3:
5835 case AMDGPUISD::UMAX3:
5836 case AMDGPUISD::UMED3: {
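// min3/max3/med3 always return one of their three operands, so a result bit
// is known only if it has the same known value in all three inputs.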
5837 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
5838 if (Known2.isUnknown())
5839 break;
5840
5841 KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5842 if (Known1.isUnknown())
5843 break;
5844
5845 KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5846 if (Known0.isUnknown())
5847 break;
5848
5849 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
5850 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
5851 Known.One = Known0.One & Known1.One & Known2.One;
5852 break;
5853 }
5854 case ISD::INTRINSIC_WO_CHAIN: {
5855 unsigned IID = Op.getConstantOperandVal(0);
5856 switch (IID) {
5857 case Intrinsic::amdgcn_workitem_id_x:
5858 case Intrinsic::amdgcn_workitem_id_y:
5859 case Intrinsic::amdgcn_workitem_id_z: {
5860 unsigned MaxValue = Subtarget->getMaxWorkitemID(
5861 DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
5862 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
5863 break;
5864 }
5865 default:
5866 break;
5867 }
5868 }
5869 }
5870}
5871
5873 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
5874 unsigned Depth) const {
5875 switch (Op.getOpcode()) {
5876 case AMDGPUISD::BFE_I32: {
5877 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5878 if (!Width)
5879 return 1;
5880
5881 unsigned SignBits = 32 - Width->getZExtValue() + 1;
5882 if (!isNullConstant(Op.getOperand(1)))
5883 return SignBits;
5884
5885 // TODO: Could probably figure something out with non-0 offsets.
5886 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5887 return std::max(SignBits, Op0SignBits);
5888 }
5889
5890 case AMDGPUISD::BFE_U32: {
5891 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5892 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
5893 }
5894
5895 case AMDGPUISD::CARRY:
5896 case AMDGPUISD::BORROW:
5897 return 31;
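// Sign-extending byte/short buffer loads have 32 - 8 + 1 = 25 and
// 32 - 16 + 1 = 17 known sign bits; the zero-extending forms and
// FP_TO_FP16 clear the high 24, 16, and 16 bits respectively.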
5898 case AMDGPUISD::BUFFER_LOAD_BYTE:
5899 return 25;
5900 case AMDGPUISD::BUFFER_LOAD_SHORT:
5901 return 17;
5902 case AMDGPUISD::BUFFER_LOAD_UBYTE:
5903 return 24;
5904 case AMDGPUISD::BUFFER_LOAD_USHORT:
5905 return 16;
5906 case AMDGPUISD::FP_TO_FP16:
5907 return 16;
5908 case AMDGPUISD::SMIN3:
5909 case AMDGPUISD::SMAX3:
5910 case AMDGPUISD::SMED3:
5911 case AMDGPUISD::UMIN3:
5912 case AMDGPUISD::UMAX3:
5913 case AMDGPUISD::UMED3: {
5914 unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
5915 if (Tmp2 == 1)
5916 return 1; // Early out.
5917
5918 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
5919 if (Tmp1 == 1)
5920 return 1; // Early out.
5921
5922 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5923 if (Tmp0 == 1)
5924 return 1; // Early out.
5925
5926 return std::min({Tmp0, Tmp1, Tmp2});
5927 }
5928 default:
5929 return 1;
5930 }
5931}
5932
5934 GISelValueTracking &Analysis, Register R, const APInt &DemandedElts,
5935 const MachineRegisterInfo &MRI, unsigned Depth) const {
5936 const MachineInstr *MI = MRI.getVRegDef(R);
5937 if (!MI)
5938 return 1;
5939
5940 // TODO: Check range metadata on MMO.
5941 switch (MI->getOpcode()) {
5942 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
5943 return 25;
5944 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
5945 return 17;
5946 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
5947 return 24;
5948 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
5949 return 16;
5950 case AMDGPU::G_AMDGPU_SMED3:
5951 case AMDGPU::G_AMDGPU_UMED3: {
5952 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
5953 unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
5954 if (Tmp2 == 1)
5955 return 1;
5956 unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
5957 if (Tmp1 == 1)
5958 return 1;
5959 unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
5960 if (Tmp0 == 1)
5961 return 1;
5962 return std::min({Tmp0, Tmp1, Tmp2});
5963 }
5964 default:
5965 return 1;
5966 }
5967}
5968
5970 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
5971 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
5972 unsigned Opcode = Op.getOpcode();
5973 switch (Opcode) {
5974 case AMDGPUISD::BFE_I32:
5975 case AMDGPUISD::BFE_U32:
5976 return false;
5977 }
5978 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
5979 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
5980}
5981
5983 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN,
5984 unsigned Depth) const {
5985 unsigned Opcode = Op.getOpcode();
5986 switch (Opcode) {
5987 case AMDGPUISD::FMIN_LEGACY:
5988 case AMDGPUISD::FMAX_LEGACY: {
5989 if (SNaN)
5990 return true;
5991
5992 // TODO: Can check no nans on one of the operands for each one, but which
5993 // one?
5994 return false;
5995 }
5996 case AMDGPUISD::FMUL_LEGACY:
5997 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
5998 if (SNaN)
5999 return true;
6000 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
6001 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6002 }
6003 case AMDGPUISD::FMED3:
6004 case AMDGPUISD::FMIN3:
6005 case AMDGPUISD::FMAX3:
6006 case AMDGPUISD::FMINIMUM3:
6007 case AMDGPUISD::FMAXIMUM3:
6008 case AMDGPUISD::FMAD_FTZ: {
6009 if (SNaN)
6010 return true;
6011 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
6012 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6013 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6014 }
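// Conversions from an unsigned byte produce a non-negative integer value in
// [0, 255], which can never be a NaN.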
6015 case AMDGPUISD::CVT_F32_UBYTE0:
6016 case AMDGPUISD::CVT_F32_UBYTE1:
6017 case AMDGPUISD::CVT_F32_UBYTE2:
6018 case AMDGPUISD::CVT_F32_UBYTE3:
6019 return true;
6020
6021 case AMDGPUISD::RCP:
6022 case AMDGPUISD::RSQ:
6023 case AMDGPUISD::RCP_LEGACY:
6024 case AMDGPUISD::RSQ_CLAMP: {
6025 if (SNaN)
6026 return true;
6027
6028 // TODO: Need an is-known-positive check.
6029 return false;
6030 }
6031 case ISD::FLDEXP:
6032 case AMDGPUISD::FRACT: {
6033 if (SNaN)
6034 return true;
6035 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
6036 }
6037 case AMDGPUISD::DIV_SCALE:
6038 case AMDGPUISD::DIV_FMAS:
6039 case AMDGPUISD::DIV_FIXUP:
6040 // TODO: Refine on operands.
6041 return SNaN;
6042 case AMDGPUISD::SIN_HW:
6043 case AMDGPUISD::COS_HW: {
6044 // TODO: Need check for infinity
6045 return SNaN;
6046 }
6047 case ISD::INTRINSIC_WO_CHAIN: {
6048 unsigned IntrinsicID = Op.getConstantOperandVal(0);
6049 // TODO: Handle more intrinsics
6050 switch (IntrinsicID) {
6051 case Intrinsic::amdgcn_cubeid:
6052 case Intrinsic::amdgcn_cvt_off_f32_i4:
6053 return true;
6054
6055 case Intrinsic::amdgcn_frexp_mant: {
6056 if (SNaN)
6057 return true;
6058 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6059 }
6060 case Intrinsic::amdgcn_cvt_pkrtz: {
6061 if (SNaN)
6062 return true;
6063 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6064 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6065 }
6066 case Intrinsic::amdgcn_rcp:
6067 case Intrinsic::amdgcn_rsq:
6068 case Intrinsic::amdgcn_rcp_legacy:
6069 case Intrinsic::amdgcn_rsq_legacy:
6070 case Intrinsic::amdgcn_rsq_clamp:
6071 case Intrinsic::amdgcn_tanh: {
6072 if (SNaN)
6073 return true;
6074
6075 // TODO: Need an is-known-positive check.
6076 return false;
6077 }
6078 case Intrinsic::amdgcn_trig_preop:
6079 case Intrinsic::amdgcn_fdot2:
6080 // TODO: Refine on operand
6081 return SNaN;
6082 case Intrinsic::amdgcn_fma_legacy:
6083 if (SNaN)
6084 return true;
6085 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6086 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
6087 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
6088 default:
6089 return false;
6090 }
6091 }
6092 default:
6093 return false;
6094 }
6095}
6096
6097bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
6098 Register N0, Register N1) const {
6099 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
6100}
unsigned const MachineRegisterInfo * MRI
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static LLVM_READONLY bool hasSourceMods(const MachineInstr &MI)
static bool isInv2Pi(const APFloat &APF)
static LLVM_READONLY bool opMustUseVOP3Encoding(const MachineInstr &MI, const MachineRegisterInfo &MRI)
returns true if the operation will definitely need to use a 64-bit encoding, and thus will use a VOP3...
static unsigned inverseMinMax(unsigned Opc)
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static unsigned workitemIntrinsicDim(unsigned ID)
static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, int64_t Offset)
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, uint32_t Width, const SDLoc &DL)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static SDValue getAddOneOp(const SDNode *V)
If V is an add of a constant 1, returns the other operand.
static LLVM_READONLY bool selectSupportsSourceMods(const SDNode *N)
Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the type for ISD::SELECT.
static cl::opt< bool > AMDGPUBypassSlowDiv("amdgpu-bypass-slow-div", cl::desc("Skip 64-bit divide for dynamic 32-bit values"), cl::init(true))
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, SDValue N0, SDValue N1, unsigned Size, bool Signed)
static bool fnegFoldsIntoOp(const SDNode *N)
static bool isI24(SDValue Op, SelectionDAG &DAG)
static bool isCttzOpc(unsigned Opc)
static bool isU24(SDValue Op, SelectionDAG &DAG)
static SDValue peekFPSignOps(SDValue Val)
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, unsigned Op, const SDLoc &SL, SDValue Cond, SDValue N1, SDValue N2)
static SDValue peekFNeg(SDValue Val)
static SDValue simplifyMul24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI)
static bool isCtlzOpc(unsigned Opc)
static LLVM_READNONE bool fnegFoldsIntoOpcode(unsigned Opc)
static bool hasVolatileUser(SDNode *Val)
Interface definition of the TargetLowering class that is common to all AMD GPUs.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_READNONE
Definition Compiler.h:315
#define LLVM_READONLY
Definition Compiler.h:322
Provides analysis for querying information about KnownBits during GISel passes.
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
#define T
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition SHA256.cpp:34
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
Value * RHS
Value * LHS
BinaryOperator * Mul
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
void recordNumNamedBarriers(uint32_t GVAddr, unsigned BarCnt)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
static const AMDGPUSubtarget & get(const MachineFunction &MF)
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
Generate Min/Max node.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override
Return the type that should be used to zero or sign extend a zeroext/signext integer return value.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem, unsigned AS) const override
Return true if it is expected to be cheaper to do a store of vector constant with the given size and ...
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool shouldCombineMemoryType(EVT VT) const
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isTruncateFree(EVT Src, EVT Dest) const override
bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override
SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const
TargetLowering::NegatibleCost getConstantNegateCost(const ConstantFPSDNode *C) const
SDValue LowerFLOGUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, bool IsLog10, SDNodeFlags Flags) const
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSDNodeAlwaysUniform(const SDNode *N) const override
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
SDValue LowerFLOG10(SDValue Op, SelectionDAG &DAG) const
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const
unsigned computeNumSignBitsForTargetInstr(GISelValueTracking &Analysis, Register R, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isConstantCheaperToNegate(SDValue N) const
bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, Register N1) const override
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
bool isConstantCostlierToNegate(SDValue N) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const
SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
Emit approx-funcs appropriate lowering for exp10.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtType, EVT ExtVT, std::optional< unsigned > ByteOffset) const override
Return true if it is profitable to reduce a load to a smaller type.
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSelectSupported(SelectSupportKind) const override
bool isZExtFree(Type *Src, Type *Dest) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const
SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool mayIgnoreSignedZero(SDValue Op) const
SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const final
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
std::pair< SDValue, SDValue > splitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HighVT, SelectionDAG &DAG) const
Split a vector value into two parts of types LoVT and HiVT.
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
SDValue LowerFLOGCommon(SDValue Op, SelectionDAG &DAG) const
SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, SDValue N) const
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const
bool isFAbsFree(EVT VT) const override
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
SDValue loadStackInputValue(SelectionDAG &DAG, EVT VT, const SDLoc &SL, int64_t Offset) const
Similar to CreateLiveInRegister, except value maybe loaded from a stack slot rather than passed in a ...
SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
static SDValue stripBitcast(SDValue Val)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override
Return a reciprocal estimate value for the input operand.
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
std::pair< SDValue, SDValue > getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, SDValue Op, SDNodeFlags Flags) const
If denormal handling is required return the scaled input to FLOG2, and the check for denormal range.
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue lowerCTLZResults(SDValue Op, SelectionDAG &DAG) const
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const
SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const
SDValue performMulLoHiCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results) const
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
std::pair< EVT, EVT > getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const
Split a vector type into two parts.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue combineFMinMaxLegacyImpl(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
unsigned getVectorIdxWidth(const DataLayout &) const override
Returns the type to be used for the index operand vector operations.
static const fltSemantics & IEEEsingle()
Definition APFloat.h:296
static const fltSemantics & IEEEdouble()
Definition APFloat.h:297
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
bool bitwiseIsEqual(const APFloat &RHS) const
Definition APFloat.h:1396
opStatus add(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1163
const fltSemantics & getSemantics() const
Definition APFloat.h:1439
opStatus multiply(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1181
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1140
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1080
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1541
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1392
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1386
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition APInt.h:1151
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition APInt.h:1389
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
CCState - This class holds information needed while lowering arguments and return values.
static CCValAssign getCustomMem(unsigned ValNo, MVT ValVT, int64_t Offset, MVT LocVT, LocInfo HTP)
const APFloat & getValueAPF() const
bool isNegative() const
Return true if the value is negative.
uint64_t getZExtValue() const
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
Diagnostic information for unsupported feature in backend.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
iterator_range< arg_iterator > args()
Definition Function.h:890
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
Type * getValueType() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
static auto integer_valuetypes()
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
int getObjectIndexBegin() const
Return the minimum frame object index.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOInvariant
The memory access always returns the same value (or traps).
Flags getFlags() const
Return the raw flags of the source value,.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isSimple() const
Returns true if the memory operation is neither atomic or volatile.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
ArrayRef< SDUse > ops() const
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
iterator_range< user_iterator > users()
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
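The SDNode/SDValue accessors above form the pattern-matching boilerplate at the top of most DAG combines. A minimal hedged sketch, assuming the usual SelectionDAG headers and "using namespace llvm" as in this file; the matcher itself is hypothetical:

// Hypothetical matcher: return the repeated operand of (add x, x) when the
// add has exactly one use, otherwise return an empty SDValue.
static SDValue matchDoubledAdd(SDNode *N) {
  if (N->getOpcode() != ISD::ADD || N->getNumOperands() != 2)
    return SDValue();

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // Require both operands to be the same value and the result to be consumed
  // only once, so a rewrite could not increase code size.
  if (LHS != RHS || !N->hasOneUse())
    return SDValue();

  return LHS;
}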
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
bool isConstantValueOfAnyType(SDValue N) const
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
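The SelectionDAG factory methods listed above are how lowering code manufactures new nodes. A hedged sketch of the common shape, assuming it runs inside a custom lowering hook with DAG, a debug location DL, and a 64-bit integer SDValue Val already in scope; the split-and-invert rewrite is purely illustrative:

// Split a 64-bit value into i32 halves, invert each half, and rebuild the
// i64 result. Semantically this is ~Val, shown only to exercise the API.
auto [Lo, Hi] = DAG.SplitScalar(Val, DL, MVT::i32, MVT::i32);

SDValue NotLo = DAG.getNOT(DL, Lo, MVT::i32);
SDValue NotHi = DAG.getNOT(DL, Hi, MVT::i32);

// BUILD_PAIR glues the low and high halves back into a single i64 value.
SDValue Result = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, NotLo, NotHi);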
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
StringRef - Represent a constant reference to a string, i.e. a character array and a length...
Definition StringRef.h:55
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset=std::nullopt) const
Return true if it is profitable to reduce a load to a smaller type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp to/from int conversion the backend supports.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
SelectSupportKind
Enum that describes what type of support for selects the target has.
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL) const
Returns the type for the shift amount of a shift opcode.
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setSupportsUnalignedAtomics(bool UnalignedSupported)
Sets whether unaligned atomic operations are supported.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
unsigned GatherAllAliasesMaxDepth
Depth that GatherAllAliases should continue looking for chain dependencies when trying to find a more...
NegatibleCost
Enum that specifies when a float negation is beneficial.
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
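The TargetLoweringBase hooks and knobs above are normally filled in from a target's TargetLowering constructor. A hedged constructor fragment, assuming "this" is a TargetLowering subclass; the particular choices are illustrative and are not the settings this file makes:

// Illustrative legalization setup inside a TargetLowering constructor.
// Extending i1 loads are promoted for every integer result type.
for (MVT VT : MVT::integer_valuetypes()) {
  setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
  setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
  setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
}

// Truncating f64 -> f32 stores are expanded by the legalizer.
setTruncStoreAction(MVT::f64, MVT::f32, Expand);

// Dynamically bypass 64-bit division when the operands fit in 32 bits,
// cap the natively supported atomic width, and prefer register pressure
// aware scheduling.
addBypassSlowDiv(64, 32);
setMaxAtomicSizeInBitsSupported(64);
setSchedulingPreference(Sched::RegPressure);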
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op; at this point, only the DemandedBits bits of the result of Op are ever used downstream.
TargetLowering(const TargetLowering &)=delete
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
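The expansion helpers above (expandUnalignedLoad, scalarizeVectorLoad, and friends) are typically invoked from a target's custom LowerLOAD/LowerSTORE when the hardware cannot perform the access directly. A hedged sketch, assuming it sits inside a member of this TargetLowering with a LoadSDNode *Load and DAG in scope:

// Illustrative custom-lowering tail for a load the hardware cannot do.
unsigned Fast = 0;
if (!allowsMisalignedMemoryAccesses(Load->getMemoryVT(),
                                    Load->getAddressSpace(), Load->getAlign(),
                                    Load->getMemOperand()->getFlags(), &Fast)) {
  // Fall back to two half-width loads stitched back together; the helper
  // returns the merged value and the new chain.
  auto [Value, NewChain] = expandUnalignedLoad(Load, DAG);
  return DAG.getMergeValues({Value, NewChain}, SDLoc(Load));
}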
Primary interface to the complete machine description for the target machine.
TargetOptions Options
TargetSubtargetInfo - Generic base class for all target subtargets.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
LLVM Value Representation.
Definition Value.h:75
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
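The address-space enumerators above frequently gate how a memory access may be lowered. A brief hedged fragment, assuming a MemSDNode *MemNode inside one of this target's lowering routines; the dispatch policy shown is only an illustration:

// Illustrative address-space dispatch for a memory access.
switch (MemNode->getAddressSpace()) {
case AMDGPUAS::CONSTANT_ADDRESS:
case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
  // Constant memory is read-only; scalar/uniform loads may apply.
  break;
case AMDGPUAS::LOCAL_ADDRESS:
case AMDGPUAS::REGION_ADDRESS:
  // LDS/GDS accesses have their own size and alignment restrictions.
  break;
case AMDGPUAS::GLOBAL_ADDRESS:
default:
  break;
}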
bool isIntrinsicAlwaysUniform(unsigned IntrID)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isUniformMMO(const MachineMemOperand *MMO)
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition CallingConv.h:47
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
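The calling-convention identifiers above are what argument and return lowering switch on to pick the right ABI. A hedged helper sketch; the predicate name is hypothetical and is not a function defined in this file:

// Hypothetical predicate: does this calling convention denote a compute
// kernel entry point rather than a graphics shader or an ordinary function?
static bool isComputeKernelCC(llvm::CallingConv::ID CC) {
  switch (CC) {
  case llvm::CallingConv::AMDGPU_KERNEL:
  case llvm::CallingConv::SPIR_KERNEL:
    return true;
  case llvm::CallingConv::AMDGPU_VS:
  case llvm::CallingConv::AMDGPU_PS:
  case llvm::CallingConv::AMDGPU_CS:
  case llvm::CallingConv::AMDGPU_GS:
  case llvm::CallingConv::AMDGPU_HS:
  default:
    return false;
  }
}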
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:807
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:780
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:593
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:771
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:289
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:841
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:868
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:577
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:832
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:779
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:534
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:541
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:784
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:701
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:762
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:642
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:607
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition ISDOpcodes.h:48
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:569
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:838
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:799
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:876
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:724
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:793
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:914
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:736
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:299
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:558
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:947
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:844
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:527
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:549
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
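ISD::isNormalLoad/isNormalStore and the opcode enumeration above are the standard guards at the top of a target DAG combine. A minimal hedged fragment, assuming N is the candidate node inside a combine that returns SDValue; the specific pattern is illustrative:

// Only plain (non-truncating, unindexed) stores whose stored value is a
// single-use sign-extension are of interest in this hypothetical combine.
if (!ISD::isNormalStore(N))
  return SDValue();

auto *Store = cast<StoreSDNode>(N);
SDValue Val = Store->getValue();
if (Val.getOpcode() != ISD::SIGN_EXTEND || !Val.hasOneUse())
  return SDValue();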
initializer< Ty > init(const Ty &Val)
constexpr double ln2
constexpr double ln10
constexpr float log2ef
Definition MathExtras.h:51
constexpr double log2e
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:532
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
MaybeAlign getAlign(const CallInst &I, unsigned Index)
InstructionCost Cost
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:202
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant, stopping at the first 1.
Definition bit.h:236
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
CombineLevel
Definition DAGCombine.h:15
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ BeforeLegalizeTypes
Definition DAGCombine.h:16
@ AfterLegalizeTypes
Definition DAGCombine.h:17
To bit_cast(const From &from) noexcept
Definition bit.h:90
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
DWARFExpression::Operation Op
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
APFloat neg(APFloat X)
Returns the negated value of the argument.
Definition APFloat.h:1551
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
static cl::opt< unsigned > CostThreshold("dfa-cost-threshold", cl::desc("Maximum cost accepted for the transformation"), cl::Hidden, cl::init(50))
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:180
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
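Several of the free functions listed above (Hi_32, Lo_32, PowerOf2Ceil, countr_zero, countl_zero, alignTo, Log2) are small header-only utilities that can be exercised on their own. A self-contained sketch, assuming it is compiled and linked against an LLVM source tree:

#include "llvm/ADT/bit.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

int main() {
  uint64_t V = 0x123456789abcdef0ULL;
  assert(llvm::Hi_32(V) == 0x12345678u);          // upper 32 bits
  assert(llvm::Lo_32(V) == 0x9abcdef0u);          // lower 32 bits

  assert(llvm::PowerOf2Ceil(40) == 64);           // round up to a power of two
  assert(llvm::countr_zero(0x8u) == 3);           // trailing zero bits
  assert(llvm::countl_zero<uint32_t>(1u) == 31);  // leading zero bits

  llvm::Align A(16);
  assert(llvm::alignTo(40, A) == 48);             // next multiple of the alignment
  assert(llvm::Log2(A) == 4);                     // log2 of the alignment value
  return 0;
}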
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
MCRegister getRegister() const
unsigned getStackOffset() const
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
static constexpr DenormalMode getPreserveSign()
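The denormal-mode entries above matter because the f32 denormal behavior of the current function decides which floating-point expansions are legal. A hedged fragment, assuming MF is the current MachineFunction and APFloat.h is included; the query itself is only an illustration:

// Are f32 denormals flushed (with the sign preserved) in this function's
// default floating-point environment?
DenormalMode Mode = MF.getDenormalMode(APFloat::IEEEsingle());
bool FlushF32Denormals = (Mode == DenormalMode::getPreserveSign());
(void)FlushF32Denormals;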
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
EVT getPow2VectorType(LLVMContext &Context) const
Widens the length of the given vector EVT up to the nearest power of 2 and returns that type.
Definition ValueTypes.h:477
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition ValueTypes.h:430
bool isPow2VectorType() const
Returns true if the given vector type has a power-of-2 number of elements.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
EVT getRoundIntegerType(LLVMContext &Context) const
Rounds the bit-width of the given integer EVT up to the nearest power of two (and at least to eight),...
Definition ValueTypes.h:419
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsGE(EVT VT) const
Return true if this has no fewer bits than VT.
Definition ValueTypes.h:292
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isExtended() const
Test if the given EVT is extended (as opposed to being simple).
Definition ValueTypes.h:142
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition ValueTypes.h:308
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
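The EVT helpers above are the main vocabulary for type-legality reasoning during lowering. The context-only queries can be tried in isolation; a self-contained sketch, assuming it is compiled and linked against an LLVM build, with the expected values following from the documented semantics:

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

int main() {
  llvm::LLVMContext Ctx;

  // An odd-width integer rounds up to the next power-of-two width (>= 8).
  llvm::EVT I24 = llvm::EVT::getIntegerVT(Ctx, 24);
  assert(I24.getRoundIntegerType(Ctx) == llvm::MVT::i32);
  assert(I24.getStoreSize().getFixedValue() == 3);       // whole bytes
  assert(I24.getSizeInBits().getFixedValue() == 24);

  // Vector types expose their element type and element count, and can be
  // widened to the nearest power-of-two element count.
  llvm::EVT V3F32 = llvm::EVT::getVectorVT(Ctx, llvm::MVT::f32, 3);
  assert(V3F32.isVector() && V3F32.getVectorNumElements() == 3);
  assert(V3F32.getScalarType() == llvm::MVT::f32);
  assert(!V3F32.isPow2VectorType());
  assert(V3F32.getPow2VectorType(Ctx).getVectorNumElements() == 4);
  return 0;
}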
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition KnownBits.h:108
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:242
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:161
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:296
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:248
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:145
APInt getMinValue() const
Return the minimal unsigned value possible given these KnownBits.
Definition KnownBits.h:129
bool isStrictlyPositive() const
Returns true if this value is known to be positive.
Definition KnownBits.h:114
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:105
unsigned countMaxSignificantBits() const
Returns the maximum number of bits needed to represent all possible signed values with these known bi...
Definition KnownBits.h:269
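The KnownBits queries above are usually consumed right after a computeKnownBits call, when a combine decides whether a narrower operation suffices. A hedged fragment, assuming Op is an i32 SDValue inside a combine with DAG in scope; the thresholds are illustrative:

// Use known-bits facts to justify a narrowing rewrite.
KnownBits Known = DAG.computeKnownBits(Op);

// If at most 16 bits can ever be set, the value fits in an unsigned i16.
bool FitsInLow16 = Known.countMaxActiveBits() <= 16;

// A known-zero sign bit means the value is non-negative.
bool NonNegative = Known.isNonNegative();

// Track the same facts for the truncated value when reasoning about i16 ops.
KnownBits Known16 = Known.trunc(16);
(void)FitsInLow16; (void)NonNegative; (void)Known16;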
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
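MachinePointerInfo is what keeps alias and dereferenceability bookkeeping precise when an access is split or relocated. A hedged fragment, assuming a lowering routine with DAG, DL, Chain, a stack base pointer StackPtr, and two i32 values Lo and Hi in scope; the layout is purely illustrative:

// Spill two 32-bit halves to consecutive stack locations, keeping the
// pointer info (and thus alias information) accurate for each half.
MachineFunction &MF = DAG.getMachineFunction();
MachinePointerInfo BaseInfo = MachinePointerInfo::getStack(MF, /*Offset=*/0);

SDValue StoreLo =
    DAG.getStore(Chain, DL, Lo, StackPtr, BaseInfo, Align(4));
SDValue HiPtr =
    DAG.getObjectPtrOffset(DL, StackPtr, TypeSize::getFixed(4));
SDValue StoreHi =
    DAG.getStore(Chain, DL, Hi, HiPtr, BaseInfo.getWithOffset(4), Align(4));

// Join the two stores back into a single chain dependency.
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, StoreLo, StoreHi);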
These are IR-level optimization flags that may be propagated to SDNodes.
void setAllowContract(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...