LLVM 22.0.0git
AMDGPUISelLowering.cpp
1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
30
31using namespace llvm;
32
33#include "AMDGPUGenCallingConv.inc"
34
36 "amdgpu-bypass-slow-div",
37 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
38 cl::init(true));
39
40// Find a larger type to do a load / store of a vector with.
42 unsigned StoreSize = VT.getStoreSizeInBits();
43 if (StoreSize <= 32)
44 return EVT::getIntegerVT(Ctx, StoreSize);
45
46 if (StoreSize % 32 == 0)
47 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
48
49 return VT;
50}
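// Illustrative examples (added for clarity): a v2i8 store (16 bits) maps to
// i16, a v4i16 store (64 bits) maps to v2i32, a v3i32 store (96 bits) maps to
// v3i32 (itself), and a v3i16 store (48 bits) is returned unchanged because
// 48 is not a multiple of 32.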
51
55
 57  // In order for this to be a signed 24-bit value, bit 23 must
58 // be a sign bit.
59 return DAG.ComputeMaxSignificantBits(Op);
60}
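// For example, a value produced by sign_extend_inreg from i24 has at least 9
// known sign bits, so this reports at most 24 significant bits and the value
// can be treated as a signed 24-bit operand (e.g. for mul24 formation).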
61
63 const TargetSubtargetInfo &STI,
64 const AMDGPUSubtarget &AMDGPUSTI)
65 : TargetLowering(TM, STI), Subtarget(&AMDGPUSTI) {
66 // Always lower memset, memcpy, and memmove intrinsics to load/store
 67  // instructions, rather than generating calls to memset, memcpy, or memmove.
71
72 // Enable ganging up loads and stores in the memcpy DAG lowering.
74
75 // Lower floating point store/load to integer store/load to reduce the number
76 // of patterns in tablegen.
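  // In effect, each Promote/AddPromotedToType pair below rewrites the FP load
  // or store as the same-sized integer operation with a bitcast of the value,
  // so only the integer load/store patterns are needed in tablegen.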
77 setOperationAction(ISD::LOAD, MVT::f32, Promote);
78 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
79
80 setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
81 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
82
83 setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
84 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
85
86 setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
87 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
88
89 setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
90 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
91
92 setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
93 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
94
95 setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
96 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
97
98 setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
99 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
100
101 setOperationAction(ISD::LOAD, MVT::v9f32, Promote);
102 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
103
104 setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
105 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
106
107 setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
108 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
109
110 setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
111 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
112
113 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
114 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
115
116 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
117 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
118
119 setOperationAction(ISD::LOAD, MVT::i64, Promote);
120 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
121
122 setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
123 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
124
125 setOperationAction(ISD::LOAD, MVT::f64, Promote);
126 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
127
128 setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
129 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
130
131 setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
132 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
133
134 setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
135 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
136
137 setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
138 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
139
140 setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
141 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
142
143 setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
144 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
145
146 setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
147 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
148
149 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
150 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
151
152 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
153 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
154
155 setOperationAction(ISD::LOAD, MVT::i128, Promote);
156 AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
157
 158  // TODO: Would be better to consume these as directly legal.
159 setOperationAction(ISD::ATOMIC_LOAD, MVT::f32, Promote);
160 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
161
162 setOperationAction(ISD::ATOMIC_LOAD, MVT::f64, Promote);
163 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
164
165 setOperationAction(ISD::ATOMIC_LOAD, MVT::f16, Promote);
166 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
167
168 setOperationAction(ISD::ATOMIC_LOAD, MVT::bf16, Promote);
169 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);
170
171 setOperationAction(ISD::ATOMIC_STORE, MVT::f32, Promote);
172 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);
173
174 setOperationAction(ISD::ATOMIC_STORE, MVT::f64, Promote);
175 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);
176
177 setOperationAction(ISD::ATOMIC_STORE, MVT::f16, Promote);
178 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);
179
180 setOperationAction(ISD::ATOMIC_STORE, MVT::bf16, Promote);
181 AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);
182
183 // There are no 64-bit extloads. These should be done as a 32-bit extload and
184 // an extension to 64-bit.
185 for (MVT VT : MVT::integer_valuetypes())
187 Expand);
188
189 for (MVT VT : MVT::integer_valuetypes()) {
190 if (VT == MVT::i64)
191 continue;
192
193 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
194 setLoadExtAction(Op, VT, MVT::i1, Promote);
195 setLoadExtAction(Op, VT, MVT::i8, Legal);
196 setLoadExtAction(Op, VT, MVT::i16, Legal);
197 setLoadExtAction(Op, VT, MVT::i32, Expand);
198 }
199 }
200
202 for (auto MemVT :
203 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
205 Expand);
206
207 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
208 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
209 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
210 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
211 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
212 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
213 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
214 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
215 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
216 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
217 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
218 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
219 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
220 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);
221
222 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
223 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
224 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
225 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
226 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
227 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
228
229 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
230 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
231 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
232 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
233 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
234 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
235 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
236 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
237 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
238 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
239 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
240 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);
241
242 setOperationAction(ISD::STORE, MVT::f32, Promote);
243 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
244
245 setOperationAction(ISD::STORE, MVT::v2f32, Promote);
246 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
247
248 setOperationAction(ISD::STORE, MVT::v3f32, Promote);
249 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
250
251 setOperationAction(ISD::STORE, MVT::v4f32, Promote);
252 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
253
254 setOperationAction(ISD::STORE, MVT::v5f32, Promote);
255 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
256
257 setOperationAction(ISD::STORE, MVT::v6f32, Promote);
258 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
259
260 setOperationAction(ISD::STORE, MVT::v7f32, Promote);
261 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
262
263 setOperationAction(ISD::STORE, MVT::v8f32, Promote);
264 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
265
266 setOperationAction(ISD::STORE, MVT::v9f32, Promote);
267 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
268
269 setOperationAction(ISD::STORE, MVT::v10f32, Promote);
270 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
271
272 setOperationAction(ISD::STORE, MVT::v11f32, Promote);
273 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
274
275 setOperationAction(ISD::STORE, MVT::v12f32, Promote);
276 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
277
278 setOperationAction(ISD::STORE, MVT::v16f32, Promote);
279 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
280
281 setOperationAction(ISD::STORE, MVT::v32f32, Promote);
282 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
283
284 setOperationAction(ISD::STORE, MVT::i64, Promote);
285 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
286
287 setOperationAction(ISD::STORE, MVT::v2i64, Promote);
288 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
289
290 setOperationAction(ISD::STORE, MVT::f64, Promote);
291 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
292
293 setOperationAction(ISD::STORE, MVT::v2f64, Promote);
294 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
295
296 setOperationAction(ISD::STORE, MVT::v3i64, Promote);
297 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
298
299 setOperationAction(ISD::STORE, MVT::v3f64, Promote);
300 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
301
302 setOperationAction(ISD::STORE, MVT::v4i64, Promote);
303 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
304
305 setOperationAction(ISD::STORE, MVT::v4f64, Promote);
306 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
307
308 setOperationAction(ISD::STORE, MVT::v8i64, Promote);
309 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
310
311 setOperationAction(ISD::STORE, MVT::v8f64, Promote);
312 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
313
314 setOperationAction(ISD::STORE, MVT::v16i64, Promote);
315 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
316
317 setOperationAction(ISD::STORE, MVT::v16f64, Promote);
318 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
319
320 setOperationAction(ISD::STORE, MVT::i128, Promote);
321 AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);
322
323 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
324 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
325 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
326 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
327
328 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
329 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
330 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
331 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
332
333 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
334 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
335 setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
336 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
337 setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
338 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
339 setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
340 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
341 setTruncStoreAction(MVT::v6f32, MVT::v6f16, Expand);
342 setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
343 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
344 setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
345 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
346 setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
347 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
348
349 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
350 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
351 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
352
353 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
354 setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
355 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
356
357 setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);
358
359 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
360 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
361 setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
362 setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
363 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
364 setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
365 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
366
367 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
368 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
369 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
370 setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
371 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
372
373 setTruncStoreAction(MVT::v5i32, MVT::v5i1, Expand);
374 setTruncStoreAction(MVT::v5i32, MVT::v5i8, Expand);
375 setTruncStoreAction(MVT::v5i32, MVT::v5i16, Expand);
376
377 setTruncStoreAction(MVT::v6i32, MVT::v6i1, Expand);
378 setTruncStoreAction(MVT::v6i32, MVT::v6i8, Expand);
379 setTruncStoreAction(MVT::v6i32, MVT::v6i16, Expand);
380
381 setTruncStoreAction(MVT::v7i32, MVT::v7i1, Expand);
382 setTruncStoreAction(MVT::v7i32, MVT::v7i8, Expand);
383 setTruncStoreAction(MVT::v7i32, MVT::v7i16, Expand);
384
385 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
386 setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
387 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
388
389 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
390 setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
391 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
392 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
393 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
394 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
395 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
396
397 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
398 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
399
400 setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand);
401
 402  // For R600, this is totally unsupported; just custom lower to produce an
403 // error.
404 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
405
406 // Library functions. These default to Expand, but we have instructions
407 // for them.
408 setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
409 ISD::FROUNDEVEN, ISD::FTRUNC},
410 {MVT::f16, MVT::f32}, Legal);
411 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, MVT::f32, Legal);
412
413 setOperationAction(ISD::FLOG2, MVT::f32, Custom);
414 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
415 setOperationAction({ISD::LROUND, ISD::LLROUND},
416 {MVT::f16, MVT::f32, MVT::f64}, Expand);
417
419 {ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, MVT::f32,
420 Custom);
421
422 setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
423
424 setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
425
426 setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
427 Expand);
428
429 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Expand);
430
431 if (Subtarget->has16BitInsts()) {
432 setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
433 setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Legal);
434 } else {
435 setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
436 setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);
437 }
438
439 setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, MVT::f16,
440 Custom);
441
442 setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
443 if (Subtarget->has16BitInsts()) {
445 }
446
 447  // FIXME: These IS_FPCLASS vector fp types are marked custom so they reach the
 448  // scalarization code. This can be removed when IS_FPCLASS expansion isn't
 449  // called by default unless marked custom/legal.
451 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
452 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
453 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
454 MVT::v16f64},
455 Custom);
456
457 if (isTypeLegal(MVT::f16))
459 {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16},
460 Custom);
461
462 // Expand to fneg + fadd.
464
466 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
467 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
468 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
469 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
470 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
471 Custom);
472
475 {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
476 MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
477 MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
478 MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
479 MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
480 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
481 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
482 Custom);
483
484 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
485 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
486
487 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
488 for (MVT VT : ScalarIntVTs) {
489 // These should use [SU]DIVREM, so set them to expand
491 Expand);
492
493 // GPU does not have divrem function for signed or unsigned.
495
496 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
498
500
501 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
503 }
504
505 // The hardware supports 32-bit FSHR, but not FSHL.
507
508 setOperationAction({ISD::ROTL, ISD::ROTR}, {MVT::i32, MVT::i64}, Expand);
509
511
515 MVT::i64, Custom);
517
519 Legal);
520
523 MVT::i64, Custom);
524
525 for (auto VT : {MVT::i8, MVT::i16})
527
528 static const MVT::SimpleValueType VectorIntTypes[] = {
529 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
530 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
531
532 for (MVT VT : VectorIntTypes) {
533 // Expand the following operations for the current type by default.
545 ISD::SETCC, ISD::ADDRSPACECAST},
546 VT, Expand);
547 }
548
549 static const MVT::SimpleValueType FloatVectorTypes[] = {
550 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
551 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
552
553 for (MVT VT : FloatVectorTypes) {
555 {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM,
556 ISD::FADD, ISD::FCEIL, ISD::FCOS,
557 ISD::FDIV, ISD::FEXP2, ISD::FEXP,
558 ISD::FEXP10, ISD::FLOG2, ISD::FREM,
559 ISD::FLOG, ISD::FLOG10, ISD::FPOW,
560 ISD::FFLOOR, ISD::FTRUNC, ISD::FMUL,
561 ISD::FMA, ISD::FRINT, ISD::FNEARBYINT,
562 ISD::FSQRT, ISD::FSIN, ISD::FSUB,
563 ISD::FNEG, ISD::VSELECT, ISD::SELECT_CC,
565 ISD::FCANONICALIZE, ISD::FROUNDEVEN},
566 VT, Expand);
567 }
568
 569  // This causes an unrolled select operation to be used rather than expansion
 570  // with bit operations. This is in general better, but the alternative using
 571  // BFI instructions may be better if the select sources are SGPRs.
573 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
574
576 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
577
579 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
580
582 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
583
585 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
586
588 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
589
591 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
592
594 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
595
597 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
598
600 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
601
603 setJumpIsExpensive(true);
604
607
609
610 // We want to find all load dependencies for long chains of stores to enable
611 // merging into very wide vectors. The problem is with vectors with > 4
612 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
613 // vectors are a legal type, even though we have to split the loads
614 // usually. When we can more precisely specify load legality per address
615 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
616 // smarter so that they can figure out what to do in 2 iterations without all
617 // N > 4 stores on the same chain.
619
620 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
621 // about these during lowering.
622 MaxStoresPerMemcpy = 0xffffffff;
623 MaxStoresPerMemmove = 0xffffffff;
624 MaxStoresPerMemset = 0xffffffff;
625
626 // The expansion for 64-bit division is enormous.
628 addBypassSlowDiv(64, 32);
629
630 setTargetDAGCombine({ISD::BITCAST, ISD::SHL,
636 ISD::STORE, ISD::FADD,
637 ISD::FSUB, ISD::FNEG,
638 ISD::FABS, ISD::AssertZext,
640
644}
645
647 if (getTargetMachine().Options.NoSignedZerosFPMath)
648 return true;
649
650 const auto Flags = Op.getNode()->getFlags();
651 if (Flags.hasNoSignedZeros())
652 return true;
653
654 return false;
655}
656
657//===----------------------------------------------------------------------===//
658// Target Information
659//===----------------------------------------------------------------------===//
660
662static bool fnegFoldsIntoOpcode(unsigned Opc) {
663 switch (Opc) {
664 case ISD::FADD:
665 case ISD::FSUB:
666 case ISD::FMUL:
667 case ISD::FMA:
668 case ISD::FMAD:
669 case ISD::FMINNUM:
670 case ISD::FMAXNUM:
671 case ISD::FMINNUM_IEEE:
672 case ISD::FMAXNUM_IEEE:
673 case ISD::FMINIMUM:
674 case ISD::FMAXIMUM:
675 case ISD::FMINIMUMNUM:
676 case ISD::FMAXIMUMNUM:
677 case ISD::SELECT:
678 case ISD::FSIN:
679 case ISD::FTRUNC:
680 case ISD::FRINT:
681 case ISD::FNEARBYINT:
682 case ISD::FROUNDEVEN:
684 case AMDGPUISD::RCP:
685 case AMDGPUISD::RCP_LEGACY:
686 case AMDGPUISD::RCP_IFLAG:
687 case AMDGPUISD::SIN_HW:
688 case AMDGPUISD::FMUL_LEGACY:
689 case AMDGPUISD::FMIN_LEGACY:
690 case AMDGPUISD::FMAX_LEGACY:
691 case AMDGPUISD::FMED3:
692 // TODO: handle llvm.amdgcn.fma.legacy
693 return true;
694 case ISD::BITCAST:
695 llvm_unreachable("bitcast is special cased");
696 default:
697 return false;
698 }
699}
700
701static bool fnegFoldsIntoOp(const SDNode *N) {
702 unsigned Opc = N->getOpcode();
703 if (Opc == ISD::BITCAST) {
704 // TODO: Is there a benefit to checking the conditions performFNegCombine
705 // does? We don't for the other cases.
706 SDValue BCSrc = N->getOperand(0);
707 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
708 return BCSrc.getNumOperands() == 2 &&
709 BCSrc.getOperand(1).getValueSizeInBits() == 32;
710 }
711
712 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
713 }
714
715 return fnegFoldsIntoOpcode(Opc);
716}
717
 718/// \returns true if the operation will definitely need to use a 64-bit
719/// encoding, and thus will use a VOP3 encoding regardless of the source
720/// modifiers.
722static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
723 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
724 VT == MVT::f64;
725}
726
727/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the
 728/// type when lowering ISD::SELECT.
730static bool selectSupportsSourceMods(const SDNode *N) {
731 // TODO: Only applies if select will be vector
732 return N->getValueType(0) == MVT::f32;
733}
734
735// Most FP instructions support source modifiers, but this could be refined
736// slightly.
738static bool hasSourceMods(const SDNode *N) {
739 if (isa<MemSDNode>(N))
740 return false;
741
742 switch (N->getOpcode()) {
743 case ISD::CopyToReg:
744 case ISD::FDIV:
745 case ISD::FREM:
746 case ISD::INLINEASM:
747 case ISD::INLINEASM_BR:
748 case AMDGPUISD::DIV_SCALE:
750
751 // TODO: Should really be looking at the users of the bitcast. These are
752 // problematic because bitcasts are used to legalize all stores to integer
753 // types.
754 case ISD::BITCAST:
755 return false;
757 switch (N->getConstantOperandVal(0)) {
758 case Intrinsic::amdgcn_interp_p1:
759 case Intrinsic::amdgcn_interp_p2:
760 case Intrinsic::amdgcn_interp_mov:
761 case Intrinsic::amdgcn_interp_p1_f16:
762 case Intrinsic::amdgcn_interp_p2_f16:
763 return false;
764 default:
765 return true;
766 }
767 }
768 case ISD::SELECT:
770 default:
771 return true;
772 }
773}
774
776 unsigned CostThreshold) {
777 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
778 // it is truly free to use a source modifier in all cases. If there are
 779  // multiple users, and each one will necessitate using VOP3, there will be
780 // a code size increase. Try to avoid increasing code size unless we know it
781 // will save on the instruction count.
782 unsigned NumMayIncreaseSize = 0;
783 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
784
785 assert(!N->use_empty());
786
787 // XXX - Should this limit number of uses to check?
788 for (const SDNode *U : N->users()) {
789 if (!hasSourceMods(U))
790 return false;
791
792 if (!opMustUseVOP3Encoding(U, VT)) {
793 if (++NumMayIncreaseSize > CostThreshold)
794 return false;
795 }
796 }
797
798 return true;
799}
800
802 ISD::NodeType ExtendKind) const {
803 assert(!VT.isVector() && "only scalar expected");
804
805 // Round to the next multiple of 32-bits.
806 unsigned Size = VT.getSizeInBits();
807 if (Size <= 32)
808 return MVT::i32;
809 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
810}
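// For instance, an i1 through i32 extended return widens to i32, an i40 or
// i48 return widens to i64, and i96 is already a multiple of 32 bits so it
// stays i96.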
811
813 return 32;
814}
815
817 return true;
818}
819
820// The backend supports 32 and 64 bit floating point immediates.
821// FIXME: Why are we reporting vectors of FP immediates as legal?
823 bool ForCodeSize) const {
824 EVT ScalarVT = VT.getScalarType();
825 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
826 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
827}
828
829// We don't want to shrink f64 / f32 constants.
831 EVT ScalarVT = VT.getScalarType();
832 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
833}
834
836 SDNode *N, ISD::LoadExtType ExtTy, EVT NewVT,
837 std::optional<unsigned> ByteOffset) const {
838 // TODO: This may be worth removing. Check regression tests for diffs.
839 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT, ByteOffset))
840 return false;
841
842 unsigned NewSize = NewVT.getStoreSizeInBits();
843
844 // If we are reducing to a 32-bit load or a smaller multi-dword load,
845 // this is always better.
846 if (NewSize >= 32)
847 return true;
848
849 EVT OldVT = N->getValueType(0);
850 unsigned OldSize = OldVT.getStoreSizeInBits();
851
853 unsigned AS = MN->getAddressSpace();
854 // Do not shrink an aligned scalar load to sub-dword.
855 // Scalar engine cannot do sub-dword loads.
856 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
857 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
861 MN->isInvariant())) &&
863 return false;
864
865 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
866 // extloads, so doing one requires using a buffer_load. In cases where we
867 // still couldn't use a scalar load, using the wider load shouldn't really
868 // hurt anything.
869
870 // If the old size already had to be an extload, there's no harm in continuing
871 // to reduce the width.
872 return (OldSize < 32);
873}
874
876 const SelectionDAG &DAG,
877 const MachineMemOperand &MMO) const {
878
879 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
880
881 if (LoadTy.getScalarType() == MVT::i32)
882 return false;
883
884 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
885 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
886
887 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
888 return false;
889
890 unsigned Fast = 0;
892 CastTy, MMO, &Fast) &&
893 Fast;
894}
895
896// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
897// profitable with the expansion for 64-bit since it's generally good to
898// speculate things.
900 return true;
901}
902
904 return true;
905}
906
908 switch (N->getOpcode()) {
909 case ISD::EntryToken:
910 case ISD::TokenFactor:
911 return true;
913 unsigned IntrID = N->getConstantOperandVal(0);
915 }
917 unsigned IntrID = N->getConstantOperandVal(1);
919 }
920 case ISD::LOAD:
921 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
923 return true;
924 return false;
925 case AMDGPUISD::SETCC: // ballot-style instruction
926 return true;
927 }
928 return false;
929}
930
932 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
933 NegatibleCost &Cost, unsigned Depth) const {
934
935 switch (Op.getOpcode()) {
936 case ISD::FMA:
937 case ISD::FMAD: {
938 // Negating a fma is not free if it has users without source mods.
939 if (!allUsesHaveSourceMods(Op.getNode()))
940 return SDValue();
941 break;
942 }
943 case AMDGPUISD::RCP: {
944 SDValue Src = Op.getOperand(0);
945 EVT VT = Op.getValueType();
946 SDLoc SL(Op);
947
948 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
949 ForCodeSize, Cost, Depth + 1);
950 if (NegSrc)
951 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
952 return SDValue();
953 }
954 default:
955 break;
956 }
957
958 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
959 ForCodeSize, Cost, Depth);
960}
961
962//===---------------------------------------------------------------------===//
963// Target Properties
964//===---------------------------------------------------------------------===//
965
968
969 // Packed operations do not have a fabs modifier.
970 return VT == MVT::f32 || VT == MVT::f64 ||
971 (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16));
972}
973
976 // Report this based on the end legalized type.
977 VT = VT.getScalarType();
978 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
979}
980
982 unsigned NumElem,
983 unsigned AS) const {
984 return true;
985}
986
988 // There are few operations which truly have vector input operands. Any vector
989 // operation is going to involve operations on each component, and a
990 // build_vector will be a copy per element, so it always makes sense to use a
991 // build_vector input in place of the extracted element to avoid a copy into a
992 // super register.
993 //
994 // We should probably only do this if all users are extracts only, but this
995 // should be the common case.
996 return true;
997}
998
1000 // Truncate is just accessing a subregister.
1001
1002 unsigned SrcSize = Source.getSizeInBits();
1003 unsigned DestSize = Dest.getSizeInBits();
1004
 1005  return DestSize < SrcSize && DestSize % 32 == 0;
1006}
1007
1009 // Truncate is just accessing a subregister.
1010
1011 unsigned SrcSize = Source->getScalarSizeInBits();
1012 unsigned DestSize = Dest->getScalarSizeInBits();
1013
 1014  if (DestSize == 16 && Subtarget->has16BitInsts())
1015 return SrcSize >= 32;
1016
1017 return DestSize < SrcSize && DestSize % 32 == 0;
1018}
1019
1021 unsigned SrcSize = Src->getScalarSizeInBits();
1022 unsigned DestSize = Dest->getScalarSizeInBits();
1023
1024 if (SrcSize == 16 && Subtarget->has16BitInsts())
1025 return DestSize >= 32;
1026
1027 return SrcSize == 32 && DestSize == 64;
1028}
1029
1031 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
1032 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
 1033  // this will enable reducing 64-bit operations to 32-bit, which is always
1034 // good.
1035
1036 if (Src == MVT::i16)
 1037  return Dest == MVT::i32 || Dest == MVT::i64;
1038
1039 return Src == MVT::i32 && Dest == MVT::i64;
1040}
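// For example, zero-extending i32 to i64 only needs a copy of the low half
// plus a zero for the high half, and i16 values already occupy a 32-bit
// register, so both cases are reported as free.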
1041
1043 EVT DestVT) const {
1044 switch (N->getOpcode()) {
1045 case ISD::ADD:
1046 case ISD::SUB:
1047 case ISD::SHL:
1048 case ISD::SRL:
1049 case ISD::SRA:
1050 case ISD::AND:
1051 case ISD::OR:
1052 case ISD::XOR:
1053 case ISD::MUL:
1054 case ISD::SETCC:
1055 case ISD::SELECT:
1056 case ISD::SMIN:
1057 case ISD::SMAX:
1058 case ISD::UMIN:
1059 case ISD::UMAX:
1060 if (Subtarget->has16BitInsts() &&
1061 (!DestVT.isVector() || !Subtarget->hasVOP3PInsts())) {
1062 // Don't narrow back down to i16 if promoted to i32 already.
1063 if (!N->isDivergent() && DestVT.isInteger() &&
1064 DestVT.getScalarSizeInBits() > 1 &&
1065 DestVT.getScalarSizeInBits() <= 16 &&
1066 SrcVT.getScalarSizeInBits() > 16) {
1067 return false;
1068 }
1069 }
1070 return true;
1071 default:
1072 break;
1073 }
1074
1075 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
1076 // limited number of native 64-bit operations. Shrinking an operation to fit
1077 // in a single 32-bit register should always be helpful. As currently used,
1078 // this is much less general than the name suggests, and is only used in
1079 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
1080 // not profitable, and may actually be harmful.
1081 if (isa<LoadSDNode>(N))
1082 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
1083
1084 return true;
1085}
1086
1088 const SDNode* N, CombineLevel Level) const {
1089 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
1090 N->getOpcode() == ISD::SRL) &&
1091 "Expected shift op");
1092
1093 SDValue ShiftLHS = N->getOperand(0);
1094 if (!ShiftLHS->hasOneUse())
1095 return false;
1096
1097 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
1098 !ShiftLHS.getOperand(0)->hasOneUse())
1099 return false;
1100
1101 // Always commute pre-type legalization and right shifts.
1102 // We're looking for shl(or(x,y),z) patterns.
1104 N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
1105 return true;
1106
 1107  // If the only user is an i32 right-shift, then don't destroy a BFE pattern.
1108 if (N->getValueType(0) == MVT::i32 && N->hasOneUse() &&
1109 (N->user_begin()->getOpcode() == ISD::SRA ||
1110 N->user_begin()->getOpcode() == ISD::SRL))
1111 return false;
1112
1113 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
1114 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
1115 if (LHS.getOpcode() != ISD::SHL)
1116 return false;
1117 auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
1118 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
1119 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
1120 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
1121 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
1122 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
1123 };
1124 SDValue LHS = N->getOperand(0).getOperand(0);
1125 SDValue RHS = N->getOperand(0).getOperand(1);
1126 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
1127}
1128
1129//===---------------------------------------------------------------------===//
1130// TargetLowering Callbacks
1131//===---------------------------------------------------------------------===//
1132
1134 bool IsVarArg) {
1135 switch (CC) {
1143 return CC_AMDGPU;
1146 return CC_AMDGPU_CS_CHAIN;
1147 case CallingConv::C:
1148 case CallingConv::Fast:
1149 case CallingConv::Cold:
1150 return CC_AMDGPU_Func;
1153 return CC_SI_Gfx;
1156 default:
1157 reportFatalUsageError("unsupported calling convention for call");
1158 }
1159}
1160
1162 bool IsVarArg) {
1163 switch (CC) {
1166 llvm_unreachable("kernels should not be handled here");
1176 return RetCC_SI_Shader;
1179 return RetCC_SI_Gfx;
1180 case CallingConv::C:
1181 case CallingConv::Fast:
1182 case CallingConv::Cold:
1183 return RetCC_AMDGPU_Func;
1184 default:
1185 reportFatalUsageError("unsupported calling convention");
1186 }
1187}
1188
1189/// The SelectionDAGBuilder will automatically promote function arguments
1190/// with illegal types. However, this does not work for the AMDGPU targets
1191/// since the function arguments are stored in memory as these illegal types.
 1192/// In order to handle this properly we need to get the original type sizes
 1193/// from the LLVM IR Function and fix up the ISD::InputArg values before
 1194/// passing them to AnalyzeFormalArguments().
1195
1196/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1197/// input values across multiple registers. Each item in the Ins array
1198/// represents a single value that will be stored in registers. Ins[x].VT is
1199/// the value type of the value that will be stored in the register, so
1200/// whatever SDNode we lower the argument to needs to be this type.
1201///
1202/// In order to correctly lower the arguments we need to know the size of each
1203/// argument. Since Ins[x].VT gives us the size of the register that will
1204/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1205/// for the original function argument so that we can deduce the correct memory
1206/// type to use for Ins[x]. In most cases the correct memory type will be
1207/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1208/// we have a kernel argument of type v8i8, this argument will be split into
1209/// 8 parts and each part will be represented by its own item in the Ins array.
1210/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1211/// the argument before it was split. From this, we deduce that the memory type
1212/// for each individual part is i8. We pass the memory type as LocVT to the
1213/// calling convention analysis function and the register type (Ins[x].VT) as
1214/// the ValVT.
1216 CCState &State,
1217 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1218 const MachineFunction &MF = State.getMachineFunction();
1219 const Function &Fn = MF.getFunction();
1220 LLVMContext &Ctx = Fn.getContext();
1221 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
1222 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
1224
1225 Align MaxAlign = Align(1);
1226 uint64_t ExplicitArgOffset = 0;
1227 const DataLayout &DL = Fn.getDataLayout();
1228
1229 unsigned InIndex = 0;
1230
1231 for (const Argument &Arg : Fn.args()) {
1232 const bool IsByRef = Arg.hasByRefAttr();
1233 Type *BaseArgTy = Arg.getType();
1234 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1235 Align Alignment = DL.getValueOrABITypeAlignment(
1236 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1237 MaxAlign = std::max(Alignment, MaxAlign);
1238 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1239
1240 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1241 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1242
1243 // We're basically throwing away everything passed into us and starting over
1244 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1245 // to us as computed in Ins.
1246 //
1247 // We also need to figure out what type legalization is trying to do to get
1248 // the correct memory offsets.
1249
1250 SmallVector<EVT, 16> ValueVTs;
1252 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, /*MemVTs=*/nullptr,
1253 &Offsets, ArgOffset);
1254
1255 for (unsigned Value = 0, NumValues = ValueVTs.size();
1256 Value != NumValues; ++Value) {
1257 uint64_t BasePartOffset = Offsets[Value];
1258
1259 EVT ArgVT = ValueVTs[Value];
1260 EVT MemVT = ArgVT;
1261 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1262 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1263
1264 if (NumRegs == 1) {
1265 // This argument is not split, so the IR type is the memory type.
1266 if (ArgVT.isExtended()) {
1267 // We have an extended type, like i24, so we should just use the
1268 // register type.
1269 MemVT = RegisterVT;
1270 } else {
1271 MemVT = ArgVT;
1272 }
1273 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1274 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1275 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1276 // We have a vector value which has been split into a vector with
1277 // the same scalar type, but fewer elements. This should handle
1278 // all the floating-point vector types.
1279 MemVT = RegisterVT;
1280 } else if (ArgVT.isVector() &&
1281 ArgVT.getVectorNumElements() == NumRegs) {
1282 // This arg has been split so that each element is stored in a separate
1283 // register.
1284 MemVT = ArgVT.getScalarType();
1285 } else if (ArgVT.isExtended()) {
1286 // We have an extended type, like i65.
1287 MemVT = RegisterVT;
1288 } else {
1289 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1290 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1291 if (RegisterVT.isInteger()) {
1292 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1293 } else if (RegisterVT.isVector()) {
1294 assert(!RegisterVT.getScalarType().isFloatingPoint());
1295 unsigned NumElements = RegisterVT.getVectorNumElements();
1296 assert(MemoryBits % NumElements == 0);
1297 // This vector type has been split into another vector type with
 1298  // a different element size.
1299 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1300 MemoryBits / NumElements);
1301 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1302 } else {
1303 llvm_unreachable("cannot deduce memory type.");
1304 }
1305 }
1306
1307 // Convert one element vectors to scalar.
1308 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1309 MemVT = MemVT.getScalarType();
1310
1311 // Round up vec3/vec5 argument.
1312 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1313 MemVT = MemVT.getPow2VectorType(State.getContext());
1314 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1315 MemVT = MemVT.getRoundIntegerType(State.getContext());
1316 }
1317
1318 unsigned PartOffset = 0;
1319 for (unsigned i = 0; i != NumRegs; ++i) {
1320 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1321 BasePartOffset + PartOffset,
1322 MemVT.getSimpleVT(),
1324 PartOffset += MemVT.getStoreSize();
1325 }
1326 }
1327 }
1328}
1329
1331 SDValue Chain, CallingConv::ID CallConv,
1332 bool isVarArg,
1334 const SmallVectorImpl<SDValue> &OutVals,
1335 const SDLoc &DL, SelectionDAG &DAG) const {
1336 // FIXME: Fails for r600 tests
1337 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1338 // "wave terminate should not have return values");
1339 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1340}
1341
1342//===---------------------------------------------------------------------===//
1343// Target specific lowering
1344//===---------------------------------------------------------------------===//
1345
1346/// Selects the correct CCAssignFn for a given CallingConvention value.
1351
1356
1358 SelectionDAG &DAG,
1359 MachineFrameInfo &MFI,
1360 int ClobberedFI) const {
1361 SmallVector<SDValue, 8> ArgChains;
1362 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1363 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1364
1365 // Include the original chain at the beginning of the list. When this is
 1366  // used by target LowerCall hooks, this helps legalization find the
1367 // CALLSEQ_BEGIN node.
1368 ArgChains.push_back(Chain);
1369
 1370  // Add a chain value for each stack argument that overlaps the clobbered frame index.
1371 for (SDNode *U : DAG.getEntryNode().getNode()->users()) {
1372 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1373 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1374 if (FI->getIndex() < 0) {
1375 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1376 int64_t InLastByte = InFirstByte;
1377 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1378
1379 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1380 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1381 ArgChains.push_back(SDValue(L, 1));
1382 }
1383 }
1384 }
1385 }
1386
1387 // Build a tokenfactor for all the chains.
1388 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1389}
1390
1393 StringRef Reason) const {
1394 SDValue Callee = CLI.Callee;
1395 SelectionDAG &DAG = CLI.DAG;
1396
1397 const Function &Fn = DAG.getMachineFunction().getFunction();
1398
1399 StringRef FuncName("<unknown>");
1400
1402 FuncName = G->getSymbol();
1403 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1404 FuncName = G->getGlobal()->getName();
1405
1406 DAG.getContext()->diagnose(
1407 DiagnosticInfoUnsupported(Fn, Reason + FuncName, CLI.DL.getDebugLoc()));
1408
1409 if (!CLI.IsTailCall) {
1410 for (ISD::InputArg &Arg : CLI.Ins)
1411 InVals.push_back(DAG.getPOISON(Arg.VT));
1412 }
1413
1414 // FIXME: Hack because R600 doesn't handle callseq pseudos yet.
1415 if (getTargetMachine().getTargetTriple().getArch() == Triple::r600)
1416 return CLI.Chain;
1417
1418 SDValue Chain = DAG.getCALLSEQ_START(CLI.Chain, 0, 0, CLI.DL);
1419 return DAG.getCALLSEQ_END(Chain, 0, 0, /*InGlue=*/SDValue(), CLI.DL);
1420}
1421
1423 SmallVectorImpl<SDValue> &InVals) const {
1424 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1425}
1426
1428 SelectionDAG &DAG) const {
1429 const Function &Fn = DAG.getMachineFunction().getFunction();
1430
1432 Fn, "unsupported dynamic alloca", SDLoc(Op).getDebugLoc()));
1433 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1434 return DAG.getMergeValues(Ops, SDLoc());
1435}
1436
1438 SelectionDAG &DAG) const {
1439 switch (Op.getOpcode()) {
1440 default:
1441 Op->print(errs(), &DAG);
1442 llvm_unreachable("Custom lowering code for this "
1443 "instruction is not implemented yet!");
1444 break;
1446 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1448 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1449 case ISD::SDIVREM:
1450 return LowerSDIVREM(Op, DAG);
1451 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1452 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1453 case ISD::FRINT: return LowerFRINT(Op, DAG);
1454 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1455 case ISD::FROUNDEVEN:
1456 return LowerFROUNDEVEN(Op, DAG);
1457 case ISD::FROUND: return LowerFROUND(Op, DAG);
1458 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1459 case ISD::FLOG2:
1460 return LowerFLOG2(Op, DAG);
1461 case ISD::FLOG:
1462 case ISD::FLOG10:
1463 return LowerFLOGCommon(Op, DAG);
1464 case ISD::FEXP:
1465 case ISD::FEXP10:
1466 return lowerFEXP(Op, DAG);
1467 case ISD::FEXP2:
1468 return lowerFEXP2(Op, DAG);
1469 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1470 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1471 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1472 case ISD::FP_TO_SINT:
1473 case ISD::FP_TO_UINT:
1474 return LowerFP_TO_INT(Op, DAG);
1475 case ISD::CTTZ:
1477 case ISD::CTLZ:
1479 return LowerCTLZ_CTTZ(Op, DAG);
1480 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1481 }
1482 return Op;
1483}
1484
1487 SelectionDAG &DAG) const {
1488 switch (N->getOpcode()) {
1490 // Different parts of legalization seem to interpret which type of
1491 // sign_extend_inreg is the one to check for custom lowering. The extended
1492 // from type is what really matters, but some places check for custom
1493 // lowering of the result type. This results in trying to use
1494 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1495 // nothing here and let the illegal result integer be handled normally.
1496 return;
1497 case ISD::FLOG2:
1498 if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
1499 Results.push_back(Lowered);
1500 return;
1501 case ISD::FLOG:
1502 case ISD::FLOG10:
1503 if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
1504 Results.push_back(Lowered);
1505 return;
1506 case ISD::FEXP2:
1507 if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
1508 Results.push_back(Lowered);
1509 return;
1510 case ISD::FEXP:
1511 case ISD::FEXP10:
1512 if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
1513 Results.push_back(Lowered);
1514 return;
1515 case ISD::CTLZ:
1517 if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
1518 Results.push_back(Lowered);
1519 return;
1520 default:
1521 return;
1522 }
1523}
1524
1526 SDValue Op,
1527 SelectionDAG &DAG) const {
1528
1529 const DataLayout &DL = DAG.getDataLayout();
1531 const GlobalValue *GV = G->getGlobal();
1532
1533 if (!MFI->isModuleEntryFunction()) {
1534 auto IsNamedBarrier = AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV));
1535 if (std::optional<uint32_t> Address =
1537 if (IsNamedBarrier) {
1538 unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
1539 MFI->recordNumNamedBarriers(Address.value(), BarCnt);
1540 }
1541 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1542 } else if (IsNamedBarrier) {
1543 llvm_unreachable("named barrier should have an assigned address");
1544 }
1545 }
1546
1547 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1548 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1549 if (!MFI->isModuleEntryFunction() &&
1550 GV->getName() != "llvm.amdgcn.module.lds" &&
1552 SDLoc DL(Op);
1553 const Function &Fn = DAG.getMachineFunction().getFunction();
1555 Fn, "local memory global used by non-kernel function",
1556 DL.getDebugLoc(), DS_Warning));
1557
1558 // We currently don't have a way to correctly allocate LDS objects that
1559 // aren't directly associated with a kernel. We do force inlining of
1560 // functions that use local objects. However, if these dead functions are
1561 // not eliminated, we don't want a compile time error. Just emit a warning
1562 // and a trap, since there should be no callable path here.
1563 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1564 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1565 Trap, DAG.getRoot());
1566 DAG.setRoot(OutputChain);
1567 return DAG.getPOISON(Op.getValueType());
1568 }
1569
1570 // XXX: What does the value of G->getOffset() mean?
1571 assert(G->getOffset() == 0 &&
1572 "Do not know what to do with an non-zero offset");
1573
1574 // TODO: We could emit code to handle the initialization somewhere.
1575 // We ignore the initializer for now and legalize it to allow selection.
 1576  // The initializer will get flagged as an error during assembly emission anyway.
1577 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1578 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1579 }
1580 return SDValue();
1581}
1582
1584 SelectionDAG &DAG) const {
1586 SDLoc SL(Op);
1587
1588 EVT VT = Op.getValueType();
1589 if (VT.getVectorElementType().getSizeInBits() < 32) {
1590 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1591 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1592 unsigned NewNumElt = OpBitSize / 32;
1593 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1595 MVT::i32, NewNumElt);
1596 for (const SDUse &U : Op->ops()) {
1597 SDValue In = U.get();
1598 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
1599 if (NewNumElt > 1)
1600 DAG.ExtractVectorElements(NewIn, Args);
1601 else
1602 Args.push_back(NewIn);
1603 }
1604
1605 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1606 NewNumElt * Op.getNumOperands());
1607 SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1608 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1609 }
1610 }
1611
1612 for (const SDUse &U : Op->ops())
1613 DAG.ExtractVectorElements(U.get(), Args);
1614
1615 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1616}
1617
1619 SelectionDAG &DAG) const {
1620 SDLoc SL(Op);
1622 unsigned Start = Op.getConstantOperandVal(1);
1623 EVT VT = Op.getValueType();
1624 EVT SrcVT = Op.getOperand(0).getValueType();
1625
1626 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1627 unsigned NumElt = VT.getVectorNumElements();
1628 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1629 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1630
1631 // Extract 32-bit registers at a time.
1632 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1633 EVT NewVT = NumElt == 2
1634 ? MVT::i32
1635 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1636 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1637
1638 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
1639 if (NumElt == 2)
1640 Tmp = Args[0];
1641 else
1642 Tmp = DAG.getBuildVector(NewVT, SL, Args);
1643
1644 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1645 }
1646
1647 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1649
1650 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1651}
1652
1653// TODO: Handle fabs too
1655 if (Val.getOpcode() == ISD::FNEG)
1656 return Val.getOperand(0);
1657
1658 return Val;
1659}
1660
1662 if (Val.getOpcode() == ISD::FNEG)
1663 Val = Val.getOperand(0);
1664 if (Val.getOpcode() == ISD::FABS)
1665 Val = Val.getOperand(0);
1666 if (Val.getOpcode() == ISD::FCOPYSIGN)
1667 Val = Val.getOperand(0);
1668 return Val;
1669}
1670
1672 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1673 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1674 SelectionDAG &DAG = DCI.DAG;
1675 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1676 switch (CCOpcode) {
1677 case ISD::SETOEQ:
1678 case ISD::SETONE:
1679 case ISD::SETUNE:
1680 case ISD::SETNE:
1681 case ISD::SETUEQ:
1682 case ISD::SETEQ:
1683 case ISD::SETFALSE:
1684 case ISD::SETFALSE2:
1685 case ISD::SETTRUE:
1686 case ISD::SETTRUE2:
1687 case ISD::SETUO:
1688 case ISD::SETO:
1689 break;
1690 case ISD::SETULE:
1691 case ISD::SETULT: {
1692 if (LHS == True)
1693 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1694 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1695 }
1696 case ISD::SETOLE:
1697 case ISD::SETOLT:
1698 case ISD::SETLE:
1699 case ISD::SETLT: {
1700 // Ordered. Assume ordered for undefined.
1701
1702 // Only do this after legalization to avoid interfering with other combines
1703 // which might occur.
1705 !DCI.isCalledByLegalizer())
1706 return SDValue();
1707
1708 // We need to permute the operands to get the correct NaN behavior. The
1709 // selected operand is the second one based on the failing compare with NaN,
1710 // so permute it based on the compare type the hardware uses.
1711 if (LHS == True)
1712 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1713 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1714 }
1715 case ISD::SETUGE:
1716 case ISD::SETUGT: {
1717 if (LHS == True)
1718 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1719 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1720 }
1721 case ISD::SETGT:
1722 case ISD::SETGE:
1723 case ISD::SETOGE:
1724 case ISD::SETOGT: {
1726 !DCI.isCalledByLegalizer())
1727 return SDValue();
1728
1729 if (LHS == True)
1730 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1731 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1732 }
1733 case ISD::SETCC_INVALID:
1734 llvm_unreachable("Invalid setcc condcode!");
1735 }
1736 return SDValue();
1737}
1738
1739/// Generate Min/Max node
1741 SDValue LHS, SDValue RHS,
1742 SDValue True, SDValue False,
1743 SDValue CC,
1744 DAGCombinerInfo &DCI) const {
1745 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1746 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1747
1748 SelectionDAG &DAG = DCI.DAG;
1749
1750 // If we can't directly match this, try to see if we can fold an fneg to
1751 // match.
1752
1753  ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
1754  ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False);
1755  SDValue NegTrue = peekFNeg(True);
1756
1757 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1758 // fmin/fmax.
1759 //
1760 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1761 // -> fneg (fmin_legacy lhs, K)
1762 //
1763 // TODO: Use getNegatedExpression
1764 if (LHS == NegTrue && CFalse && CRHS) {
1765 APFloat NegRHS = neg(CRHS->getValueAPF());
1766 if (NegRHS == CFalse->getValueAPF()) {
1767 SDValue Combined =
1768 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1769 if (Combined)
1770 return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1771 return SDValue();
1772 }
1773 }
1774
1775 return SDValue();
1776}
1777
1778std::pair<SDValue, SDValue>
1779AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1780  SDLoc SL(Op);
1781
1782 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1783
1784 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1785 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1786
1787 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1788 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1789
1790 return std::pair(Lo, Hi);
1791}
1792
1793SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1794  SDLoc SL(Op);
1795
1796 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1797 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1798 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1799}
1800
1801SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1802  SDLoc SL(Op);
1803
1804 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1805 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1806 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1807}
1808
1809// Split a vector type into two parts. The first part is a power of two vector.
1810// The second part is whatever is left over, and is a scalar if it would
1811// otherwise be a 1-vector.
1812std::pair<EVT, EVT>
1813AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1814  EVT LoVT, HiVT;
1815 EVT EltVT = VT.getVectorElementType();
1816 unsigned NumElts = VT.getVectorNumElements();
1817 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1818 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1819 HiVT = NumElts - LoNumElts == 1
1820 ? EltVT
1821 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1822 return std::pair(LoVT, HiVT);
1823}
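// Illustrative sketch, not part of this file: the Lo/Hi element-count choice
// above can be checked in isolation. It splits 3 -> (2, 1 scalar),
// 5 -> (4, 1 scalar), 6 -> (4, 2) and 7 -> (4, 3). The helper name
// splitElementCounts is an assumption used only for this example.
#include "llvm/Support/MathExtras.h"
#include <utility>
static std::pair<unsigned, unsigned> splitElementCounts(unsigned NumElts) {
  unsigned LoNumElts = llvm::PowerOf2Ceil((NumElts + 1) / 2);
  return {LoNumElts, NumElts - LoNumElts};
}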
1824
1825// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1826// scalar.
1827std::pair<SDValue, SDValue>
1828AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1829                                  const EVT &LoVT, const EVT &HiVT,
1830 SelectionDAG &DAG) const {
1831 EVT VT = N.getValueType();
1832  assert(LoVT.getVectorNumElements() +
1833         (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1834 VT.getVectorNumElements() &&
1835 "More vector elements requested than available!");
1836  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
1837                            DAG.getVectorIdxConstant(0, DL));
1838
1839 unsigned LoNumElts = LoVT.getVectorNumElements();
1840
1841 if (HiVT.isVector()) {
1842 unsigned HiNumElts = HiVT.getVectorNumElements();
1843 if ((VT.getVectorNumElements() % HiNumElts) == 0) {
1844 // Avoid creating an extract_subvector with an index that isn't a multiple
1845 // of the result type.
1846      SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, N,
1847                               DAG.getConstant(LoNumElts, DL, MVT::i32));
1848 return {Lo, Hi};
1849 }
1850
1851    SmallVector<SDValue, 16> Elts;
1852    DAG.ExtractVectorElements(N, Elts, /*Start=*/LoNumElts,
1853 /*Count=*/HiNumElts);
1854 SDValue Hi = DAG.getBuildVector(HiVT, DL, Elts);
1855 return {Lo, Hi};
1856 }
1857
1858  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, HiVT, N,
1859                           DAG.getVectorIdxConstant(LoNumElts, DL));
1860 return {Lo, Hi};
1861}
1862
1863SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1864                                              SelectionDAG &DAG) const {
1865  LoadSDNode *Load = cast<LoadSDNode>(Op);
1866  EVT VT = Op.getValueType();
1867 SDLoc SL(Op);
1868
1869
1870 // If this is a 2 element vector, we really want to scalarize and not create
1871 // weird 1 element vectors.
1872 if (VT.getVectorNumElements() == 2) {
1873 SDValue Ops[2];
1874 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1875 return DAG.getMergeValues(Ops, SL);
1876 }
1877
1878 SDValue BasePtr = Load->getBasePtr();
1879 EVT MemVT = Load->getMemoryVT();
1880
1881 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1882
1883 EVT LoVT, HiVT;
1884 EVT LoMemVT, HiMemVT;
1885 SDValue Lo, Hi;
1886
1887 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1888 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1889 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1890
1891 unsigned Size = LoMemVT.getStoreSize();
1892 Align BaseAlign = Load->getAlign();
1893 Align HiAlign = commonAlignment(BaseAlign, Size);
1894
1895 SDValue LoLoad = DAG.getExtLoad(
1896 Load->getExtensionType(), SL, LoVT, Load->getChain(), BasePtr, SrcValue,
1897 LoMemVT, BaseAlign, Load->getMemOperand()->getFlags(), Load->getAAInfo());
1898 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
1899 SDValue HiLoad = DAG.getExtLoad(
1900 Load->getExtensionType(), SL, HiVT, Load->getChain(), HiPtr,
1901 SrcValue.getWithOffset(LoMemVT.getStoreSize()), HiMemVT, HiAlign,
1902 Load->getMemOperand()->getFlags(), Load->getAAInfo());
1903
1904 SDValue Join;
1905 if (LoVT == HiVT) {
1906 // This is the case that the vector is power of two so was evenly split.
1907 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1908 } else {
1909 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getPOISON(VT), LoLoad,
1910 DAG.getVectorIdxConstant(0, SL));
1911    Join = DAG.getNode(
1912        HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
1913        VT, Join, HiLoad,
1914        DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
1915  }
1916
1917 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1918 LoLoad.getValue(1), HiLoad.getValue(1))};
1919
1920 return DAG.getMergeValues(Ops, SL);
1921}
1922
1923SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
1924                                                     SelectionDAG &DAG) const {
1925  LoadSDNode *Load = cast<LoadSDNode>(Op);
1926  EVT VT = Op.getValueType();
1927 SDValue BasePtr = Load->getBasePtr();
1928 EVT MemVT = Load->getMemoryVT();
1929 SDLoc SL(Op);
1930 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1931 Align BaseAlign = Load->getAlign();
1932 unsigned NumElements = MemVT.getVectorNumElements();
1933
1934 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1935 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1936 if (NumElements != 3 ||
1937 (BaseAlign < Align(8) &&
1938 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1939 return SplitVectorLoad(Op, DAG);
1940
1941 assert(NumElements == 3);
1942
1943  EVT WideVT =
1944      EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
1945  EVT WideMemVT =
1946      EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
1947 SDValue WideLoad = DAG.getExtLoad(
1948 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1949 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1950 return DAG.getMergeValues(
1951 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1952 DAG.getVectorIdxConstant(0, SL)),
1953 WideLoad.getValue(1)},
1954 SL);
1955}
1956
1957SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1958                                              SelectionDAG &DAG) const {
1959  StoreSDNode *Store = cast<StoreSDNode>(Op);
1960  SDValue Val = Store->getValue();
1961 EVT VT = Val.getValueType();
1962
1963 // If this is a 2 element vector, we really want to scalarize and not create
1964 // weird 1 element vectors.
1965 if (VT.getVectorNumElements() == 2)
1966 return scalarizeVectorStore(Store, DAG);
1967
1968 EVT MemVT = Store->getMemoryVT();
1969 SDValue Chain = Store->getChain();
1970 SDValue BasePtr = Store->getBasePtr();
1971 SDLoc SL(Op);
1972
1973 EVT LoVT, HiVT;
1974 EVT LoMemVT, HiMemVT;
1975 SDValue Lo, Hi;
1976
1977 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1978 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1979 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1980
1981 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1982
1983 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1984 Align BaseAlign = Store->getAlign();
1985 unsigned Size = LoMemVT.getStoreSize();
1986 Align HiAlign = commonAlignment(BaseAlign, Size);
1987
1988 SDValue LoStore =
1989 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1990 Store->getMemOperand()->getFlags(), Store->getAAInfo());
1991 SDValue HiStore = DAG.getTruncStore(
1992 Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size), HiMemVT, HiAlign,
1993 Store->getMemOperand()->getFlags(), Store->getAAInfo());
1994
1995 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1996}
1997
1998// This is a shortcut for integer division because we have fast i32<->f32
1999// conversions, and fast f32 reciprocal instructions. The fractional part of a
2000// float is enough to accurately represent up to a 24-bit signed integer.
2001SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
2002                                            bool Sign) const {
2003 SDLoc DL(Op);
2004 EVT VT = Op.getValueType();
2005 SDValue LHS = Op.getOperand(0);
2006 SDValue RHS = Op.getOperand(1);
2007 MVT IntVT = MVT::i32;
2008 MVT FltVT = MVT::f32;
2009
2010 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
2011 if (LHSSignBits < 9)
2012 return SDValue();
2013
2014 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
2015 if (RHSSignBits < 9)
2016 return SDValue();
2017
2018 unsigned BitSize = VT.getSizeInBits();
2019 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
2020 unsigned DivBits = BitSize - SignBits;
2021 if (Sign)
2022 ++DivBits;
2023
2024  ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
2025  ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
2026
2027 SDValue jq = DAG.getConstant(1, DL, IntVT);
2028
2029 if (Sign) {
2030 // char|short jq = ia ^ ib;
2031 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
2032
2033 // jq = jq >> (bitsize - 2)
2034 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
2035 DAG.getConstant(BitSize - 2, DL, VT));
2036
2037 // jq = jq | 0x1
2038 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
2039 }
2040
2041 // int ia = (int)LHS;
2042 SDValue ia = LHS;
2043
2044  // int ib = (int)RHS;
2045 SDValue ib = RHS;
2046
2047 // float fa = (float)ia;
2048 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
2049
2050 // float fb = (float)ib;
2051 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
2052
2053 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
2054 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
2055
2056 // fq = trunc(fq);
2057 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
2058
2059 // float fqneg = -fq;
2060 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
2061
2061
2062  MachineFunction &MF = DAG.getMachineFunction();
2063
2064 bool UseFmadFtz = false;
2065  if (Subtarget->isGCN()) {
2066    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2067    UseFmadFtz =
2068        MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
2069  }
2070
2071 // float fr = mad(fqneg, fb, fa);
2072 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2073 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
2074                                               : (unsigned)ISD::FMAD;
2075  SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
2076
2077 // int iq = (int)fq;
2078 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
2079
2080 // fr = fabs(fr);
2081 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
2082
2083 // fb = fabs(fb);
2084 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
2085
2086 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2087
2088 // int cv = fr >= fb;
2089 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
2090
2091 // jq = (cv ? jq : 0);
2092 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
2093
2094 // dst = iq + jq;
2095 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
2096
2097  // Rem needs compensation; it's easier to recompute it.
2098 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
2099 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
2100
2101 // Truncate to number of bits this divide really is.
2102 if (Sign) {
2103 SDValue InRegSize
2104 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
2105 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
2106 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
2107 } else {
2108 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
2109 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
2110 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
2111 }
2112
2113 return DAG.getMergeValues({ Div, Rem }, DL);
2114}
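// Illustrative sketch, not part of this file: a plain C++ version of the same
// float-reciprocal divide for signed values that fit in 24 bits, with the
// expression 1.0f / fb standing in for the hardware RCP instruction and fmaf
// standing in for the mad/fma node (both substitutions are assumptions for
// this example, as is the helper name divRem24Ref).
#include <cmath>
#include <cstdint>
static void divRem24Ref(int32_t a, int32_t b, int32_t &Div, int32_t &Rem) {
  float fa = (float)a;
  float fb = (float)b;
  float fq = truncf(fa * (1.0f / fb)); // truncated quotient estimate
  int32_t iq = (int32_t)fq;
  int32_t jq = ((a ^ b) >> 30) | 1;    // +1 or -1, the sign of the quotient
  // The estimate can be off by one toward zero; detect that from the residual.
  float fr = fabsf(fmaf(-fq, fb, fa));
  Div = iq + (fr >= fabsf(fb) ? jq : 0);
  Rem = a - Div * b;                   // remainder recomputed from the quotient
}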
2115
2116void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
2117                                      SelectionDAG &DAG,
2118                                      SmallVectorImpl<SDValue> &Results) const {
2119 SDLoc DL(Op);
2120 EVT VT = Op.getValueType();
2121
2122 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2123
2124 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2125
2126 SDValue One = DAG.getConstant(1, DL, HalfVT);
2127 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
2128
2129 //HiLo split
2130 SDValue LHS_Lo, LHS_Hi;
2131 SDValue LHS = Op.getOperand(0);
2132 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
2133
2134 SDValue RHS_Lo, RHS_Hi;
2135 SDValue RHS = Op.getOperand(1);
2136 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2137
2138 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2139 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
2140
2141 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2142 LHS_Lo, RHS_Lo);
2143
2144 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2145 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2146
2147 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2148 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2149 return;
2150 }
2151
2152 if (isTypeLegal(MVT::i64)) {
2153 // The algorithm here is based on ideas from "Software Integer Division",
2154 // Tom Rodeheffer, August 2008.
2155
2156    MachineFunction &MF = DAG.getMachineFunction();
2157    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2158
2159 // Compute denominator reciprocal.
2160 unsigned FMAD =
2161 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2162        : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
2163            ? (unsigned)ISD::FMAD
2164            : (unsigned)AMDGPUISD::FMAD_FTZ;
2165
2166 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2167 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2168 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2169 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2170 Cvt_Lo);
2171 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2172 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2173 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2174 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2175 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2176 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2177 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2178 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2179 Mul1);
2180 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2181 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2182 SDValue Rcp64 = DAG.getBitcast(VT,
2183 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2184
2185 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2186 SDValue One64 = DAG.getConstant(1, DL, VT);
2187 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2188 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2189
2190 // First round of UNR (Unsigned integer Newton-Raphson).
2191 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2192 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2193 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2194 SDValue Mulhi1_Lo, Mulhi1_Hi;
2195 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2196 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2197 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2198 Mulhi1_Lo, Zero1);
2199 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2200 Mulhi1_Hi, Add1_Lo.getValue(1));
2201 SDValue Add1 = DAG.getBitcast(VT,
2202 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2203
2204 // Second round of UNR.
2205 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2206 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2207 SDValue Mulhi2_Lo, Mulhi2_Hi;
2208 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2209 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2210 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2211 Mulhi2_Lo, Zero1);
2212 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2213 Mulhi2_Hi, Add2_Lo.getValue(1));
2214 SDValue Add2 = DAG.getBitcast(VT,
2215 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2216
2217 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2218
2219 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2220
2221 SDValue Mul3_Lo, Mul3_Hi;
2222 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2223 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2224 Mul3_Lo, Zero1);
2225 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2226 Mul3_Hi, Sub1_Lo.getValue(1));
2227 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2228 SDValue Sub1 = DAG.getBitcast(VT,
2229 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2230
2231 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2232 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2233 ISD::SETUGE);
2234 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2235 ISD::SETUGE);
2236 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2237
2238 // TODO: Here and below portions of the code can be enclosed into if/endif.
2239 // Currently control flow is unconditional and we have 4 selects after
2240 // potential endif to substitute PHIs.
2241
2242 // if C3 != 0 ...
2243 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2244 RHS_Lo, Zero1);
2245 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2246 RHS_Hi, Sub1_Lo.getValue(1));
2247 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2248 Zero, Sub2_Lo.getValue(1));
2249 SDValue Sub2 = DAG.getBitcast(VT,
2250 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2251
2252 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2253
2254 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2255 ISD::SETUGE);
2256 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2257 ISD::SETUGE);
2258 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2259
2260 // if (C6 != 0)
2261 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2262
2263 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2264 RHS_Lo, Zero1);
2265 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2266 RHS_Hi, Sub2_Lo.getValue(1));
2267 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2268 Zero, Sub3_Lo.getValue(1));
2269 SDValue Sub3 = DAG.getBitcast(VT,
2270 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2271
2272 // endif C6
2273 // endif C3
2274
2275 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2276 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2277
2278 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2279 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2280
2281 Results.push_back(Div);
2282 Results.push_back(Rem);
2283
2284 return;
2285 }
2286
2287  // r600 expansion.
2288 // Get Speculative values
2289 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2290 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2291
2292 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2293 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2294 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2295
2296 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2297 SDValue DIV_Lo = Zero;
2298
2299 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2300
2301 for (unsigned i = 0; i < halfBitWidth; ++i) {
2302 const unsigned bitPos = halfBitWidth - i - 1;
2303 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2304 // Get value of high bit
2305 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2306 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2307 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2308
2309 // Shift
2310 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2311 // Add LHS high bit
2312 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2313
2314 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2315 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2316
2317 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2318
2319 // Update REM
2320 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2321 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2322 }
2323
2324 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2325 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2326 Results.push_back(DIV);
2327 Results.push_back(REM);
2328}
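// Illustrative sketch, not part of this file: the r600 fallback above is a
// restoring long division. This standalone version (helper name assumed for
// the example) runs the same shift-compare-subtract update over all 64
// dividend bits instead of only the low half.
#include <cstdint>
static void udivRem64Restoring(uint64_t LHS, uint64_t RHS, uint64_t &Div,
                               uint64_t &Rem) {
  Div = 0;
  Rem = 0;
  for (int BitPos = 63; BitPos >= 0; --BitPos) {
    Rem = (Rem << 1) | ((LHS >> BitPos) & 1); // shift in the next dividend bit
    if (Rem >= RHS) {                         // the SETUGE select in the DAG
      Rem -= RHS;
      Div |= 1ull << BitPos;
    }
  }
}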
2329
2330SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2331                                           SelectionDAG &DAG) const {
2332 SDLoc DL(Op);
2333 EVT VT = Op.getValueType();
2334
2335 if (VT == MVT::i64) {
2336    SmallVector<SDValue, 2> Results;
2337    LowerUDIVREM64(Op, DAG, Results);
2338 return DAG.getMergeValues(Results, DL);
2339 }
2340
2341 if (VT == MVT::i32) {
2342 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2343 return Res;
2344 }
2345
2346 SDValue X = Op.getOperand(0);
2347 SDValue Y = Op.getOperand(1);
2348
2349 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2350 // algorithm used here.
2351
2352 // Initial estimate of inv(y).
2353 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2354
2355 // One round of UNR.
2356 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2357 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2358 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2359 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2360
2361 // Quotient/remainder estimate.
2362 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2363 SDValue R =
2364 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2365
2366 // First quotient/remainder refinement.
2367 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2368 SDValue One = DAG.getConstant(1, DL, VT);
2369 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2370 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2371 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2372 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2373 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2374
2375 // Second quotient/remainder refinement.
2376 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2377 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2378 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2379 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2380 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2381
2382 return DAG.getMergeValues({Q, R}, DL);
2383}
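// Illustrative sketch, not part of this file: the 32-bit path above in plain
// C++ for a nonzero divisor, with 64-bit integer arithmetic standing in for
// URECIP and MULHU (those substitutions are assumptions for this example). A
// while loop replaces the two fixed select-based corrections emitted in the
// DAG; the quotient estimate never overshoots, so correcting upward suffices.
#include <cstdint>
static void udivRem32Ref(uint32_t X, uint32_t Y, uint32_t &Q, uint32_t &R) {
  // Initial reciprocal estimate, roughly floor((2^32 - 1) / Y).
  uint32_t Z = (uint32_t)(0xffffffffull / Y);
  // One round of unsigned Newton-Raphson: Z += mulhu(Z, -Y * Z).
  uint32_t NegYZ = 0u - Y * Z;
  Z += (uint32_t)(((uint64_t)Z * NegYZ) >> 32);
  // Quotient/remainder estimate, then correct upward as needed.
  Q = (uint32_t)(((uint64_t)X * Z) >> 32);
  R = X - Q * Y;
  while (R >= Y) {
    ++Q;
    R -= Y;
  }
}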
2384
2385SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2386                                           SelectionDAG &DAG) const {
2387 SDLoc DL(Op);
2388 EVT VT = Op.getValueType();
2389
2390 SDValue LHS = Op.getOperand(0);
2391 SDValue RHS = Op.getOperand(1);
2392
2393 SDValue Zero = DAG.getConstant(0, DL, VT);
2394 SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
2395
2396 if (VT == MVT::i32) {
2397 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2398 return Res;
2399 }
2400
2401 if (VT == MVT::i64 &&
2402 DAG.ComputeNumSignBits(LHS) > 32 &&
2403 DAG.ComputeNumSignBits(RHS) > 32) {
2404 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2405
2406 //HiLo split
2407 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2408 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2409 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2410 LHS_Lo, RHS_Lo);
2411 SDValue Res[2] = {
2412 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2413 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2414 };
2415 return DAG.getMergeValues(Res, DL);
2416 }
2417
2418 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2419 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2420 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2421 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2422
2423 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2424 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2425
2426 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2427 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2428
2429 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2430 SDValue Rem = Div.getValue(1);
2431
2432 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2433 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2434
2435 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2436 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2437
2438 SDValue Res[2] = {
2439 Div,
2440 Rem
2441 };
2442 return DAG.getMergeValues(Res, DL);
2443}
2444
2445SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2446  SDLoc SL(Op);
2447 SDValue Src = Op.getOperand(0);
2448
2449 // result = trunc(src)
2450 // if (src > 0.0 && src != result)
2451 // result += 1.0
2452
2453 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2454
2455 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2456 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2457
2458 EVT SetCCVT =
2459 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2460
2461 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2462 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2463 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2464
2465 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2466 // TODO: Should this propagate fast-math-flags?
2467 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2468}
2469
2470static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2471                                SelectionDAG &DAG) {
2472 const unsigned FractBits = 52;
2473 const unsigned ExpBits = 11;
2474
2475 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2476 Hi,
2477 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2478 DAG.getConstant(ExpBits, SL, MVT::i32));
2479 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2480 DAG.getConstant(1023, SL, MVT::i32));
2481
2482 return Exp;
2483}
2484
2485SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2486  SDLoc SL(Op);
2487 SDValue Src = Op.getOperand(0);
2488
2489 assert(Op.getValueType() == MVT::f64);
2490
2491 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2492
2493 // Extract the upper half, since this is where we will find the sign and
2494 // exponent.
2495 SDValue Hi = getHiHalf64(Src, DAG);
2496
2497 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2498
2499 const unsigned FractBits = 52;
2500
2501 // Extract the sign bit.
2502 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2503 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2504
2505 // Extend back to 64-bits.
2506 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2507 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2508
2509 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2510 const SDValue FractMask
2511 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2512
2513 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2514 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2515 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2516
2517 EVT SetCCVT =
2518 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2519
2520 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2521
2522 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2523 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2524
2525 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2526 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2527
2528 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2529}
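// Illustrative sketch, not part of this file: the exponent-driven masking used
// above, written against the raw f64 bit pattern. Exponents below zero leave
// only the sign bit, exponents above 51 mean the value (or NaN/Inf) passes
// through unchanged, and anything in between clears the sub-integer fraction
// bits. The helper name is an assumption for this example.
#include <cstdint>
#include <cstring>
static double truncRef(double X) {
  uint64_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  int Exp = (int)((Bits >> 52) & 0x7ff) - 1023;
  if (Exp < 0)
    Bits &= 0x8000000000000000ull;           // keep only the sign bit
  else if (Exp <= 51)
    Bits &= ~(0x000fffffffffffffull >> Exp); // drop the fractional bits
  double R;
  std::memcpy(&R, &Bits, sizeof(R));
  return R;
}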
2530
2531SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2532                                              SelectionDAG &DAG) const {
2533 SDLoc SL(Op);
2534 SDValue Src = Op.getOperand(0);
2535
2536 assert(Op.getValueType() == MVT::f64);
2537
2538 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2539 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2540 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2541
2542 // TODO: Should this propagate fast-math-flags?
2543
2544 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2545 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2546
2547 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2548
2549 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2550 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2551
2552 EVT SetCCVT =
2553 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2554 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2555
2556 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2557}
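// Illustrative sketch, not part of this file: the 0x1.0p+52 add/subtract trick
// above for plain doubles. In the default round-to-nearest-even mode, adding
// 2^52 with the operand's sign forces rounding at the units place; values with
// magnitude above 0x1.fffffffffffffp+51 are already integral and pass through.
#include <cmath>
static double roundEvenRef(double X) {
  double C = copysign(0x1.0p+52, X);
  double Rounded = (X + C) - C;
  return fabs(X) > 0x1.fffffffffffffp+51 ? X : Rounded;
}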
2558
2559SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
2560                                              SelectionDAG &DAG) const {
2561 // FNEARBYINT and FRINT are the same, except in their handling of FP
2562 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2563 // rint, so just treat them as equivalent.
2564 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2565 Op.getOperand(0));
2566}
2567
2568SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2569  auto VT = Op.getValueType();
2570 auto Arg = Op.getOperand(0u);
2571 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2572}
2573
2574// XXX - May require not supporting f32 denormals?
2575
2576// Don't handle v2f16. The extra instructions to scalarize and repack around the
2577// compare and vselect end up producing worse code than scalarizing the whole
2578// operation.
2579SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2580  SDLoc SL(Op);
2581 SDValue X = Op.getOperand(0);
2582 EVT VT = Op.getValueType();
2583
2584 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2585
2586 // TODO: Should this propagate fast-math-flags?
2587
2588 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2589
2590 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2591
2592 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2593 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2594
2595 EVT SetCCVT =
2596 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2597
2598 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2599 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2600 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2601
2602 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2603 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2604}
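// Illustrative sketch, not part of this file: the trunc-plus-copysign rounding
// above (round half away from zero), written for a single float with the
// library truncf/copysignf standing in for the DAG nodes.
#include <cmath>
static float roundRef(float X) {
  float T = truncf(X);
  float Offset = fabsf(X - T) >= 0.5f ? 1.0f : 0.0f;
  return T + copysignf(Offset, X);
}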
2605
2606SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2607  SDLoc SL(Op);
2608 SDValue Src = Op.getOperand(0);
2609
2610 // result = trunc(src);
2611 // if (src < 0.0 && src != result)
2612 // result += -1.0.
2613
2614 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2615
2616 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2617 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2618
2619 EVT SetCCVT =
2620 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2621
2622 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2623 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2624 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2625
2626 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2627 // TODO: Should this propagate fast-math-flags?
2628 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2629}
2630
2631/// Return true if it's known that \p Src can never be an f32 denormal value.
2632static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2633  switch (Src.getOpcode()) {
2634 case ISD::FP_EXTEND:
2635 return Src.getOperand(0).getValueType() == MVT::f16;
2636 case ISD::FP16_TO_FP:
2637 case ISD::FFREXP:
2638 return true;
2639  case ISD::INTRINSIC_WO_CHAIN: {
2640    unsigned IntrinsicID = Src.getConstantOperandVal(0);
2641 switch (IntrinsicID) {
2642 case Intrinsic::amdgcn_frexp_mant:
2643 return true;
2644 default:
2645 return false;
2646 }
2647 }
2648 default:
2649 return false;
2650 }
2651
2652 llvm_unreachable("covered opcode switch");
2653}
2654
2655static bool allowApproxFunc(const SelectionDAG &DAG,
2656                             SDNodeFlags Flags) {
2657 return Flags.hasApproximateFuncs();
2658}
2659
2660static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src,
2661                                   SDNodeFlags Flags) {
2662  return !valueIsKnownNeverF32Denorm(Src) &&
2663         DAG.getMachineFunction()
2664                 .getDenormalMode(APFloat::IEEEsingle())
2665                 .Input != DenormalMode::PreserveSign;
2666}
2667
2668
2669SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2670                                                 SDValue Src,
2671 SDNodeFlags Flags) const {
2672 SDLoc SL(Src);
2673 EVT VT = Src.getValueType();
2674 const fltSemantics &Semantics = VT.getFltSemantics();
2675 SDValue SmallestNormal =
2676 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2677
2678 // Want to scale denormals up, but negatives and 0 work just as well on the
2679 // scaled path.
2680 SDValue IsLtSmallestNormal = DAG.getSetCC(
2681 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2682 SmallestNormal, ISD::SETOLT);
2683
2684 return IsLtSmallestNormal;
2685}
2686
2687SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
2688                                          SDNodeFlags Flags) const {
2689 SDLoc SL(Src);
2690 EVT VT = Src.getValueType();
2691 const fltSemantics &Semantics = VT.getFltSemantics();
2692 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2693
2694 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2695 SDValue IsFinite = DAG.getSetCC(
2696 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2697 Inf, ISD::SETOLT);
2698 return IsFinite;
2699}
2700
2701/// If denormal handling is required return the scaled input to FLOG2, and the
2702/// check for denormal range. Otherwise, return null values.
2703std::pair<SDValue, SDValue>
2704AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc &SL,
2705                                        SDValue Src, SDNodeFlags Flags) const {
2706 if (!needsDenormHandlingF32(DAG, Src, Flags))
2707 return {};
2708
2709 MVT VT = MVT::f32;
2710 const fltSemantics &Semantics = APFloat::IEEEsingle();
2711 SDValue SmallestNormal =
2712 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2713
2714 SDValue IsLtSmallestNormal = DAG.getSetCC(
2715 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2716 SmallestNormal, ISD::SETOLT);
2717
2718 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2719 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2720 SDValue ScaleFactor =
2721 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2722
2723 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2724 return {ScaledInput, IsLtSmallestNormal};
2725}
2726
2727SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
2728  // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2729 // If we have to handle denormals, scale up the input and adjust the result.
2730
2731 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2732 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
2733
2734 SDLoc SL(Op);
2735 EVT VT = Op.getValueType();
2736 SDValue Src = Op.getOperand(0);
2737 SDNodeFlags Flags = Op->getFlags();
2738
2739 if (VT == MVT::f16) {
2740 // Nothing in half is a denormal when promoted to f32.
2741 assert(!Subtarget->has16BitInsts());
2742 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2743 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2744 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2745 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2746 }
2747
2748 auto [ScaledInput, IsLtSmallestNormal] =
2749 getScaledLogInput(DAG, SL, Src, Flags);
2750 if (!ScaledInput)
2751 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2752
2753 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2754
2755 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2756 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2757 SDValue ResultOffset =
2758 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2759 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2760}
2761
2762static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2763 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2764 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2765 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2766}
2767
2768SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
2769                                              SelectionDAG &DAG) const {
2770 SDValue X = Op.getOperand(0);
2771 EVT VT = Op.getValueType();
2772 SDNodeFlags Flags = Op->getFlags();
2773 SDLoc DL(Op);
2774 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2775 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2776
2777 const auto &Options = getTargetMachine().Options;
2778 if (VT == MVT::f16 || Flags.hasApproximateFuncs()) {
2779
2780 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2781 // Log and multiply in f32 is good enough for f16.
2782 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2783 }
2784
2785 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2786 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2787 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2788 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2789 }
2790
2791 return Lowered;
2792 }
2793
2794 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
2795 if (ScaledInput)
2796 X = ScaledInput;
2797
2798 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2799
2800 SDValue R;
2801 if (Subtarget->hasFastFMAF32()) {
2802 // c+cc are ln(2)/ln(10) to more than 49 bits
2803 const float c_log10 = 0x1.344134p-2f;
2804 const float cc_log10 = 0x1.09f79ep-26f;
2805
2806 // c + cc is ln(2) to more than 49 bits
2807 const float c_log = 0x1.62e42ep-1f;
2808 const float cc_log = 0x1.efa39ep-25f;
2809
2810 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2811 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2812 // This adds correction terms for which contraction may lead to an increase
2813 // in the error of the approximation, so disable it.
2814 Flags.setAllowContract(false);
2815 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2816 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2817 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2818 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2819 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2820 } else {
2821 // ch+ct is ln(2)/ln(10) to more than 36 bits
2822 const float ch_log10 = 0x1.344000p-2f;
2823 const float ct_log10 = 0x1.3509f6p-18f;
2824
2825 // ch + ct is ln(2) to more than 36 bits
2826 const float ch_log = 0x1.62e000p-1f;
2827 const float ct_log = 0x1.0bfbe8p-15f;
2828
2829 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2830 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2831
2832 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2833 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2834 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2835 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2836 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2837 // This adds correction terms for which contraction may lead to an increase
2838 // in the error of the approximation, so disable it.
2839 Flags.setAllowContract(false);
2840 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2841 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2842 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
2843 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2844 }
2845
2846 const bool IsFiniteOnly =
2847 (Flags.hasNoNaNs() || Options.NoNaNsFPMath) && Flags.hasNoInfs();
2848
2849 // TODO: Check if known finite from source value.
2850 if (!IsFiniteOnly) {
2851 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2852 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2853 }
2854
2855 if (IsScaled) {
2856 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2857 SDValue ShiftK =
2858 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2859 SDValue Shift =
2860 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2861 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2862 }
2863
2864 return R;
2865}
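// Illustrative sketch, not part of this file: the FMA branch above multiplies
// the hardware log2 result by ln(2) (or ln(2)/ln(10)) split into a head c and
// a tail cc so the product carries more than single precision. The same
// compensated multiply for plain floats, using the ln(2) constants quoted in
// the code; the helper name is an assumption for this example.
#include <cmath>
static float scaleByLn2Compensated(float Y) {
  const float C = 0x1.62e42ep-1f;   // head of ln(2)
  const float CC = 0x1.efa39ep-25f; // tail of ln(2)
  float R = Y * C;
  float Err = fmaf(Y, C, -R);       // exact rounding error of Y * C
  return R + fmaf(Y, CC, Err);      // fold in the tail term and the error
}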
2866
2870
2871// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a
2872// promoted f16 operation.
2873SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
2874                                              SelectionDAG &DAG, bool IsLog10,
2875 SDNodeFlags Flags) const {
2876 EVT VT = Src.getValueType();
2877 unsigned LogOp =
2878 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2879
2880 double Log2BaseInverted =
2881      IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
2882
2883 if (VT == MVT::f32) {
2884 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2885 if (ScaledInput) {
2886 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2887 SDValue ScaledResultOffset =
2888 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2889
2890 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2891
2892 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2893 ScaledResultOffset, Zero, Flags);
2894
2895 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2896
2897 if (Subtarget->hasFastFMAF32())
2898 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2899 Flags);
2900 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
2901 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2902 }
2903 }
2904
2905 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2906 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2907
2908 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2909 Flags);
2910}
2911
2912SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
2913  // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2914 // If we have to handle denormals, scale up the input and adjust the result.
2915
2916 SDLoc SL(Op);
2917 EVT VT = Op.getValueType();
2918 SDValue Src = Op.getOperand(0);
2919 SDNodeFlags Flags = Op->getFlags();
2920
2921 if (VT == MVT::f16) {
2922 // Nothing in half is a denormal when promoted to f32.
2923 assert(!Subtarget->has16BitInsts());
2924 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2925 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
2926 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2927 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2928 }
2929
2930 assert(VT == MVT::f32);
2931
2932 if (!needsDenormHandlingF32(DAG, Src, Flags))
2933 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
2934
2935 // bool needs_scaling = x < -0x1.f80000p+6f;
2936 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
2937
2938 // -nextafter(128.0, -1)
2939 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
2940
2941 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2942
2943 SDValue NeedsScaling =
2944 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
2945
2946 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2947 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2948
2949 SDValue AddOffset =
2950 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
2951
2952 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
2953 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
2954
2955 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
2956 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2957 SDValue ResultScale =
2958 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
2959
2960 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
2961}
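// Illustrative sketch, not part of this file: the denormal-range scaling used
// above, with the library exp2f standing in for v_exp_f32 (an assumption for
// this example). It relies on the identity 2^x = 2^(x + 64) * 2^-64 so the
// hardware never has to produce a denormal result directly.
#include <cmath>
static float exp2Scaled(float X) {
  bool NeedsScaling = X < -0x1.f80000p+6f;          // result would be denormal
  float R = exp2f(X + (NeedsScaling ? 0x1.0p+6f : 0.0f));
  return R * (NeedsScaling ? 0x1.0p-64f : 1.0f);
}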
2962
2963SDValue AMDGPUTargetLowering::lowerFEXPUnsafeImpl(SDValue X, const SDLoc &SL,
2964                                                  SelectionDAG &DAG,
2965 SDNodeFlags Flags,
2966 bool IsExp10) const {
2967 // exp(x) -> exp2(M_LOG2E_F * x);
2968 // exp10(x) -> exp2(log2(10) * x);
2969 EVT VT = X.getValueType();
2970 SDValue Const =
2971 DAG.getConstantFP(IsExp10 ? 0x1.a934f0p+1f : numbers::log2e, SL, VT);
2972
2973 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Const, Flags);
2974 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2975 : (unsigned)ISD::FEXP2,
2976 SL, VT, Mul, Flags);
2977}
2978
2979SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
2980                                              SelectionDAG &DAG,
2981 SDNodeFlags Flags) const {
2982 EVT VT = X.getValueType();
2983 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags))
2984 return lowerFEXPUnsafeImpl(X, SL, DAG, Flags, /*IsExp10=*/false);
2985
2986 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2987
2988 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
2989 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2990
2991 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2992
2993 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2994
2995 SDValue AdjustedX =
2996 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2997
2998 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
2999 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
3000
3001 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
3002
3003 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
3004 SDValue AdjustedResult =
3005 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
3006
3007 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
3008 Flags);
3009}
3010
3011/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
3012/// handled correctly.
3013SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
3014                                                SelectionDAG &DAG,
3015 SDNodeFlags Flags) const {
3016 const EVT VT = X.getValueType();
3017
3018 const unsigned Exp2Op = VT == MVT::f32 ? static_cast<unsigned>(AMDGPUISD::EXP)
3019 : static_cast<unsigned>(ISD::FEXP2);
3020
3021 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
3022 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3023 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3024 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3025
3026 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
3027 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3028 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
3029 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3030 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
3031 }
3032
3033 // bool s = x < -0x1.2f7030p+5f;
3034 // x += s ? 0x1.0p+5f : 0.0f;
3035 // exp10 = exp2(x * 0x1.a92000p+1f) *
3036 // exp2(x * 0x1.4f0978p-11f) *
3037 // (s ? 0x1.9f623ep-107f : 1.0f);
3038
3039 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3040
3041 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
3042 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
3043
3044 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
3045 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
3046 SDValue AdjustedX =
3047 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
3048
3049 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3050 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3051
3052 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
3053 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3054 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
3055 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3056
3057 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
3058
3059 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
3060 SDValue AdjustedResult =
3061 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
3062
3063 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
3064 Flags);
3065}
3066
3067SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
3068  EVT VT = Op.getValueType();
3069 SDLoc SL(Op);
3070 SDValue X = Op.getOperand(0);
3071 SDNodeFlags Flags = Op->getFlags();
3072 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
3073
3074 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3075 // library behavior. Also, is known-not-daz source sufficient?
3076 if (allowApproxFunc(DAG, Flags)) { // TODO: Does this really require fast?
3077 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
3078 : lowerFEXPUnsafe(X, SL, DAG, Flags);
3079 }
3080
3081 if (VT.getScalarType() == MVT::f16) {
3082 if (VT.isVector())
3083 return SDValue();
3084
3085 // Nothing in half is a denormal when promoted to f32.
3086 //
3087 // exp(f16 x) ->
3088 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3089 //
3090 // exp10(f16 x) ->
3091 // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
3092 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
3093 SDValue Lowered = lowerFEXPUnsafeImpl(Ext, SL, DAG, Flags, IsExp10);
3094 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
3095 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
3096 }
3097
3098 assert(VT == MVT::f32);
3099
3100 // Algorithm:
3101 //
3102 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3103 //
3104 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3105 // n = 64*m + j, 0 <= j < 64
3106 //
3107 // e^x = 2^((64*m + j + f)/64)
3108 // = (2^m) * (2^(j/64)) * 2^(f/64)
3109 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3110 //
3111 // f = x*(64/ln(2)) - n
3112 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3113 //
3114 // e^x = (2^m) * (2^(j/64)) * e^r
3115 //
3116 // (2^(j/64)) is precomputed
3117 //
3118 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3119 // e^r = 1 + q
3120 //
3121 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3122 //
3123 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3124 SDNodeFlags FlagsNoContract = Flags;
3125 FlagsNoContract.setAllowContract(false);
3126
3127 SDValue PH, PL;
3128 if (Subtarget->hasFastFMAF32()) {
3129 const float c_exp = numbers::log2ef;
3130 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3131 const float c_exp10 = 0x1.a934f0p+1f;
3132 const float cc_exp10 = 0x1.2f346ep-24f;
3133
3134 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3135 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3136
3137 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3138 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
3139 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3140 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3141 } else {
3142 const float ch_exp = 0x1.714000p+0f;
3143 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3144
3145 const float ch_exp10 = 0x1.a92000p+1f;
3146 const float cl_exp10 = 0x1.4f0978p-11f;
3147
3148 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3149 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3150
3151 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3152 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3153 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3154 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3155 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3156
3157 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3158
3159 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3160 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3161 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3162 }
3163
3164 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3165
3166 // It is unsafe to contract this fsub into the PH multiply.
3167 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3168
3169 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3170 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3171 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3172
3173 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3174
3175 SDValue UnderflowCheckConst =
3176 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
3177
3178 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3179 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3180 SDValue Underflow =
3181 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3182
3183 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3184
3185 if (!Flags.hasNoInfs()) {
3186 SDValue OverflowCheckConst =
3187 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
3188 SDValue Overflow =
3189 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
3190 SDValue Inf =
3191        DAG.getConstantFP(APFloat::getInf(APFloat::IEEEsingle()), SL, VT);
3192    R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3193 }
3194
3195 return R;
3196}
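// Illustrative sketch, not part of this file: the core reduction used above
// for exp(x) = 2^(x * log2(e)), with exp2f and ldexpf standing in for
// v_exp_f32 and the FLDEXP node, and with the head/tail constant split and
// the underflow/overflow clamps omitted for brevity (all assumptions for this
// example).
#include <cmath>
static float expRef(float X) {
  const float C = 0x1.715476p+0f;   // log2(e) rounded to single precision
  float PH = X * C;
  float E = nearbyintf(PH);         // round-to-nearest-even in the default mode
  float A = PH - E;                 // reduced argument, roughly |A| <= 0.5
  return ldexpf(exp2f(A), (int)E);  // 2^A scaled by 2^E
}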
3197
3198static bool isCtlzOpc(unsigned Opc) {
3199 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3200}
3201
3202static bool isCttzOpc(unsigned Opc) {
3203 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3204}
3205
3206SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
3207                                               SelectionDAG &DAG) const {
3208 auto SL = SDLoc(Op);
3209 auto Opc = Op.getOpcode();
3210 auto Arg = Op.getOperand(0u);
3211 auto ResultVT = Op.getValueType();
3212
3213 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3214 return {};
3215
3216  assert(isCtlzOpc(Opc) || isCttzOpc(Opc));
3217  assert(ResultVT == Arg.getValueType());
3218
3219 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3220 SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
3221 SDValue NewOp;
3222
3223 if (Opc == ISD::CTLZ_ZERO_UNDEF) {
3224 NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
3225 NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
3226 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3227 } else {
3228 NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3229 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3230 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
3231 }
3232
3233 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3234}
3235
3236SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
3237  SDLoc SL(Op);
3238 SDValue Src = Op.getOperand(0);
3239
3240 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3241 bool Ctlz = isCtlzOpc(Op.getOpcode());
3242 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3243
3244 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3245 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3246 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3247
3248 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3249 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3250 // (cttz hi:lo) -> (umin (ffbl src), 32)
3251 // (ctlz_zero_undef src) -> (ffbh src)
3252 // (cttz_zero_undef src) -> (ffbl src)
3253
3254    // The 64-bit scalar version produces a 32-bit result:
3255 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3256 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3257 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3258 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3259 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3260 if (!ZeroUndef) {
3261 const SDValue ConstVal = DAG.getConstant(
3262 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3263 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3264 }
3265 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3266 }
3267
3268 SDValue Lo, Hi;
3269 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3270
3271 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3272 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3273
3274 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3275 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3276 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3277 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
3278
3279 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3280 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3281 if (Ctlz)
3282 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3283 else
3284 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3285
3286 SDValue NewOpr;
3287 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3288 if (!ZeroUndef) {
3289 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3290 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3291 }
3292
3293 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3294}
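// Illustrative sketch, not part of this file: the 64-bit ctlz expansion above,
// spelled out with a helper that models the ffbh_u32 instruction (returning
// 0xffffffff for a zero input) and a saturating add. The helper names are
// assumptions for this example.
#include <algorithm>
#include <cstdint>
static uint32_t ffbhU32(uint32_t V) {
  for (uint32_t N = 0; N < 32; ++N)
    if (V & (0x80000000u >> N))
      return N;
  return 0xffffffffu; // zero-input convention of the hardware instruction
}
static uint32_t uaddSat32(uint32_t A, uint32_t B) {
  uint64_t S = (uint64_t)A + B;
  return S > 0xffffffffu ? 0xffffffffu : (uint32_t)S;
}
static uint32_t ctlz64Ref(uint64_t V) {
  uint32_t Hi = (uint32_t)(V >> 32);
  uint32_t Lo = (uint32_t)V;
  // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
  return std::min(std::min(ffbhU32(Hi), uaddSat32(ffbhU32(Lo), 32u)), 64u);
}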
3295
3296SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
3297                                               bool Signed) const {
3298 // The regular method converting a 64-bit integer to float roughly consists of
3299 // 2 steps: normalization and rounding. In fact, after normalization, the
3300 // conversion from a 64-bit integer to a float is essentially the same as the
3301 // one from a 32-bit integer. The only difference is that it has more
3302 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3303 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3304 // converted into the correct float number. The basic steps for the unsigned
3305 // conversion are illustrated in the following pseudo code:
3306 //
3307 // f32 uitofp(i64 u) {
3308 // i32 hi, lo = split(u);
3309 // // Only count the leading zeros in hi as we have native support of the
3310 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3311 // // reduced to a 32-bit one automatically.
3312 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3313 // u <<= shamt;
3314 // hi, lo = split(u);
3315 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3316 // // convert it as a 32-bit integer and scale the result back.
3317 // return uitofp(hi) * 2^(32 - shamt);
3318 // }
3319 //
3320 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3321 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
3322  // converted instead, followed by negation based on its sign bit.
3323
3324 SDLoc SL(Op);
3325 SDValue Src = Op.getOperand(0);
3326
3327 SDValue Lo, Hi;
3328 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3329 SDValue Sign;
3330 SDValue ShAmt;
3331 if (Signed && Subtarget->isGCN()) {
3332 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3333 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3334 // account. That is, the maximal shift is
3335 // - 32 if Lo and Hi have opposite signs;
3336 // - 33 if Lo and Hi have the same sign.
3337 //
3338 // Or, MaxShAmt = 33 + OppositeSign, where
3339 //
3340 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3341 // - -1 if Lo and Hi have opposite signs; and
3342 // - 0 otherwise.
3343 //
3344 // All in all, ShAmt is calculated as
3345 //
3346 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3347 //
3348 // or
3349 //
3350 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3351 //
3352 // to reduce the critical path.
3353 SDValue OppositeSign = DAG.getNode(
3354 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3355 DAG.getConstant(31, SL, MVT::i32));
3356 SDValue MaxShAmt =
3357 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3358 OppositeSign);
3359 // Count the leading sign bits.
3360 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3361 // Unlike the unsigned conversion, the shift should be one bit less to
3362 // preserve the sign bit.
3363 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3364 DAG.getConstant(1, SL, MVT::i32));
3365 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3366 } else {
3367 if (Signed) {
3368 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3369 // absolute value first.
3370 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3371 DAG.getConstant(63, SL, MVT::i64));
3372 SDValue Abs =
3373 DAG.getNode(ISD::XOR, SL, MVT::i64,
3374 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3375 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3376 }
3377 // Count the leading zeros.
3378 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3379 // The shift amount for signed integers is [0, 32].
3380 }
3381 // Normalize the given 64-bit integer.
3382 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3383 // Split it again.
3384 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3385 // Calculate the adjust bit for rounding.
3386 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3387 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3388 DAG.getConstant(1, SL, MVT::i32), Lo);
3389 // Get the 32-bit normalized integer.
3390 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3391 // Convert the normalized 32-bit integer into f32.
3392 unsigned Opc =
3393 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3394 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3395
3396 // Finally, we need to scale the converted float back, since the original
3397 // 64-bit integer was converted as a 32-bit one.
3398 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3399 ShAmt);
3400 // On GCN, use LDEXP directly.
3401 if (Subtarget->isGCN())
3402 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3403
3404 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3405 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3406 // exponent is enough to avoid overflowing into the sign bit.
3407 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3408 DAG.getConstant(23, SL, MVT::i32));
3409 SDValue IVal =
3410 DAG.getNode(ISD::ADD, SL, MVT::i32,
3411 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3412 if (Signed) {
3413 // Set the sign bit.
3414 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3415 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3416 DAG.getConstant(31, SL, MVT::i32));
3417 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3418 }
3419 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3420}
3421
3422SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3423 bool Signed) const {
3424 SDLoc SL(Op);
3425 SDValue Src = Op.getOperand(0);
3426
3427 SDValue Lo, Hi;
3428 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3429
3430 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3431 SL, MVT::f64, Hi);
3432
3433 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3434
3435 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3436 DAG.getConstant(32, SL, MVT::i32));
3437 // TODO: Should this propagate fast-math-flags?
3438 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3439}
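// [Annotation, not part of the upstream source] For the unsigned case the f64
// expansion above is equivalent to this scalar reference (standard C++ with
// <cmath>/<cstdint>, shown only for illustration):
//   double ref(uint64_t V) {
//     return std::ldexp((double)(uint32_t)(V >> 32), 32) + (double)(uint32_t)V;
//   }
// Both 32-bit halves convert to f64 exactly and the ldexp is exact, so the
// single FADD performs all of the rounding.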
3440
3441SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3442 SelectionDAG &DAG) const {
3443 // TODO: Factor out code common with LowerSINT_TO_FP.
3444 EVT DestVT = Op.getValueType();
3445 SDValue Src = Op.getOperand(0);
3446 EVT SrcVT = Src.getValueType();
3447
3448 if (SrcVT == MVT::i16) {
3449 if (DestVT == MVT::f16)
3450 return Op;
3451 SDLoc DL(Op);
3452
3453 // Promote src to i32
3454 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3455 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3456 }
3457
3458 if (DestVT == MVT::bf16) {
3459 SDLoc SL(Op);
3460 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3461 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3462 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3463 }
3464
3465 if (SrcVT != MVT::i64)
3466 return Op;
3467
3468 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3469 SDLoc DL(Op);
3470
3471 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3472 SDValue FPRoundFlag =
3473 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3474 SDValue FPRound =
3475 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3476
3477 return FPRound;
3478 }
3479
3480 if (DestVT == MVT::f32)
3481 return LowerINT_TO_FP32(Op, DAG, false);
3482
3483 assert(DestVT == MVT::f64);
3484 return LowerINT_TO_FP64(Op, DAG, false);
3485}
3486
3487SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3488 SelectionDAG &DAG) const {
3489 EVT DestVT = Op.getValueType();
3490
3491 SDValue Src = Op.getOperand(0);
3492 EVT SrcVT = Src.getValueType();
3493
3494 if (SrcVT == MVT::i16) {
3495 if (DestVT == MVT::f16)
3496 return Op;
3497
3498 SDLoc DL(Op);
3499 // Promote src to i32
3500 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3501 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3502 }
3503
3504 if (DestVT == MVT::bf16) {
3505 SDLoc SL(Op);
3506 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3507 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3508 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3509 }
3510
3511 if (SrcVT != MVT::i64)
3512 return Op;
3513
3514 // TODO: Factor out code common with LowerUINT_TO_FP.
3515
3516 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3517 SDLoc DL(Op);
3518 SDValue Src = Op.getOperand(0);
3519
3520 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3521 SDValue FPRoundFlag =
3522 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3523 SDValue FPRound =
3524 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3525
3526 return FPRound;
3527 }
3528
3529 if (DestVT == MVT::f32)
3530 return LowerINT_TO_FP32(Op, DAG, true);
3531
3532 assert(DestVT == MVT::f64);
3533 return LowerINT_TO_FP64(Op, DAG, true);
3534}
3535
3536SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3537 bool Signed) const {
3538 SDLoc SL(Op);
3539
3540 SDValue Src = Op.getOperand(0);
3541 EVT SrcVT = Src.getValueType();
3542
3543 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3544
3545 // The basic idea of converting a floating point number into a pair of 32-bit
3546 // integers is illustrated as follows:
3547 //
3548 // tf := trunc(val);
3549 // hif := floor(tf * 2^-32);
3550 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3551 // hi := fptoi(hif);
3552 // lo := fptoi(lof);
3553 //
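  // [Annotation, not part of the upstream source] For example, val = 2^33 + 5.0:
  //   tf = 2^33 + 5, hif = floor(tf * 2^-32) = 2, lof = tf - 2 * 2^32 = 5,
  //   hi = 2, lo = 5, giving the i64 value 0x0000000200000005 = 2^33 + 5.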
3554 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3555 SDValue Sign;
3556 if (Signed && SrcVT == MVT::f32) {
3557 // However, a 32-bit floating point number has only a 23-bit mantissa, which
3558 // is not enough to hold all the significant bits of `lof` if val is
3559 // negative. To avoid the loss of precision, we need to take the absolute
3560 // value after truncating and flip the result back based on the original
3561 // signedness.
3562 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3563 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3564 DAG.getConstant(31, SL, MVT::i32));
3565 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3566 }
3567
3568 SDValue K0, K1;
3569 if (SrcVT == MVT::f64) {
3570 K0 = DAG.getConstantFP(
3571 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3572 SrcVT);
3573 K1 = DAG.getConstantFP(
3574 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3575 SrcVT);
3576 } else {
3577 K0 = DAG.getConstantFP(
3578 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3579 K1 = DAG.getConstantFP(
3580 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3581 }
3582 // TODO: Should this propagate fast-math-flags?
3583 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3584
3585 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3586
3587 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3588
3589 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3590 : ISD::FP_TO_UINT,
3591 SL, MVT::i32, FloorMul);
3592 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3593
3594 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3595 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3596
3597 if (Signed && SrcVT == MVT::f32) {
3598 assert(Sign);
3599 // Flip the result based on the signedness, which is either all 0s or 1s.
3600 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3601 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3602 // r := xor(r, sign) - sign;
3603 Result =
3604 DAG.getNode(ISD::SUB, SL, MVT::i64,
3605 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3606 }
3607
3608 return Result;
3609}
3610
3611SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3612 SDLoc DL(Op);
3613 SDValue N0 = Op.getOperand(0);
3614
3615 // Convert to target node to get known bits
3616 if (N0.getValueType() == MVT::f32)
3617 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3618
3619 if (Op->getFlags().hasApproximateFuncs()) {
3620 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3621 return SDValue();
3622 }
3623
3624 return LowerF64ToF16Safe(N0, DL, DAG);
3625}
3626
3627// Returns the converted f16 value as an i32 node.
3628SDValue AMDGPUTargetLowering::LowerF64ToF16Safe(SDValue Src, const SDLoc &DL,
3629 SelectionDAG &DAG) const {
3630 assert(Src.getSimpleValueType() == MVT::f64);
3631
3632 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3633 // TODO: We can generate better code for True16.
3634 const unsigned ExpMask = 0x7ff;
3635 const unsigned ExpBiasf64 = 1023;
3636 const unsigned ExpBiasf16 = 15;
3637 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3638 SDValue One = DAG.getConstant(1, DL, MVT::i32);
3639 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Src);
3640 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3641 DAG.getConstant(32, DL, MVT::i64));
3642 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3643 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3644 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3645 DAG.getConstant(20, DL, MVT::i64));
3646 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3647 DAG.getConstant(ExpMask, DL, MVT::i32));
3648 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3649 // add the f16 bias (15) to get the biased exponent for the f16 format.
3650 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3651 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
3652
3653 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3654 DAG.getConstant(8, DL, MVT::i32));
3655 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3656 DAG.getConstant(0xffe, DL, MVT::i32));
3657
3658 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3659 DAG.getConstant(0x1ff, DL, MVT::i32));
3660 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3661
3662 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3663 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3664
3665 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3666 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3667 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3668 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
3669
3670 // N = M | (E << 12);
3671 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3672 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3673 DAG.getConstant(12, DL, MVT::i32)));
3674
3675 // B = clamp(1-E, 0, 13);
3676 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3677 One, E);
3678 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3679 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3680 DAG.getConstant(13, DL, MVT::i32));
3681
3682 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3683 DAG.getConstant(0x1000, DL, MVT::i32));
3684
3685 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3686 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3687 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3688 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3689
3690 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3691 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3692 DAG.getConstant(0x7, DL, MVT::i32));
3693 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3694 DAG.getConstant(2, DL, MVT::i32));
3695 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3696 One, Zero, ISD::SETEQ);
3697 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3698 One, Zero, ISD::SETGT);
3699 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3700 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3701
3702 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3703 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3704 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3705 I, V, ISD::SETEQ);
3706
3707 // Extract the sign bit.
3708 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3709 DAG.getConstant(16, DL, MVT::i32));
3710 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3711 DAG.getConstant(0x8000, DL, MVT::i32));
3712
3713 return DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3714}
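// [Annotation, not part of the upstream source] Quick sanity check of the
// bit-level conversion above: Src = 1.0 (0x3FF0000000000000) gives
// UH = 0x3FF00000, E = 1023 - 1023 + 15 = 15, M = 0, N = E << 12 = 0xF000,
// then V = N >> 2 = 0x3C00 with no rounding increment, which is the f16
// encoding of 1.0.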
3715
3716SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
3717 SelectionDAG &DAG) const {
3718 SDValue Src = Op.getOperand(0);
3719 unsigned OpOpcode = Op.getOpcode();
3720 EVT SrcVT = Src.getValueType();
3721 EVT DestVT = Op.getValueType();
3722
3723 // Will be selected natively
3724 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3725 return Op;
3726
3727 if (SrcVT == MVT::bf16) {
3728 SDLoc DL(Op);
3729 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3730 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
3731 }
3732
3733 // Promote i16 to i32
3734 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3735 SDLoc DL(Op);
3736
3737 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3738 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3739 }
3740
3741 if (DestVT != MVT::i64)
3742 return Op;
3743
3744 if (SrcVT == MVT::f16 ||
3745 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3746 SDLoc DL(Op);
3747
3748 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3749 unsigned Ext =
3750 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3751 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3752 }
3753
3754 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3755 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3756
3757 return SDValue();
3758}
3759
3760SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3761 SelectionDAG &DAG) const {
3762 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3763 MVT VT = Op.getSimpleValueType();
3764 MVT ScalarVT = VT.getScalarType();
3765
3766 assert(VT.isVector());
3767
3768 SDValue Src = Op.getOperand(0);
3769 SDLoc DL(Op);
3770
3771 // TODO: Don't scalarize on Evergreen?
3772 unsigned NElts = VT.getVectorNumElements();
3773 SmallVector<SDValue, 8> Args;
3774 DAG.ExtractVectorElements(Src, Args, 0, NElts);
3775
3776 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3777 for (unsigned I = 0; I < NElts; ++I)
3778 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
3779
3780 return DAG.getBuildVector(VT, DL, Args);
3781}
3782
3783//===----------------------------------------------------------------------===//
3784// Custom DAG optimizations
3785//===----------------------------------------------------------------------===//
3786
3787static bool isU24(SDValue Op, SelectionDAG &DAG) {
3788 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3789}
3790
3791static bool isI24(SDValue Op, SelectionDAG &DAG) {
3792 EVT VT = Op.getValueType();
3793 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3794 // as unsigned 24-bit values.
3795 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3796}
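// [Annotation, not part of the upstream source] Example: a value produced by
// (zero_extend i16:x) has at most 16 significant unsigned bits, so isU24 holds
// and a multiply of two such values can be selected as MUL_U24 / MULHI_U24.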
3797
3798SDValue AMDGPUTargetLowering::simplifyMul24(SDNode *Node24,
3799 DAGCombinerInfo &DCI) const {
3800 SelectionDAG &DAG = DCI.DAG;
3801 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3802 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3803
3804 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
3805 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
3806 unsigned NewOpcode = Node24->getOpcode();
3807 if (IsIntrin) {
3808 unsigned IID = Node24->getConstantOperandVal(0);
3809 switch (IID) {
3810 case Intrinsic::amdgcn_mul_i24:
3811 NewOpcode = AMDGPUISD::MUL_I24;
3812 break;
3813 case Intrinsic::amdgcn_mul_u24:
3814 NewOpcode = AMDGPUISD::MUL_U24;
3815 break;
3816 case Intrinsic::amdgcn_mulhi_i24:
3817 NewOpcode = AMDGPUISD::MULHI_I24;
3818 break;
3819 case Intrinsic::amdgcn_mulhi_u24:
3820 NewOpcode = AMDGPUISD::MULHI_U24;
3821 break;
3822 default:
3823 llvm_unreachable("Expected 24-bit mul intrinsic");
3824 }
3825 }
3826
3827 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
3828
3829 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3830 // the operands to have other uses, but will only perform simplifications that
3831 // involve bypassing some nodes for this user.
3832 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
3833 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
3834 if (DemandedLHS || DemandedRHS)
3835 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
3836 DemandedLHS ? DemandedLHS : LHS,
3837 DemandedRHS ? DemandedRHS : RHS);
3838
3839 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3840 // operands if this node is the only user.
3841 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
3842 return SDValue(Node24, 0);
3843 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
3844 return SDValue(Node24, 0);
3845
3846 return SDValue();
3847}
3848
3849template <typename IntTy>
3850static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3851 uint32_t Width, const SDLoc &DL) {
3852 if (Width + Offset < 32) {
3853 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3854 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3855 if constexpr (std::is_signed_v<IntTy>) {
3856 return DAG.getSignedConstant(Result, DL, MVT::i32);
3857 } else {
3858 return DAG.getConstant(Result, DL, MVT::i32);
3859 }
3860 }
3861
3862 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3863}
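// [Annotation, not part of the upstream source] Example of the constant fold
// above for the unsigned case: Src0 = 0x12345678, Offset = 8, Width = 8:
//   Shl    = 0x12345678 << (32 - 8 - 8) = 0x56780000
//   Result = 0x56780000 >> (32 - 8)     = 0x56, i.e. bits [8, 16) of Src0.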
3864
3865static bool hasVolatileUser(SDNode *Val) {
3866 for (SDNode *U : Val->users()) {
3867 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
3868 if (M->isVolatile())
3869 return true;
3870 }
3871 }
3872
3873 return false;
3874}
3875
3876bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3877 // i32 vectors are the canonical memory type.
3878 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3879 return false;
3880
3881 if (!VT.isByteSized())
3882 return false;
3883
3884 unsigned Size = VT.getStoreSize();
3885
3886 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3887 return false;
3888
3889 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3890 return false;
3891
3892 return true;
3893}
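// [Annotation, not part of the upstream source] Example: a v4i8 access
// (4 bytes, non-i32 scalar element) is combined into an i32 access by the
// load/store combines below, while a 3-byte type or an already-legal type is
// left untouched.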
3894
3895// Replace load of an illegal type with a bitcast from a load of a friendlier
3896// type.
3897SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3898 DAGCombinerInfo &DCI) const {
3899 if (!DCI.isBeforeLegalize())
3900 return SDValue();
3901
3902 LoadSDNode *LN = cast<LoadSDNode>(N);
3903 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
3904 return SDValue();
3905
3906 SDLoc SL(N);
3907 SelectionDAG &DAG = DCI.DAG;
3908 EVT VT = LN->getMemoryVT();
3909
3910 unsigned Size = VT.getStoreSize();
3911 Align Alignment = LN->getAlign();
3912 if (Alignment < Size && isTypeLegal(VT)) {
3913 unsigned IsFast;
3914 unsigned AS = LN->getAddressSpace();
3915
3916 // Expand unaligned loads earlier than legalization. Due to visitation order
3917 // problems during legalization, the emitted instructions to pack and unpack
3918 // the bytes again are not eliminated in the case of an unaligned copy.
3919 if (!allowsMisalignedMemoryAccesses(
3920 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3921 if (VT.isVector())
3922 return SplitVectorLoad(SDValue(LN, 0), DAG);
3923
3924 SDValue Ops[2];
3925 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3926
3927 return DAG.getMergeValues(Ops, SDLoc(N));
3928 }
3929
3930 if (!IsFast)
3931 return SDValue();
3932 }
3933
3934 if (!shouldCombineMemoryType(VT))
3935 return SDValue();
3936
3937 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3938
3939 SDValue NewLoad
3940 = DAG.getLoad(NewVT, SL, LN->getChain(),
3941 LN->getBasePtr(), LN->getMemOperand());
3942
3943 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3944 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3945 return SDValue(N, 0);
3946}
3947
3948// Replace store of an illegal type with a store of a bitcast to a friendlier
3949// type.
3950SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3951 DAGCombinerInfo &DCI) const {
3952 if (!DCI.isBeforeLegalize())
3953 return SDValue();
3954
3955 StoreSDNode *SN = cast<StoreSDNode>(N);
3956 if (!SN->isSimple() || !ISD::isNormalStore(SN))
3957 return SDValue();
3958
3959 EVT VT = SN->getMemoryVT();
3960 unsigned Size = VT.getStoreSize();
3961
3962 SDLoc SL(N);
3963 SelectionDAG &DAG = DCI.DAG;
3964 Align Alignment = SN->getAlign();
3965 if (Alignment < Size && isTypeLegal(VT)) {
3966 unsigned IsFast;
3967 unsigned AS = SN->getAddressSpace();
3968
3969 // Expand unaligned stores earlier than legalization. Due to visitation
3970 // order problems during legalization, the emitted instructions to pack and
3971 // unpack the bytes again are not eliminated in the case of an unaligned
3972 // copy.
3973 if (!allowsMisalignedMemoryAccesses(
3974 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3975 if (VT.isVector())
3976 return SplitVectorStore(SDValue(SN, 0), DAG);
3977
3978 return expandUnalignedStore(SN, DAG);
3979 }
3980
3981 if (!IsFast)
3982 return SDValue();
3983 }
3984
3985 if (!shouldCombineMemoryType(VT))
3986 return SDValue();
3987
3988 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3989 SDValue Val = SN->getValue();
3990
3991 //DCI.AddToWorklist(Val.getNode());
3992
3993 bool OtherUses = !Val.hasOneUse();
3994 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3995 if (OtherUses) {
3996 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3997 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3998 }
3999
4000 return DAG.getStore(SN->getChain(), SL, CastVal,
4001 SN->getBasePtr(), SN->getMemOperand());
4002}
4003
4004// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
4005// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
4006// issues.
4007SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
4008 DAGCombinerInfo &DCI) const {
4009 SelectionDAG &DAG = DCI.DAG;
4010 SDValue N0 = N->getOperand(0);
4011
4012 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
4013 // (vt2 (truncate (assertzext vt0:x, vt1)))
4014 if (N0.getOpcode() == ISD::TRUNCATE) {
4015 SDValue N1 = N->getOperand(1);
4016 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
4017 SDLoc SL(N);
4018
4019 SDValue Src = N0.getOperand(0);
4020 EVT SrcVT = Src.getValueType();
4021 if (SrcVT.bitsGE(ExtVT)) {
4022 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
4023 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
4024 }
4025 }
4026
4027 return SDValue();
4028}
4029
4030SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
4031 SDNode *N, DAGCombinerInfo &DCI) const {
4032 unsigned IID = N->getConstantOperandVal(0);
4033 switch (IID) {
4034 case Intrinsic::amdgcn_mul_i24:
4035 case Intrinsic::amdgcn_mul_u24:
4036 case Intrinsic::amdgcn_mulhi_i24:
4037 case Intrinsic::amdgcn_mulhi_u24:
4038 return simplifyMul24(N, DCI);
4039 case Intrinsic::amdgcn_fract:
4040 case Intrinsic::amdgcn_rsq:
4041 case Intrinsic::amdgcn_rcp_legacy:
4042 case Intrinsic::amdgcn_rsq_legacy:
4043 case Intrinsic::amdgcn_rsq_clamp:
4044 case Intrinsic::amdgcn_tanh:
4045 case Intrinsic::amdgcn_prng_b32: {
4046 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
4047 SDValue Src = N->getOperand(1);
4048 return Src.isUndef() ? Src : SDValue();
4049 }
4050 case Intrinsic::amdgcn_frexp_exp: {
4051 // frexp_exp (fneg x) -> frexp_exp x
4052 // frexp_exp (fabs x) -> frexp_exp x
4053 // frexp_exp (fneg (fabs x)) -> frexp_exp x
4054 SDValue Src = N->getOperand(1);
4055 SDValue PeekSign = peekFPSignOps(Src);
4056 if (PeekSign == Src)
4057 return SDValue();
4058 return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
4059 0);
4060 }
4061 default:
4062 return SDValue();
4063 }
4064}
4065
4066/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
4067/// binary operation \p Opc to it with the corresponding constant operands.
4068SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
4069 DAGCombinerInfo &DCI, const SDLoc &SL,
4070 unsigned Opc, SDValue LHS,
4071 uint32_t ValLo, uint32_t ValHi) const {
4072 SelectionDAG &DAG = DCI.DAG;
4073 SDValue Lo, Hi;
4074 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
4075
4076 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
4077 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
4078
4079 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
4080 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
4081
4082 // Re-visit the ands. It's possible we eliminated one of them and it could
4083 // simplify the vector.
4084 DCI.AddToWorklist(Lo.getNode());
4085 DCI.AddToWorklist(Hi.getNode());
4086
4087 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
4088 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4089}
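// [Annotation, not part of the upstream source] Example: for
// (and i64:x, 0x0000FFFF00000000) this produces
// build_vector (and lo(x), 0), (and hi(x), 0x0000FFFF) bitcast back to i64,
// so the low half folds to zero and only a 32-bit AND remains.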
4090
4091SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
4092 DAGCombinerInfo &DCI) const {
4093 EVT VT = N->getValueType(0);
4094 SDValue LHS = N->getOperand(0);
4095 SDValue RHS = N->getOperand(1);
4096 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4097 SDLoc SL(N);
4098 SelectionDAG &DAG = DCI.DAG;
4099
4100 unsigned RHSVal;
4101 if (CRHS) {
4102 RHSVal = CRHS->getZExtValue();
4103 if (!RHSVal)
4104 return LHS;
4105
4106 switch (LHS->getOpcode()) {
4107 default:
4108 break;
4109 case ISD::ZERO_EXTEND:
4110 case ISD::SIGN_EXTEND:
4111 case ISD::ANY_EXTEND: {
4112 SDValue X = LHS->getOperand(0);
4113
4114 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
4115 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
4116 // Prefer build_vector as the canonical form if packed types are legal.
4117 // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
4118 SDValue Vec = DAG.getBuildVector(
4119 MVT::v2i16, SL,
4120 {DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0)});
4121 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
4122 }
4123
4124 // shl (ext x) => zext (shl x), if shift does not overflow int
4125 if (VT != MVT::i64)
4126 break;
4127 KnownBits Known = DAG.computeKnownBits(X);
4128 unsigned LZ = Known.countMinLeadingZeros();
4129 if (LZ < RHSVal)
4130 break;
4131 EVT XVT = X.getValueType();
4132 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(CRHS, 0));
4133 return DAG.getZExtOrTrunc(Shl, SL, VT);
4134 }
4135 }
4136 }
4137
4138 if (VT.getScalarType() != MVT::i64)
4139 return SDValue();
4140
4141 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4142 // common case, splitting this into a move and a 32-bit shift is faster and
4143 // the same code size.
4144 KnownBits Known = DAG.computeKnownBits(RHS);
4145
4146 EVT ElementType = VT.getScalarType();
4147 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4148 EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
4149 : TargetScalarType;
4150
4151 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4152 return SDValue();
4153 SDValue ShiftAmt;
4154
4155 if (CRHS) {
4156 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4157 TargetType);
4158 } else {
4159 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4160 const SDValue ShiftMask =
4161 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4162 // This AND instruction will clamp out of bounds shift values.
4163 // It will also be removed during later instruction selection.
4164 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4165 }
4166
4167 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, LHS);
4168 SDValue NewShift =
4169 DAG.getNode(ISD::SHL, SL, TargetType, Lo, ShiftAmt, N->getFlags());
4170
4171 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4172 SDValue Vec;
4173
4174 if (VT.isVector()) {
4175 EVT ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4176 unsigned NElts = TargetType.getVectorNumElements();
4177 SmallVector<SDValue, 8> HiOps;
4178 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4179
4180 DAG.ExtractVectorElements(NewShift, HiOps, 0, NElts);
4181 for (unsigned I = 0; I != NElts; ++I)
4182 HiAndLoOps[2 * I + 1] = HiOps[I];
4183 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4184 } else {
4185 EVT ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4186 Vec = DAG.getBuildVector(ConcatType, SL, {Zero, NewShift});
4187 }
4188 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4189}
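// [Annotation, not part of the upstream source] Example: (shl i64:x, 40)
// becomes bitcast (build_vector 0, (shl (trunc x to i32), 8)), i.e. a single
// 32-bit shift whose result is placed in the high half while the low half is
// known to be zero.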
4190
4191SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
4192 DAGCombinerInfo &DCI) const {
4193 SDValue RHS = N->getOperand(1);
4194 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4195 EVT VT = N->getValueType(0);
4196 SDValue LHS = N->getOperand(0);
4197 SelectionDAG &DAG = DCI.DAG;
4198 SDLoc SL(N);
4199
4200 if (VT.getScalarType() != MVT::i64)
4201 return SDValue();
4202
4203 // For C >= 32
4204 // i64 (sra x, C) -> (build_pair (sra hi_32(x), C - 32), sra hi_32(x), 31))
4205
4206 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4207 // common case, splitting this into a move and a 32-bit shift is faster and
4208 // the same code size.
4209 KnownBits Known = DAG.computeKnownBits(RHS);
4210
4211 EVT ElementType = VT.getScalarType();
4212 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4213 EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
4214 : TargetScalarType;
4215
4216 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4217 return SDValue();
4218
4219 SDValue ShiftFullAmt =
4220 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4221 SDValue ShiftAmt;
4222 if (CRHS) {
4223 unsigned RHSVal = CRHS->getZExtValue();
4224 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4225 TargetType);
4226 } else if (Known.getMinValue().getZExtValue() ==
4227 (ElementType.getSizeInBits() - 1)) {
4228 ShiftAmt = ShiftFullAmt;
4229 } else {
4230 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4231 const SDValue ShiftMask =
4232 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4233 // This AND instruction will clamp out of bounds shift values.
4234 // It will also be removed during later instruction selection.
4235 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4236 }
4237
4238 EVT ConcatType;
4239 SDValue Hi;
4240 SDLoc LHSSL(LHS);
4241 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4242 if (VT.isVector()) {
4243 unsigned NElts = TargetType.getVectorNumElements();
4244 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4245 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4246 SmallVector<SDValue, 8> HiOps(NElts);
4247 SmallVector<SDValue, 16> HiAndLoOps;
4248
4249 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, 0, NElts * 2);
4250 for (unsigned I = 0; I != NElts; ++I) {
4251 HiOps[I] = HiAndLoOps[2 * I + 1];
4252 }
4253 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
4254 } else {
4255 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
4256 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4257 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4258 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
4259 }
4260
4261 KnownBits KnownLHS = DAG.computeKnownBits(LHS);
4262 SDValue HiShift;
4263 if (KnownLHS.isNegative()) {
4264 HiShift = DAG.getAllOnesConstant(SL, TargetType);
4265 } else {
4266 Hi = DAG.getFreeze(Hi);
4267 HiShift = DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftFullAmt);
4268 }
4269 SDValue NewShift =
4270 DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4271
4272 SDValue Vec;
4273 if (VT.isVector()) {
4274 unsigned NElts = TargetType.getVectorNumElements();
4275 SmallVector<SDValue, 8> HiOps;
4276 SmallVector<SDValue, 8> LoOps;
4277 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2);
4278
4279 DAG.ExtractVectorElements(HiShift, HiOps, 0, NElts);
4280 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
4281 for (unsigned I = 0; I != NElts; ++I) {
4282 HiAndLoOps[2 * I + 1] = HiOps[I];
4283 HiAndLoOps[2 * I] = LoOps[I];
4284 }
4285 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4286 } else {
4287 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, HiShift});
4288 }
4289 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4290}
4291
4292SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
4293 DAGCombinerInfo &DCI) const {
4294 SDValue RHS = N->getOperand(1);
4295 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4296 EVT VT = N->getValueType(0);
4297 SDValue LHS = N->getOperand(0);
4298 SelectionDAG &DAG = DCI.DAG;
4299 SDLoc SL(N);
4300 unsigned RHSVal;
4301
4302 if (CRHS) {
4303 RHSVal = CRHS->getZExtValue();
4304
4305 // fold (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1)
4306 // this improves the ability to match BFE patterns in isel.
4307 if (LHS.getOpcode() == ISD::AND) {
4308 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
4309 unsigned MaskIdx, MaskLen;
4310 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4311 MaskIdx == RHSVal) {
4312 return DAG.getNode(ISD::AND, SL, VT,
4313 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0),
4314 N->getOperand(1)),
4315 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1),
4316 N->getOperand(1)));
4317 }
4318 }
4319 }
4320 }
4321
4322 if (VT.getScalarType() != MVT::i64)
4323 return SDValue();
4324
4325 // for C >= 32
4326 // i64 (srl x, C) -> (build_pair (srl hi_32(x), C - 32), 0)
4327
4328 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4329 // common case, splitting this into a move and a 32-bit shift is faster and
4330 // the same code size.
4331 KnownBits Known = DAG.computeKnownBits(RHS);
4332
4333 EVT ElementType = VT.getScalarType();
4334 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4335 EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
4336 : TargetScalarType;
4337
4338 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4339 return SDValue();
4340
4341 SDValue ShiftAmt;
4342 if (CRHS) {
4343 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4344 TargetType);
4345 } else {
4346 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4347 const SDValue ShiftMask =
4348 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4349 // This AND instruction will clamp out of bounds shift values.
4350 // It will also be removed during later instruction selection.
4351 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4352 }
4353
4354 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4355 EVT ConcatType;
4356 SDValue Hi;
4357 SDLoc LHSSL(LHS);
4358 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4359 if (VT.isVector()) {
4360 unsigned NElts = TargetType.getVectorNumElements();
4361 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4362 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4363 SmallVector<SDValue, 8> HiOps(NElts);
4364 SmallVector<SDValue, 16> HiAndLoOps;
4365
4366 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, /*Start=*/0, NElts * 2);
4367 for (unsigned I = 0; I != NElts; ++I)
4368 HiOps[I] = HiAndLoOps[2 * I + 1];
4369 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
4370 } else {
4371 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
4372 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4373 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4374 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
4375 }
4376
4377 SDValue NewShift =
4378 DAG.getNode(ISD::SRL, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4379
4380 SDValue Vec;
4381 if (VT.isVector()) {
4382 unsigned NElts = TargetType.getVectorNumElements();
4383 SmallVector<SDValue, 8> LoOps;
4384 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4385
4386 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
4387 for (unsigned I = 0; I != NElts; ++I)
4388 HiAndLoOps[2 * I] = LoOps[I];
4389 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4390 } else {
4391 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, Zero});
4392 }
4393 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4394}
4395
4396SDValue AMDGPUTargetLowering::performTruncateCombine(
4397 SDNode *N, DAGCombinerInfo &DCI) const {
4398 SDLoc SL(N);
4399 SelectionDAG &DAG = DCI.DAG;
4400 EVT VT = N->getValueType(0);
4401 SDValue Src = N->getOperand(0);
4402
4403 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4404 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4405 SDValue Vec = Src.getOperand(0);
4406 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4407 SDValue Elt0 = Vec.getOperand(0);
4408 EVT EltVT = Elt0.getValueType();
4409 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4410 if (EltVT.isFloatingPoint()) {
4411 Elt0 = DAG.getNode(ISD::BITCAST, SL,
4412 EltVT.changeTypeToInteger(), Elt0);
4413 }
4414
4415 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
4416 }
4417 }
4418 }
4419
4420 // Equivalent of above for accessing the high element of a vector as an
4421 // integer operation.
4422 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
4423 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4424 if (auto *K = isConstOrConstSplat(Src.getOperand(1))) {
4425 SDValue BV = stripBitcast(Src.getOperand(0));
4426 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
4427 EVT SrcEltVT = BV.getOperand(0).getValueType();
4428 unsigned SrcEltSize = SrcEltVT.getSizeInBits();
4429 unsigned BitIndex = K->getZExtValue();
4430 unsigned PartIndex = BitIndex / SrcEltSize;
4431
4432 if (PartIndex * SrcEltSize == BitIndex &&
4433 PartIndex < BV.getNumOperands()) {
4434 if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
4435 SDValue SrcElt =
4436 DAG.getNode(ISD::BITCAST, SL, SrcEltVT.changeTypeToInteger(),
4437 BV.getOperand(PartIndex));
4438 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
4439 }
4440 }
4441 }
4442 }
4443 }
4444
4445 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4446 //
4447 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4448 // i16 (trunc (srl (i32 (trunc x), K)))
4449 if (VT.getScalarSizeInBits() < 32) {
4450 EVT SrcVT = Src.getValueType();
4451 if (SrcVT.getScalarSizeInBits() > 32 &&
4452 (Src.getOpcode() == ISD::SRL ||
4453 Src.getOpcode() == ISD::SRA ||
4454 Src.getOpcode() == ISD::SHL)) {
4455 SDValue Amt = Src.getOperand(1);
4456 KnownBits Known = DAG.computeKnownBits(Amt);
4457
4458 // - For left shifts, do the transform as long as the shift
4459 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4460 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4461 // losing information stored in the high bits when truncating.
4462 const unsigned MaxCstSize =
4463 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4464 if (Known.getMaxValue().ule(MaxCstSize)) {
4465 EVT MidVT = VT.isVector() ?
4466 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4467 VT.getVectorNumElements()) : MVT::i32;
4468
4469 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
4470 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
4471 Src.getOperand(0));
4472 DCI.AddToWorklist(Trunc.getNode());
4473
4474 if (Amt.getValueType() != NewShiftVT) {
4475 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
4476 DCI.AddToWorklist(Amt.getNode());
4477 }
4478
4479 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
4480 Trunc, Amt);
4481 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
4482 }
4483 }
4484 }
4485
4486 return SDValue();
4487}
4488
4489// We need to specifically handle i64 mul here to avoid unnecessary conversion
4490// instructions. If we only match on the legalized i64 mul expansion,
4491// SimplifyDemandedBits will be unable to remove them because there will be
4492// multiple uses due to the separate mul + mulh[su].
4493static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4494 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4495 if (Size <= 32) {
4496 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4497 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
4498 }
4499
4500 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4501 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4502
4503 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
4504 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
4505
4506 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
4507}
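// [Annotation, not part of the upstream source] For results wider than 32 bits
// this pairs the two 24-bit multiplies: MUL_U24/MUL_I24 produces the low 32
// bits and MULHI_U24/MULHI_I24 the high 32 bits, and BUILD_PAIR reassembles
// them into the i64 product of the (at most) 24-bit inputs.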
4508
4509/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4510/// return SDValue().
4511static SDValue getAddOneOp(const SDNode *V) {
4512 if (V->getOpcode() != ISD::ADD)
4513 return SDValue();
4514
4515 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
4516}
4517
4518SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
4519 DAGCombinerInfo &DCI) const {
4520 assert(N->getOpcode() == ISD::MUL);
4521 EVT VT = N->getValueType(0);
4522
4523 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4524 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4525 // unnecessarily). isDivergent() is used as an approximation of whether the
4526 // value is in an SGPR.
4527 if (!N->isDivergent())
4528 return SDValue();
4529
4530 unsigned Size = VT.getSizeInBits();
4531 if (VT.isVector() || Size > 64)
4532 return SDValue();
4533
4534 SelectionDAG &DAG = DCI.DAG;
4535 SDLoc DL(N);
4536
4537 SDValue N0 = N->getOperand(0);
4538 SDValue N1 = N->getOperand(1);
4539
4540 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
4541 // matching.
4542
4543 // mul x, (add y, 1) -> add (mul x, y), x
4544 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4545 SDValue AddOp = getAddOneOp(V.getNode());
4546 if (!AddOp)
4547 return SDValue();
4548
4549 if (V.hasOneUse() || all_of(V->users(), [](const SDNode *U) -> bool {
4550 return U->getOpcode() == ISD::MUL;
4551 }))
4552 return AddOp;
4553
4554 return SDValue();
4555 };
4556
4557 // FIXME: The selection pattern is not properly checking for commuted
4558 // operands, so we have to place the mul in the LHS
4559 if (SDValue MulOper = IsFoldableAdd(N0)) {
4560 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
4561 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
4562 }
4563
4564 if (SDValue MulOper = IsFoldableAdd(N1)) {
4565 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
4566 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
4567 }
4568
4569 // There are i16 integer mul/mad.
4570 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
4571 return SDValue();
4572
4573 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4574 // in the source into any_extends if the result of the mul is truncated. Since
4575 // we can assume the high bits are whatever we want, use the underlying value
4576 // to keep the unknown high bits from interfering.
4577 if (N0.getOpcode() == ISD::ANY_EXTEND)
4578 N0 = N0.getOperand(0);
4579
4580 if (N1.getOpcode() == ISD::ANY_EXTEND)
4581 N1 = N1.getOperand(0);
4582
4583 SDValue Mul;
4584
4585 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4586 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4587 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4588 Mul = getMul24(DAG, DL, N0, N1, Size, false);
4589 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4590 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4591 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4592 Mul = getMul24(DAG, DL, N0, N1, Size, true);
4593 } else {
4594 return SDValue();
4595 }
4596
4597 // We need to use sext even for MUL_U24, because MUL_U24 is used
4598 // for signed multiply of 8 and 16-bit types.
4599 return DAG.getSExtOrTrunc(Mul, DL, VT);
4600}
4601
4602SDValue
4603AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
4604 DAGCombinerInfo &DCI) const {
4605 if (N->getValueType(0) != MVT::i32)
4606 return SDValue();
4607
4608 SelectionDAG &DAG = DCI.DAG;
4609 SDLoc DL(N);
4610
4611 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
4612 SDValue N0 = N->getOperand(0);
4613 SDValue N1 = N->getOperand(1);
4614
4615 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4616 // in the source into any_extends if the result of the mul is truncated. Since
4617 // we can assume the high bits are whatever we want, use the underlying value
4618 // to keep the unknown high bits from interfering.
4619 if (N0.getOpcode() == ISD::ANY_EXTEND)
4620 N0 = N0.getOperand(0);
4621 if (N1.getOpcode() == ISD::ANY_EXTEND)
4622 N1 = N1.getOperand(0);
4623
4624 // Try to use two fast 24-bit multiplies (one for each half of the result)
4625 // instead of one slow extending multiply.
4626 unsigned LoOpcode = 0;
4627 unsigned HiOpcode = 0;
4628 if (Signed) {
4629 if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4630 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4631 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4632 LoOpcode = AMDGPUISD::MUL_I24;
4633 HiOpcode = AMDGPUISD::MULHI_I24;
4634 }
4635 } else {
4636 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4637 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4638 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4639 LoOpcode = AMDGPUISD::MUL_U24;
4640 HiOpcode = AMDGPUISD::MULHI_U24;
4641 }
4642 }
4643 if (!LoOpcode)
4644 return SDValue();
4645
4646 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
4647 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
4648 DCI.CombineTo(N, Lo, Hi);
4649 return SDValue(N, 0);
4650}
4651
4652SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
4653 DAGCombinerInfo &DCI) const {
4654 EVT VT = N->getValueType(0);
4655
4656 if (!Subtarget->hasMulI24() || VT.isVector())
4657 return SDValue();
4658
4659 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4660 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4661 // unnecessarily). isDivergent() is used as an approximation of whether the
4662 // value is in an SGPR.
4663 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4664 // valu op anyway)
4665 if (Subtarget->hasSMulHi() && !N->isDivergent())
4666 return SDValue();
4667
4668 SelectionDAG &DAG = DCI.DAG;
4669 SDLoc DL(N);
4670
4671 SDValue N0 = N->getOperand(0);
4672 SDValue N1 = N->getOperand(1);
4673
4674 if (!isI24(N0, DAG) || !isI24(N1, DAG))
4675 return SDValue();
4676
4677 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4678 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4679
4680 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
4681 DCI.AddToWorklist(Mulhi.getNode());
4682 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
4683}
4684
4685SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
4686 DAGCombinerInfo &DCI) const {
4687 EVT VT = N->getValueType(0);
4688
4689 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
4690 return SDValue();
4691
4692 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4693 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4694 // unnecessarily). isDivergent() is used as an approximation of whether the
4695 // value is in an SGPR.
4696 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4697 // valu op anyway)
4698 if (Subtarget->hasSMulHi() && !N->isDivergent())
4699 return SDValue();
4700
4701 SelectionDAG &DAG = DCI.DAG;
4702 SDLoc DL(N);
4703
4704 SDValue N0 = N->getOperand(0);
4705 SDValue N1 = N->getOperand(1);
4706
4707 if (!isU24(N0, DAG) || !isU24(N1, DAG))
4708 return SDValue();
4709
4710 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4711 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4712
4713 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
4714 DCI.AddToWorklist(Mulhi.getNode());
4715 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
4716}
4717
4718SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4719 SDValue Op,
4720 const SDLoc &DL,
4721 unsigned Opc) const {
4722 EVT VT = Op.getValueType();
4723 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
4724 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
4725 LegalVT != MVT::i16))
4726 return SDValue();
4727
4728 if (VT != MVT::i32)
4729 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
4730
4731 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
4732 if (VT != MVT::i32)
4733 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
4734
4735 return FFBX;
4736}
4737
4738// The native instructions return -1 on 0 input. Optimize out a select that
4739// produces -1 on 0.
4740//
4741// TODO: If zero is not undef, we could also do this if the output is compared
4742// against the bitwidth.
4743//
4744// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
4745SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
4746 SDValue LHS, SDValue RHS,
4747 DAGCombinerInfo &DCI) const {
4748 if (!isNullConstant(Cond.getOperand(1)))
4749 return SDValue();
4750
4751 SelectionDAG &DAG = DCI.DAG;
4752 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
4753 SDValue CmpLHS = Cond.getOperand(0);
4754
4755 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4756 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4757 if (CCOpcode == ISD::SETEQ &&
4758 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
4759 RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
4760 unsigned Opc =
4761 isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4762 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4763 }
4764
4765 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4766 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4767 if (CCOpcode == ISD::SETNE &&
4768 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
4769 LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
4770 unsigned Opc =
4771 isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4772
4773 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4774 }
4775
4776 return SDValue();
4777}
4778
4779static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
4780 unsigned Op,
4781 const SDLoc &SL,
4782 SDValue Cond,
4783 SDValue N1,
4784 SDValue N2) {
4785 SelectionDAG &DAG = DCI.DAG;
4786 EVT VT = N1.getValueType();
4787
4788 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
4789 N1.getOperand(0), N2.getOperand(0));
4790 DCI.AddToWorklist(NewSelect.getNode());
4791 return DAG.getNode(Op, SL, VT, NewSelect);
4792}
4793
4794// Pull a free FP operation out of a select so it may fold into uses.
4795//
4796// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
4797// select c, (fneg x), k -> fneg (select c, x, (fneg k))
4798//
4799// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
4800// select c, (fabs x), +k -> fabs (select c, x, k)
4801SDValue
4802AMDGPUTargetLowering::foldFreeOpFromSelect(DAGCombinerInfo &DCI,
4803 SDValue N) const {
4804 SelectionDAG &DAG = DCI.DAG;
4805 SDValue Cond = N.getOperand(0);
4806 SDValue LHS = N.getOperand(1);
4807 SDValue RHS = N.getOperand(2);
4808
4809 EVT VT = N.getValueType();
4810 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
4811 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
4812 if (!allUsesHaveSourceMods(N.getNode()))
4813 return SDValue();
4814
4815 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
4816 SDLoc(N), Cond, LHS, RHS);
4817 }
4818
4819 bool Inv = false;
4820 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
4821 std::swap(LHS, RHS);
4822 Inv = true;
4823 }
4824
4825 // TODO: Support vector constants.
4826 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
4827 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
4828 !selectSupportsSourceMods(N.getNode())) {
4829 SDLoc SL(N);
4830 // If one side is an fneg/fabs and the other is a constant, we can push the
4831 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
4832 SDValue NewLHS = LHS.getOperand(0);
4833 SDValue NewRHS = RHS;
4834
4835 // Careful: if the neg can be folded up, don't try to pull it back down.
4836 bool ShouldFoldNeg = true;
4837
4838 if (NewLHS.hasOneUse()) {
4839 unsigned Opc = NewLHS.getOpcode();
4840 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
4841 ShouldFoldNeg = false;
4842 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
4843 ShouldFoldNeg = false;
4844 }
4845
4846 if (ShouldFoldNeg) {
4847 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
4848 return SDValue();
4849
4850 // We're going to be forced to use a source modifier anyway, there's no
4851 // point in pulling the negate out unless we can get a size reduction by
4852 // negating the constant.
4853 //
4854 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
4855 // about cheaper constants.
4856 if (NewLHS.getOpcode() == ISD::FABS &&
4857 getConstantNegateCost(CRHS) != NegatibleCost::Cheaper)
4858 return SDValue();
4859
4860 if (!allUsesHaveSourceMods(N.getNode()))
4861 return SDValue();
4862
4863 if (LHS.getOpcode() == ISD::FNEG)
4864 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4865
4866 if (Inv)
4867 std::swap(NewLHS, NewRHS);
4868
4869 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
4870 Cond, NewLHS, NewRHS);
4871 DCI.AddToWorklist(NewSelect.getNode());
4872 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
4873 }
4874 }
4875
4876 return SDValue();
4877}
4878
4879SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
4880 DAGCombinerInfo &DCI) const {
4881 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
4882 return Folded;
4883
4884 SDValue Cond = N->getOperand(0);
4885 if (Cond.getOpcode() != ISD::SETCC)
4886 return SDValue();
4887
4888 EVT VT = N->getValueType(0);
4889 SDValue LHS = Cond.getOperand(0);
4890 SDValue RHS = Cond.getOperand(1);
4891 SDValue CC = Cond.getOperand(2);
4892
4893 SDValue True = N->getOperand(1);
4894 SDValue False = N->getOperand(2);
4895
4896 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
4897 SelectionDAG &DAG = DCI.DAG;
4898 if (DAG.isConstantValueOfAnyType(True) &&
4899 !DAG.isConstantValueOfAnyType(False)) {
4900 // Swap cmp + select pair to move constant to false input.
4901 // This will allow using VOPC cndmasks more often.
4902 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
4903
4904 SDLoc SL(N);
4905 ISD::CondCode NewCC =
4906 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
4907
4908 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
4909 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
4910 }
4911
4912 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
4913 SDValue MinMax
4914 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
4915 // Revisit this node so we can catch min3/max3/med3 patterns.
4916 //DCI.AddToWorklist(MinMax.getNode());
4917 return MinMax;
4918 }
4919 }
4920
4921 // There's no reason to not do this if the condition has other uses.
4922 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
4923}
4924
4925static bool isInv2Pi(const APFloat &APF) {
4926 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
4927 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
4928 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
4929
4930 return APF.bitwiseIsEqual(KF16) ||
4931 APF.bitwiseIsEqual(KF32) ||
4932 APF.bitwiseIsEqual(KF64);
4933}
4934
4935// 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
4936// additional cost to negate them.
4937TargetLowering::NegatibleCost
4938AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
4939 if (C->isZero())
4940 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4941
4942 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
4943 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4944
4945 return NegatibleCost::Neutral;
4946}
4947
4948bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
4949 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4950 return getConstantNegateCost(C) == NegatibleCost::Expensive;
4951 return false;
4952}
4953
4954bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
4955 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4956 return getConstantNegateCost(C) == NegatibleCost::Cheaper;
4957 return false;
4958}
4959
4960static unsigned inverseMinMax(unsigned Opc) {
4961 switch (Opc) {
4962 case ISD::FMAXNUM:
4963 return ISD::FMINNUM;
4964 case ISD::FMINNUM:
4965 return ISD::FMAXNUM;
4966 case ISD::FMAXNUM_IEEE:
4967 return ISD::FMINNUM_IEEE;
4968 case ISD::FMINNUM_IEEE:
4969 return ISD::FMAXNUM_IEEE;
4970 case ISD::FMAXIMUM:
4971 return ISD::FMINIMUM;
4972 case ISD::FMINIMUM:
4973 return ISD::FMAXIMUM;
4974 case ISD::FMAXIMUMNUM:
4975 return ISD::FMINIMUMNUM;
4976 case ISD::FMINIMUMNUM:
4977 return ISD::FMAXIMUMNUM;
4978 case AMDGPUISD::FMAX_LEGACY:
4979 return AMDGPUISD::FMIN_LEGACY;
4980 case AMDGPUISD::FMIN_LEGACY:
4981 return AMDGPUISD::FMAX_LEGACY;
4982 default:
4983 llvm_unreachable("invalid min/max opcode");
4984 }
4985}
4986
4987/// \return true if it's profitable to try to push an fneg into its source
4988/// instruction.
4990 // If the input has multiple uses and we can either fold the negate down, or
4991 // the other uses cannot, give up. This both prevents unprofitable
4992 // transformations and infinite loops: we won't repeatedly try to fold around
4993 // a negate that has no 'good' form.
4994 if (N0.hasOneUse()) {
4995 // This may be able to fold into the source, but at a code size cost. Don't
4996 // fold if the fold into the user is free.
4997 if (allUsesHaveSourceMods(N, 0))
4998 return false;
4999 } else {
5000 if (fnegFoldsIntoOp(N0.getNode()) &&
5002 return false;
5003 }
5004
5005 return true;
5006}
5007
5008SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
5009 DAGCombinerInfo &DCI) const {
5010 SelectionDAG &DAG = DCI.DAG;
5011 SDValue N0 = N->getOperand(0);
5012 EVT VT = N->getValueType(0);
5013
5014 unsigned Opc = N0.getOpcode();
5015
5016 if (!shouldFoldFNegIntoSrc(N, N0))
5017 return SDValue();
5018
5019 SDLoc SL(N);
5020 switch (Opc) {
5021 case ISD::FADD: {
5022 if (!mayIgnoreSignedZero(N0))
5023 return SDValue();
5024
5025 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
5026 SDValue LHS = N0.getOperand(0);
5027 SDValue RHS = N0.getOperand(1);
5028
5029 if (LHS.getOpcode() != ISD::FNEG)
5030 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
5031 else
5032 LHS = LHS.getOperand(0);
5033
5034 if (RHS.getOpcode() != ISD::FNEG)
5035 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5036 else
5037 RHS = RHS.getOperand(0);
5038
5039 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
5040 if (Res.getOpcode() != ISD::FADD)
5041 return SDValue(); // Op got folded away.
5042 if (!N0.hasOneUse())
5043 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5044 return Res;
5045 }
5046 case ISD::FMUL:
5047 case AMDGPUISD::FMUL_LEGACY: {
5048 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
5049 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
5050 SDValue LHS = N0.getOperand(0);
5051 SDValue RHS = N0.getOperand(1);
5052
5053 if (LHS.getOpcode() == ISD::FNEG)
5054 LHS = LHS.getOperand(0);
5055 else if (RHS.getOpcode() == ISD::FNEG)
5056 RHS = RHS.getOperand(0);
5057 else
5058 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5059
5060 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
5061 if (Res.getOpcode() != Opc)
5062 return SDValue(); // Op got folded away.
5063 if (!N0.hasOneUse())
5064 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5065 return Res;
5066 }
5067 case ISD::FMA:
5068 case ISD::FMAD: {
5069 // TODO: handle llvm.amdgcn.fma.legacy
5070 if (!mayIgnoreSignedZero(N0))
5071 return SDValue();
5072
5073 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
5074 SDValue LHS = N0.getOperand(0);
5075 SDValue MHS = N0.getOperand(1);
5076 SDValue RHS = N0.getOperand(2);
5077
5078 if (LHS.getOpcode() == ISD::FNEG)
5079 LHS = LHS.getOperand(0);
5080 else if (MHS.getOpcode() == ISD::FNEG)
5081 MHS = MHS.getOperand(0);
5082 else
5083 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
5084
5085 if (RHS.getOpcode() != ISD::FNEG)
5086 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5087 else
5088 RHS = RHS.getOperand(0);
5089
5090 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
5091 if (Res.getOpcode() != Opc)
5092 return SDValue(); // Op got folded away.
5093 if (!N0.hasOneUse())
5094 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5095 return Res;
5096 }
5097 case ISD::FMAXNUM:
5098 case ISD::FMINNUM:
5099 case ISD::FMAXNUM_IEEE:
5100 case ISD::FMINNUM_IEEE:
5101 case ISD::FMINIMUM:
5102 case ISD::FMAXIMUM:
5103 case ISD::FMINIMUMNUM:
5104 case ISD::FMAXIMUMNUM:
5105 case AMDGPUISD::FMAX_LEGACY:
5106 case AMDGPUISD::FMIN_LEGACY: {
5107 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
5108 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
5109 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
5110 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
5111
5112 SDValue LHS = N0.getOperand(0);
5113 SDValue RHS = N0.getOperand(1);
5114
5115 // 0 doesn't have a negated inline immediate.
5116 // TODO: This constant check should be generalized to other operations.
5117 if (isConstantCostlierToNegate(RHS))
5118 return SDValue();
5119
5120 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
5121 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5122 unsigned Opposite = inverseMinMax(Opc);
5123
5124 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
5125 if (Res.getOpcode() != Opposite)
5126 return SDValue(); // Op got folded away.
5127 if (!N0.hasOneUse())
5128 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5129 return Res;
5130 }
5131 case AMDGPUISD::FMED3: {
5132 SDValue Ops[3];
5133 for (unsigned I = 0; I < 3; ++I)
5134 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
5135
5136 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
5137 if (Res.getOpcode() != AMDGPUISD::FMED3)
5138 return SDValue(); // Op got folded away.
5139
5140 if (!N0.hasOneUse()) {
5141 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
5142 DAG.ReplaceAllUsesWith(N0, Neg);
5143
5144 for (SDNode *U : Neg->users())
5145 DCI.AddToWorklist(U);
5146 }
5147
5148 return Res;
5149 }
5150 case ISD::FP_EXTEND:
5151 case ISD::FTRUNC:
5152 case ISD::FRINT:
5153 case ISD::FNEARBYINT: // XXX - Should fround be handled?
5154 case ISD::FROUNDEVEN:
5155 case ISD::FSIN:
5156 case ISD::FCANONICALIZE:
5157 case AMDGPUISD::RCP:
5158 case AMDGPUISD::RCP_LEGACY:
5159 case AMDGPUISD::RCP_IFLAG:
5160 case AMDGPUISD::SIN_HW: {
5161 SDValue CvtSrc = N0.getOperand(0);
5162 if (CvtSrc.getOpcode() == ISD::FNEG) {
5163 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
5164 // (fneg (rcp (fneg x))) -> (rcp x)
5165 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
5166 }
5167
5168 if (!N0.hasOneUse())
5169 return SDValue();
5170
5171 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
5172 // (fneg (rcp x)) -> (rcp (fneg x))
5173 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
5174 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
5175 }
5176 case ISD::FP_ROUND: {
5177 SDValue CvtSrc = N0.getOperand(0);
5178
5179 if (CvtSrc.getOpcode() == ISD::FNEG) {
5180 // (fneg (fp_round (fneg x))) -> (fp_round x)
5181 return DAG.getNode(ISD::FP_ROUND, SL, VT,
5182 CvtSrc.getOperand(0), N0.getOperand(1));
5183 }
5184
5185 if (!N0.hasOneUse())
5186 return SDValue();
5187
5188 // (fneg (fp_round x)) -> (fp_round (fneg x))
5189 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
5190 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
5191 }
5192 case ISD::FP16_TO_FP: {
5193 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
5194 // f16, but legalization of f16 fneg ends up pulling it out of the source.
5195 // Put the fneg back as a legal source operation that can be matched later.
5196 SDLoc SL(N);
5197
5198 SDValue Src = N0.getOperand(0);
5199 EVT SrcVT = Src.getValueType();
5200
5201 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
5202 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
5203 DAG.getConstant(0x8000, SL, SrcVT));
5204 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
5205 }
5206 case ISD::SELECT: {
5207 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
5208 // TODO: Invert conditions of foldFreeOpFromSelect
5209 return SDValue();
5210 }
5211 case ISD::BITCAST: {
5212 SDLoc SL(N);
5213 SDValue BCSrc = N0.getOperand(0);
5214 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
5215 SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
5216 if (HighBits.getValueType().getSizeInBits() != 32 ||
5217 !fnegFoldsIntoOp(HighBits.getNode()))
5218 return SDValue();
5219
5220 // f64 fneg only really needs to operate on the high half of the
5221 // register, so try to force it to an f32 operation to help make use of
5222 // source modifiers.
5223 //
5224 //
5225 // fneg (f64 (bitcast (build_vector x, y))) ->
5226 // f64 (bitcast (build_vector (bitcast i32:x to f32),
5227 // (fneg (bitcast i32:y to f32)))
5228
5229 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
5230 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
5231 SDValue CastBack =
5232 DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);
5233
5235 Ops.back() = CastBack;
5236 DCI.AddToWorklist(NegHi.getNode());
5237 SDValue Build =
5238 DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
5239 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);
5240
5241 if (!N0.hasOneUse())
5242 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
5243 return Result;
5244 }
5245
5246 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
5247 BCSrc.hasOneUse()) {
5248 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
5249 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
5250
5251 // TODO: Cast back result for multiple uses is beneficial in some cases.
5252
5253 SDValue LHS =
5254 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
5255 SDValue RHS =
5256 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
5257
5258 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
5259 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
5260
5261 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
5262 NegRHS);
5263 }
5264
5265 return SDValue();
5266 }
5267 default:
5268 return SDValue();
5269 }
5270}
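// A rough sketch of why the fneg push-down above pays off on this target. The
// instruction mnemonics are illustrative of a typical VALU encoding and are
// assumptions, not taken from this file: after the
// (fneg (fmul x, y)) -> (fmul x, (fneg y)) rewrite, the negate is expected to
// fold into a free source modifier, e.g.
//
//   %m = fmul float %x, %y    ; v_mul_f32 v2, v0, v1
//   %n = fneg float %m        ; plus a separate v_xor_b32 with 0x80000000
//
// becomes
//
//   %ny = fneg float %y
//   %n = fmul float %x, %ny   ; v_mul_f32 v2, v0, -v1  (modifier, no extra op)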
5271
5272SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
5273 DAGCombinerInfo &DCI) const {
5274 SelectionDAG &DAG = DCI.DAG;
5275 SDValue N0 = N->getOperand(0);
5276
5277 if (!N0.hasOneUse())
5278 return SDValue();
5279
5280 switch (N0.getOpcode()) {
5281 case ISD::FP16_TO_FP: {
5282 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
5283 SDLoc SL(N);
5284 SDValue Src = N0.getOperand(0);
5285 EVT SrcVT = Src.getValueType();
5286
5287 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
5288 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
5289 DAG.getConstant(0x7fff, SL, SrcVT));
5290 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
5291 }
5292 default:
5293 return SDValue();
5294 }
5295}
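// A small worked example of the fabs bit trick above, with an assumed input
// value: the i16 half encoding of -3.0 is 0xc200, and 0xc200 & 0x7fff = 0x4200,
// which is +3.0, so clearing bit 15 before the FP16_TO_FP conversion gives the
// same result as fabs on the extended value.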
5296
5297SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
5298 DAGCombinerInfo &DCI) const {
5299 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
5300 if (!CFP)
5301 return SDValue();
5302
5303 // XXX - Should this flush denormals?
5304 const APFloat &Val = CFP->getValueAPF();
5305 APFloat One(Val.getSemantics(), "1.0");
5306 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
5307}
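// A minimal worked example of the constant fold above (the IR is an
// illustration, assuming the intrinsic has already been selected to an
// AMDGPUISD::RCP node):
//
//   %r = call float @llvm.amdgcn.rcp.f32(float 2.0)
//
// folds to the constant 0.5; as noted in the XXX above, no denormal flushing
// is applied to the folded result.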
5308
5309SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
5310 DAGCombinerInfo &DCI) const {
5311 SelectionDAG &DAG = DCI.DAG;
5312 SDLoc DL(N);
5313
5314 switch(N->getOpcode()) {
5315 default:
5316 break;
5317 case ISD::BITCAST: {
5318 EVT DestVT = N->getValueType(0);
5319
5320 // Push casts through vector builds. This helps avoid emitting a large
5321 // number of copies when materializing floating point vector constants.
5322 //
5323 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
5324 // vNt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5325 if (DestVT.isVector()) {
5326 SDValue Src = N->getOperand(0);
5327 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5330 EVT SrcVT = Src.getValueType();
5331 unsigned NElts = DestVT.getVectorNumElements();
5332
5333 if (SrcVT.getVectorNumElements() == NElts) {
5334 EVT DestEltVT = DestVT.getVectorElementType();
5335
5336 SmallVector<SDValue, 8> CastedElts;
5337 SDLoc SL(N);
5338 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5339 SDValue Elt = Src.getOperand(I);
5340 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
5341 }
5342
5343 return DAG.getBuildVector(DestVT, SL, CastedElts);
5344 }
5345 }
5346 }
5347
5348 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5349 break;
5350
5351 // Fold bitcasts of constants.
5352 //
5353 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5354 // TODO: Generalize and move to DAGCombiner
5355 SDValue Src = N->getOperand(0);
5356 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
5357 SDLoc SL(N);
5358 uint64_t CVal = C->getZExtValue();
5359 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5360 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5361 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5362 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
5363 }
5364
5365 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
5366 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5367 SDLoc SL(N);
5368 uint64_t CVal = Val.getZExtValue();
5369 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5370 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5371 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5372
5373 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
5374 }
5375
5376 break;
5377 }
5378 case ISD::SHL:
5379 case ISD::SRA:
5380 case ISD::SRL: {
5381 // Range metadata can be invalidated when loads are converted to legal types
5382 // (e.g. v2i64 -> v4i32).
5383 // Try to convert vector shl/sra/srl before type legalization so that range
5384 // metadata can be utilized.
5385 if (!(N->getValueType(0).isVector() &&
5388 break;
5389 if (N->getOpcode() == ISD::SHL)
5390 return performShlCombine(N, DCI);
5391 if (N->getOpcode() == ISD::SRA)
5392 return performSraCombine(N, DCI);
5393 return performSrlCombine(N, DCI);
5394 }
5395 case ISD::TRUNCATE:
5396 return performTruncateCombine(N, DCI);
5397 case ISD::MUL:
5398 return performMulCombine(N, DCI);
5399 case AMDGPUISD::MUL_U24:
5400 case AMDGPUISD::MUL_I24: {
5401 if (SDValue Simplified = simplifyMul24(N, DCI))
5402 return Simplified;
5403 break;
5404 }
5405 case AMDGPUISD::MULHI_I24:
5406 case AMDGPUISD::MULHI_U24:
5407 return simplifyMul24(N, DCI);
5408 case ISD::SMUL_LOHI:
5409 case ISD::UMUL_LOHI:
5410 return performMulLoHiCombine(N, DCI);
5411 case ISD::MULHS:
5412 return performMulhsCombine(N, DCI);
5413 case ISD::MULHU:
5414 return performMulhuCombine(N, DCI);
5415 case ISD::SELECT:
5416 return performSelectCombine(N, DCI);
5417 case ISD::FNEG:
5418 return performFNegCombine(N, DCI);
5419 case ISD::FABS:
5420 return performFAbsCombine(N, DCI);
5421 case AMDGPUISD::BFE_I32:
5422 case AMDGPUISD::BFE_U32: {
5423 assert(!N->getValueType(0).isVector() &&
5424 "Vector handling of BFE not implemented");
5425 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
5426 if (!Width)
5427 break;
5428
5429 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5430 if (WidthVal == 0)
5431 return DAG.getConstant(0, DL, MVT::i32);
5432
5433 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
5434 if (!Offset)
5435 break;
5436
5437 SDValue BitsFrom = N->getOperand(0);
5438 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5439
5440 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5441
5442 if (OffsetVal == 0) {
5443 // This is already sign / zero extended, so try to fold away extra BFEs.
5444 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
5445
5446 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
5447 if (OpSignBits >= SignBits)
5448 return BitsFrom;
5449
5450 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
5451 if (Signed) {
5452 // This is a sign_extend_inreg. Replace it to take advantage of existing
5453 // DAG Combines. If not eliminated, we will match back to BFE during
5454 // selection.
5455
5456 // TODO: The sext_inreg of extended types ends up getting expanded, although
5457 // we could handle them in a single BFE.
5458 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
5459 DAG.getValueType(SmallVT));
5460 }
5461
5462 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
5463 }
5464
5465 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
5466 if (Signed) {
5467 return constantFoldBFE<int32_t>(DAG,
5468 CVal->getSExtValue(),
5469 OffsetVal,
5470 WidthVal,
5471 DL);
5472 }
5473
5474 return constantFoldBFE<uint32_t>(DAG,
5475 CVal->getZExtValue(),
5476 OffsetVal,
5477 WidthVal,
5478 DL);
5479 }
5480
5481 if ((OffsetVal + WidthVal) >= 32 &&
5482 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
5483 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
5484 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
5485 BitsFrom, ShiftVal);
5486 }
5487
5488 if (BitsFrom.hasOneUse()) {
5489 APInt Demanded = APInt::getBitsSet(32,
5490 OffsetVal,
5491 OffsetVal + WidthVal);
5492
5493 KnownBits Known;
5494 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
5495 !DCI.isBeforeLegalizeOps());
5496 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5497 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
5498 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
5499 DCI.CommitTargetLoweringOpt(TLO);
5500 }
5501 }
5502
5503 break;
5504 }
5505 case ISD::LOAD:
5506 return performLoadCombine(N, DCI);
5507 case ISD::STORE:
5508 return performStoreCombine(N, DCI);
5509 case AMDGPUISD::RCP:
5510 case AMDGPUISD::RCP_IFLAG:
5511 return performRcpCombine(N, DCI);
5512 case ISD::AssertZext:
5513 case ISD::AssertSext:
5514 return performAssertSZExtCombine(N, DCI);
5515 case ISD::INTRINSIC_WO_CHAIN:
5516 return performIntrinsicWOChainCombine(N, DCI);
5517 case AMDGPUISD::FMAD_FTZ: {
5518 SDValue N0 = N->getOperand(0);
5519 SDValue N1 = N->getOperand(1);
5520 SDValue N2 = N->getOperand(2);
5521 EVT VT = N->getValueType(0);
5522
5523 // FMAD_FTZ is an FMAD + flush denormals to zero.
5524 // We flush the inputs, the intermediate step, and the output.
5525 const ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
5526 const ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
5527 const ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
5528 if (N0CFP && N1CFP && N2CFP) {
5529 const auto FTZ = [](const APFloat &V) {
5530 if (V.isDenormal()) {
5531 APFloat Zero(V.getSemantics(), 0);
5532 return V.isNegative() ? -Zero : Zero;
5533 }
5534 return V;
5535 };
5536
5537 APFloat V0 = FTZ(N0CFP->getValueAPF());
5538 APFloat V1 = FTZ(N1CFP->getValueAPF());
5539 APFloat V2 = FTZ(N2CFP->getValueAPF());
5540 V0.multiply(V1, APFloat::rmNearestTiesToEven);
5541 V0 = FTZ(V0);
5542 V0.add(V2, APFloat::rmNearestTiesToEven);
5543 return DAG.getConstantFP(FTZ(V0), DL, VT);
5544 }
5545 break;
5546 }
5547 }
5548 return SDValue();
5549}
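// A worked example of the bitcast-of-constant fold handled above (the constant
// is hypothetical): for (v2i32 (bitcast (i64 0x0000000100000002))), Lo_32
// gives 0x00000002 and Hi_32 gives 0x00000001, so the node becomes
// (build_vector (i32 2), (i32 1)), i.e. the low element holds the low half.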
5550
5551//===----------------------------------------------------------------------===//
5552// Helper functions
5553//===----------------------------------------------------------------------===//
5554
5555SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
5556 const TargetRegisterClass *RC,
5557 Register Reg, EVT VT,
5558 const SDLoc &SL,
5559 bool RawReg) const {
5562 Register VReg;
5563
5564 if (!MRI.isLiveIn(Reg)) {
5565 VReg = MRI.createVirtualRegister(RC);
5566 MRI.addLiveIn(Reg, VReg);
5567 } else {
5568 VReg = MRI.getLiveInVirtReg(Reg);
5569 }
5570
5571 if (RawReg)
5572 return DAG.getRegister(VReg, VT);
5573
5574 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
5575}
5576
5577// This may be called multiple times, and nothing prevents creating multiple
5578// objects at the same offset. See if we already defined this object.
5579static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
5580 int64_t Offset) {
5581 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5582 if (MFI.getObjectOffset(I) == Offset) {
5583 assert(MFI.getObjectSize(I) == Size);
5584 return I;
5585 }
5586 }
5587
5588 return MFI.CreateFixedObject(Size, Offset, true);
5589}
5590
5591SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
5592 EVT VT,
5593 const SDLoc &SL,
5594 int64_t Offset) const {
5595 MachineFunction &MF = DAG.getMachineFunction();
5596 MachineFrameInfo &MFI = MF.getFrameInfo();
5597 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
5598
5599 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5600 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
5601
5602 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
5605}
5606
5607SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
5608 const SDLoc &SL,
5609 SDValue Chain,
5610 SDValue ArgVal,
5611 int64_t Offset) const {
5615
5616 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
5617 // Stores to the argument stack area are relative to the stack pointer.
5618 SDValue SP =
5619 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
5620 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
5621 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
5623 return Store;
5624}
5625
5626SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
5627 const TargetRegisterClass *RC,
5628 EVT VT, const SDLoc &SL,
5629 const ArgDescriptor &Arg) const {
5630 assert(Arg && "Attempting to load missing argument");
5631
5632 SDValue V = Arg.isRegister() ?
5633 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
5634 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
5635
5636 if (!Arg.isMasked())
5637 return V;
5638
5639 unsigned Mask = Arg.getMask();
5640 unsigned Shift = llvm::countr_zero<unsigned>(Mask);
5641 V = DAG.getNode(ISD::SRL, SL, VT, V,
5642 DAG.getShiftAmountConstant(Shift, VT, SL));
5643 return DAG.getNode(ISD::AND, SL, VT, V,
5644 DAG.getConstant(Mask >> Shift, SL, VT));
5645}
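// A small worked example of the masked-argument decode above, using a
// hypothetical descriptor: for Mask = 0x3ff00000, countr_zero gives Shift = 20,
// so the loaded value V is rewritten as (V >> 20) & 0x3ff, extracting a 10-bit
// field that was packed into bits [29:20] of the input register or stack slot.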
5646
5647uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5648 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5649 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5650 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5651 uint64_t ArgOffset =
5652 alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
5653 switch (Param) {
5654 case FIRST_IMPLICIT:
5655 return ArgOffset;
5656 case PRIVATE_BASE:
5657 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
5658 case SHARED_BASE:
5659 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5660 case QUEUE_PTR:
5661 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5662 }
5663 llvm_unreachable("unexpected implicit parameter type");
5664}
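// A minimal worked example of the offset computation above; the sizes are
// assumptions rather than values taken from a particular ABI: with
// ExplicitKernArgSize = 36, ExplicitArgOffset = 0 and an implicit-arg alignment
// of 8, alignTo(36, 8) = 40, so FIRST_IMPLICIT resolves to byte offset 40 and
// QUEUE_PTR to 40 + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET.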
5665
5671
5672SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
5673 SelectionDAG &DAG, int Enabled,
5674 int &RefinementSteps,
5675 bool &UseOneConstNR,
5676 bool Reciprocal) const {
5677 EVT VT = Operand.getValueType();
5678
5679 if (VT == MVT::f32) {
5680 RefinementSteps = 0;
5681 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
5682 }
5683
5684 // TODO: There is also an f64 rsq instruction, but the documentation is less
5685 // clear on its precision.
5686
5687 return SDValue();
5688}
5689
5690SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
5691 SelectionDAG &DAG, int Enabled,
5692 int &RefinementSteps) const {
5693 EVT VT = Operand.getValueType();
5694
5695 if (VT == MVT::f32) {
5696 // Reciprocal, < 1 ulp error.
5697 //
5698 // This reciprocal approximation converges to < 0.5 ulp error with one
5699 // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
5700
5701 RefinementSteps = 0;
5702 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
5703 }
5704
5705 // TODO: There is also an f64 rcp instruction, but the documentation is less
5706 // clear on its precision.
5707
5708 return SDValue();
5709}
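// For context, the refinement step that RefinementSteps would otherwise
// request from the generic combiner is the usual Newton-Raphson iteration
// (a sketch, not code from this file):
//
//   e  = fma(-a, x0, 1.0);   // e = 1 - a*x0
//   x1 = fma(x0, e, x0);     // x1 = x0*(2 - a*x0)
//
// It is left at 0 above because the hardware reciprocal result is already
// accurate to < 1 ulp, as noted in the comment.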
5710
5711static unsigned workitemIntrinsicDim(unsigned ID) {
5712 switch (ID) {
5713 case Intrinsic::amdgcn_workitem_id_x:
5714 return 0;
5715 case Intrinsic::amdgcn_workitem_id_y:
5716 return 1;
5717 case Intrinsic::amdgcn_workitem_id_z:
5718 return 2;
5719 default:
5720 llvm_unreachable("not a workitem intrinsic");
5721 }
5722}
5723
5724void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
5725 const SDValue Op, KnownBits &Known,
5726 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
5727
5728 Known.resetAll(); // Don't know anything.
5729
5730 unsigned Opc = Op.getOpcode();
5731
5732 switch (Opc) {
5733 default:
5734 break;
5735 case AMDGPUISD::CARRY:
5736 case AMDGPUISD::BORROW: {
5737 Known.Zero = APInt::getHighBitsSet(32, 31);
5738 break;
5739 }
5740
5741 case AMDGPUISD::BFE_I32:
5742 case AMDGPUISD::BFE_U32: {
5743 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5744 if (!CWidth)
5745 return;
5746
5747 uint32_t Width = CWidth->getZExtValue() & 0x1f;
5748
5749 if (Opc == AMDGPUISD::BFE_U32)
5750 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
5751
5752 break;
5753 }
5754 case AMDGPUISD::FP_TO_FP16: {
5755 unsigned BitWidth = Known.getBitWidth();
5756
5757 // High bits are zero.
5758 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
5759 break;
5760 }
5761 case AMDGPUISD::MUL_U24:
5762 case AMDGPUISD::MUL_I24: {
5763 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5764 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5765 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
5766 RHSKnown.countMinTrailingZeros();
5767 Known.Zero.setLowBits(std::min(TrailZ, 32u));
5768 // Skip extra check if all bits are known zeros.
5769 if (TrailZ >= 32)
5770 break;
5771
5772 // Truncate to 24 bits.
5773 LHSKnown = LHSKnown.trunc(24);
5774 RHSKnown = RHSKnown.trunc(24);
5775
5776 if (Opc == AMDGPUISD::MUL_I24) {
5777 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
5778 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
5779 unsigned MaxValBits = LHSValBits + RHSValBits;
5780 if (MaxValBits > 32)
5781 break;
5782 unsigned SignBits = 32 - MaxValBits + 1;
5783 bool LHSNegative = LHSKnown.isNegative();
5784 bool LHSNonNegative = LHSKnown.isNonNegative();
5785 bool LHSPositive = LHSKnown.isStrictlyPositive();
5786 bool RHSNegative = RHSKnown.isNegative();
5787 bool RHSNonNegative = RHSKnown.isNonNegative();
5788 bool RHSPositive = RHSKnown.isStrictlyPositive();
5789
5790 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
5791 Known.Zero.setHighBits(SignBits);
5792 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
5793 Known.One.setHighBits(SignBits);
5794 } else {
5795 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
5796 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
5797 unsigned MaxValBits = LHSValBits + RHSValBits;
5798 if (MaxValBits >= 32)
5799 break;
5800 Known.Zero.setBitsFrom(MaxValBits);
5801 }
5802 break;
5803 }
5804 case AMDGPUISD::PERM: {
5805 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5806 if (!CMask)
5807 return;
5808
5809 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5810 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5811 unsigned Sel = CMask->getZExtValue();
5812
5813 for (unsigned I = 0; I < 32; I += 8) {
5814 unsigned SelBits = Sel & 0xff;
5815 if (SelBits < 4) {
5816 SelBits *= 8;
5817 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5818 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5819 } else if (SelBits < 7) {
5820 SelBits = (SelBits & 3) * 8;
5821 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5822 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5823 } else if (SelBits == 0x0c) {
5824 Known.Zero |= 0xFFull << I;
5825 } else if (SelBits > 0x0c) {
5826 Known.One |= 0xFFull << I;
5827 }
5828 Sel >>= 8;
5829 }
5830 break;
5831 }
5832 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
5833 Known.Zero.setHighBits(24);
5834 break;
5835 }
5836 case AMDGPUISD::BUFFER_LOAD_USHORT: {
5837 Known.Zero.setHighBits(16);
5838 break;
5839 }
5840 case AMDGPUISD::LDS: {
5841 auto *GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
5842 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
5843
5844 Known.Zero.setHighBits(16);
5845 Known.Zero.setLowBits(Log2(Alignment));
5846 break;
5847 }
5848 case AMDGPUISD::SMIN3:
5849 case AMDGPUISD::SMAX3:
5850 case AMDGPUISD::SMED3:
5851 case AMDGPUISD::UMIN3:
5852 case AMDGPUISD::UMAX3:
5853 case AMDGPUISD::UMED3: {
5854 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
5855 if (Known2.isUnknown())
5856 break;
5857
5858 KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5859 if (Known1.isUnknown())
5860 break;
5861
5862 KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5863 if (Known0.isUnknown())
5864 break;
5865
5866 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
5867 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
5868 Known.One = Known0.One & Known1.One & Known2.One;
5869 break;
5870 }
5871 case ISD::INTRINSIC_WO_CHAIN: {
5872 unsigned IID = Op.getConstantOperandVal(0);
5873 switch (IID) {
5874 case Intrinsic::amdgcn_workitem_id_x:
5875 case Intrinsic::amdgcn_workitem_id_y:
5876 case Intrinsic::amdgcn_workitem_id_z: {
5877 unsigned MaxValue = Subtarget->getMaxWorkitemID(
5878 DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
5879 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
5880 break;
5881 }
5882 default:
5883 break;
5884 }
5885 }
5886 }
5887}
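// A worked example of the PERM handling above, with a hypothetical selector:
// Sel = 0x0c0c0c0c picks the constant 0x00 for every output byte, so all 32
// result bits become known zero; a selector byte in the range 0..3 copies the
// known bits of that byte of operand 1, and 4..6 copies from operand 0.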
5888
5889unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
5890 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
5891 unsigned Depth) const {
5892 switch (Op.getOpcode()) {
5893 case AMDGPUISD::BFE_I32: {
5894 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5895 if (!Width)
5896 return 1;
5897
5898 unsigned SignBits = 32 - Width->getZExtValue() + 1;
5899 if (!isNullConstant(Op.getOperand(1)))
5900 return SignBits;
5901
5902 // TODO: Could probably figure something out with non-0 offsets.
5903 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5904 return std::max(SignBits, Op0SignBits);
5905 }
5906
5907 case AMDGPUISD::BFE_U32: {
5908 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5909 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
5910 }
5911
5912 case AMDGPUISD::CARRY:
5913 case AMDGPUISD::BORROW:
5914 return 31;
5915 case AMDGPUISD::BUFFER_LOAD_BYTE:
5916 return 25;
5917 case AMDGPUISD::BUFFER_LOAD_SHORT:
5918 return 17;
5919 case AMDGPUISD::BUFFER_LOAD_UBYTE:
5920 return 24;
5921 case AMDGPUISD::BUFFER_LOAD_USHORT:
5922 return 16;
5923 case AMDGPUISD::FP_TO_FP16:
5924 return 16;
5925 case AMDGPUISD::SMIN3:
5926 case AMDGPUISD::SMAX3:
5927 case AMDGPUISD::SMED3:
5928 case AMDGPUISD::UMIN3:
5929 case AMDGPUISD::UMAX3:
5930 case AMDGPUISD::UMED3: {
5931 unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
5932 if (Tmp2 == 1)
5933 return 1; // Early out.
5934
5935 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
5936 if (Tmp1 == 1)
5937 return 1; // Early out.
5938
5939 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5940 if (Tmp0 == 1)
5941 return 1; // Early out.
5942
5943 return std::min({Tmp0, Tmp1, Tmp2});
5944 }
5945 default:
5946 return 1;
5947 }
5948}
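// A short worked example for the BFE cases above (the widths are
// illustrative): BFE_U32 with a constant width of 8 produces a value in
// [0, 255], so 32 - 8 = 24 high bits are known-zero sign bits; BFE_I32 with
// width 8 and offset 0 sign-extends an 8-bit field, giving 32 - 8 + 1 = 25
// sign bits, possibly more if the source already has them.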
5949
5950unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
5951 GISelValueTracking &Analysis, Register R, const APInt &DemandedElts,
5952 const MachineRegisterInfo &MRI, unsigned Depth) const {
5953 const MachineInstr *MI = MRI.getVRegDef(R);
5954 if (!MI)
5955 return 1;
5956
5957 // TODO: Check range metadata on MMO.
5958 switch (MI->getOpcode()) {
5959 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
5960 return 25;
5961 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
5962 return 17;
5963 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
5964 return 24;
5965 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
5966 return 16;
5967 case AMDGPU::G_AMDGPU_SMED3:
5968 case AMDGPU::G_AMDGPU_UMED3: {
5969 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
5970 unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
5971 if (Tmp2 == 1)
5972 return 1;
5973 unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
5974 if (Tmp1 == 1)
5975 return 1;
5976 unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
5977 if (Tmp0 == 1)
5978 return 1;
5979 return std::min({Tmp0, Tmp1, Tmp2});
5980 }
5981 default:
5982 return 1;
5983 }
5984}
5985
5986bool AMDGPUTargetLowering::canCreateUndefOrPoisonForTargetNode(
5987 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
5988 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
5989 unsigned Opcode = Op.getOpcode();
5990 switch (Opcode) {
5991 case AMDGPUISD::BFE_I32:
5992 case AMDGPUISD::BFE_U32:
5993 return false;
5994 }
5995 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
5996 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
5997}
5998
5999bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(
6000 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN,
6001 unsigned Depth) const {
6002 unsigned Opcode = Op.getOpcode();
6003 switch (Opcode) {
6004 case AMDGPUISD::FMIN_LEGACY:
6005 case AMDGPUISD::FMAX_LEGACY: {
6006 if (SNaN)
6007 return true;
6008
6009 // TODO: Can check no nans on one of the operands for each one, but which
6010 // one?
6011 return false;
6012 }
6013 case AMDGPUISD::FMUL_LEGACY:
6014 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
6015 if (SNaN)
6016 return true;
6017 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
6018 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6019 }
6020 case AMDGPUISD::FMED3:
6021 case AMDGPUISD::FMIN3:
6022 case AMDGPUISD::FMAX3:
6023 case AMDGPUISD::FMINIMUM3:
6024 case AMDGPUISD::FMAXIMUM3:
6025 case AMDGPUISD::FMAD_FTZ: {
6026 if (SNaN)
6027 return true;
6028 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
6029 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6030 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6031 }
6032 case AMDGPUISD::CVT_F32_UBYTE0:
6033 case AMDGPUISD::CVT_F32_UBYTE1:
6034 case AMDGPUISD::CVT_F32_UBYTE2:
6035 case AMDGPUISD::CVT_F32_UBYTE3:
6036 return true;
6037
6038 case AMDGPUISD::RCP:
6039 case AMDGPUISD::RSQ:
6040 case AMDGPUISD::RCP_LEGACY:
6041 case AMDGPUISD::RSQ_CLAMP: {
6042 if (SNaN)
6043 return true;
6044
6045 // TODO: Need an is-known-positive check.
6046 return false;
6047 }
6048 case ISD::FLDEXP:
6049 case AMDGPUISD::FRACT: {
6050 if (SNaN)
6051 return true;
6052 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
6053 }
6054 case AMDGPUISD::DIV_SCALE:
6055 case AMDGPUISD::DIV_FMAS:
6056 case AMDGPUISD::DIV_FIXUP:
6057 // TODO: Refine on operands.
6058 return SNaN;
6059 case AMDGPUISD::SIN_HW:
6060 case AMDGPUISD::COS_HW: {
6061 // TODO: Need check for infinity
6062 return SNaN;
6063 }
6064 case ISD::INTRINSIC_WO_CHAIN: {
6065 unsigned IntrinsicID = Op.getConstantOperandVal(0);
6066 // TODO: Handle more intrinsics
6067 switch (IntrinsicID) {
6068 case Intrinsic::amdgcn_cubeid:
6069 case Intrinsic::amdgcn_cvt_off_f32_i4:
6070 return true;
6071
6072 case Intrinsic::amdgcn_frexp_mant: {
6073 if (SNaN)
6074 return true;
6075 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6076 }
6077 case Intrinsic::amdgcn_cvt_pkrtz: {
6078 if (SNaN)
6079 return true;
6080 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6081 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6082 }
6083 case Intrinsic::amdgcn_rcp:
6084 case Intrinsic::amdgcn_rsq:
6085 case Intrinsic::amdgcn_rcp_legacy:
6086 case Intrinsic::amdgcn_rsq_legacy:
6087 case Intrinsic::amdgcn_rsq_clamp:
6088 case Intrinsic::amdgcn_tanh: {
6089 if (SNaN)
6090 return true;
6091
6092 // TODO: Need an is-known-positive check.
6093 return false;
6094 }
6095 case Intrinsic::amdgcn_trig_preop:
6096 case Intrinsic::amdgcn_fdot2:
6097 // TODO: Refine on operand
6098 return SNaN;
6099 case Intrinsic::amdgcn_fma_legacy:
6100 if (SNaN)
6101 return true;
6102 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6103 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
6104 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
6105 default:
6106 return false;
6107 }
6108 }
6109 default:
6110 return false;
6111 }
6112}
6113
6114bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
6115 Register N0, Register N1) const {
6116 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
6117}
unsigned const MachineRegisterInfo * MRI
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static LLVM_READONLY bool hasSourceMods(const MachineInstr &MI)
static bool isInv2Pi(const APFloat &APF)
static LLVM_READONLY bool opMustUseVOP3Encoding(const MachineInstr &MI, const MachineRegisterInfo &MRI)
returns true if the operation will definitely need to use a 64-bit encoding, and thus will use a VOP3...
static unsigned inverseMinMax(unsigned Opc)
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static unsigned workitemIntrinsicDim(unsigned ID)
static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, int64_t Offset)
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, uint32_t Width, const SDLoc &DL)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static SDValue getAddOneOp(const SDNode *V)
If V is an add of a constant 1, returns the other operand.
static LLVM_READONLY bool selectSupportsSourceMods(const SDNode *N)
Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the type for ISD::SELECT.
static cl::opt< bool > AMDGPUBypassSlowDiv("amdgpu-bypass-slow-div", cl::desc("Skip 64-bit divide for dynamic 32-bit values"), cl::init(true))
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, SDValue N0, SDValue N1, unsigned Size, bool Signed)
static bool fnegFoldsIntoOp(const SDNode *N)
static bool isI24(SDValue Op, SelectionDAG &DAG)
static bool isCttzOpc(unsigned Opc)
static bool isU24(SDValue Op, SelectionDAG &DAG)
static SDValue peekFPSignOps(SDValue Val)
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, unsigned Op, const SDLoc &SL, SDValue Cond, SDValue N1, SDValue N2)
static SDValue peekFNeg(SDValue Val)
static SDValue simplifyMul24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI)
static bool isCtlzOpc(unsigned Opc)
static LLVM_READNONE bool fnegFoldsIntoOpcode(unsigned Opc)
static bool hasVolatileUser(SDNode *Val)
Interface definition of the TargetLowering class that is common to all AMD GPUs.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_READNONE
Definition Compiler.h:315
#define LLVM_READONLY
Definition Compiler.h:322
Provides analysis for querying information about KnownBits during GISel passes.
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
#define T
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition SHA256.cpp:34
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
Value * RHS
Value * LHS
BinaryOperator * Mul
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
void recordNumNamedBarriers(uint32_t GVAddr, unsigned BarCnt)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
static const AMDGPUSubtarget & get(const MachineFunction &MF)
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
Generate Min/Max node.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override
Return the type that should be used to zero or sign extend a zeroext/signext integer return value.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem, unsigned AS) const override
Return true if it is expected to be cheaper to do a store of vector constant with the given size and ...
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool shouldCombineMemoryType(EVT VT) const
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isTruncateFree(EVT Src, EVT Dest) const override
bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override
SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const
TargetLowering::NegatibleCost getConstantNegateCost(const ConstantFPSDNode *C) const
SDValue LowerFLOGUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, bool IsLog10, SDNodeFlags Flags) const
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXPUnsafeImpl(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags, bool IsExp10) const
bool isSDNodeAlwaysUniform(const SDNode *N) const override
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
SDValue LowerFLOG10(SDValue Op, SelectionDAG &DAG) const
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const
unsigned computeNumSignBitsForTargetInstr(GISelValueTracking &Analysis, Register R, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isConstantCheaperToNegate(SDValue N) const
bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, Register N1) const override
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
bool isConstantCostlierToNegate(SDValue N) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const
SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
Emit approx-funcs appropriate lowering for exp10.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtType, EVT ExtVT, std::optional< unsigned > ByteOffset) const override
Return true if it is profitable to reduce a load to a smaller type.
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSelectSupported(SelectSupportKind) const override
bool isZExtFree(Type *Src, Type *Dest) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const
SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool mayIgnoreSignedZero(SDValue Op) const
SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const final
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
std::pair< SDValue, SDValue > splitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HighVT, SelectionDAG &DAG) const
Split a vector value into two parts of types LoVT and HiVT.
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
SDValue LowerFLOGCommon(SDValue Op, SelectionDAG &DAG) const
SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, SDValue N) const
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const
bool isFAbsFree(EVT VT) const override
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
SDValue loadStackInputValue(SelectionDAG &DAG, EVT VT, const SDLoc &SL, int64_t Offset) const
Similar to CreateLiveInRegister, except value maybe loaded from a stack slot rather than passed in a ...
SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
static SDValue stripBitcast(SDValue Val)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override
Return a reciprocal estimate value for the input operand.
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
std::pair< SDValue, SDValue > getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, SDValue Op, SDNodeFlags Flags) const
If denormal handling is required return the scaled input to FLOG2, and the check for denormal range.
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue lowerCTLZResults(SDValue Op, SelectionDAG &DAG) const
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const
SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const
SDValue performMulLoHiCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results) const
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
std::pair< EVT, EVT > getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const
Split a vector type into two parts.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue combineFMinMaxLegacyImpl(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
unsigned getVectorIdxWidth(const DataLayout &) const override
Returns the type to be used for the index operand vector operations.
static const fltSemantics & IEEEsingle()
Definition APFloat.h:296
static const fltSemantics & IEEEdouble()
Definition APFloat.h:297
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
bool bitwiseIsEqual(const APFloat &RHS) const
Definition APFloat.h:1396
opStatus add(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1163
const fltSemantics & getSemantics() const
Definition APFloat.h:1439
opStatus multiply(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1181
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1140
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1080
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1541
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1392
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1386
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition APInt.h:1151
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition APInt.h:1389
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
CCState - This class holds information needed while lowering arguments and return values.
static CCValAssign getCustomMem(unsigned ValNo, MVT ValVT, int64_t Offset, MVT LocVT, LocInfo HTP)
const APFloat & getValueAPF() const
bool isNegative() const
Return true if the value is negative.
uint64_t getZExtValue() const
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
Diagnostic information for unsupported feature in backend.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
iterator_range< arg_iterator > args()
Definition Function.h:890
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
Type * getValueType() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
static auto integer_valuetypes()
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
int getObjectIndexBegin() const
Return the minimum frame object index.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOInvariant
The memory access always returns the same value (or traps).
Flags getFlags() const
Return the raw flags of the source value.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
ArrayRef< SDUse > ops() const
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
iterator_range< user_iterator > users()
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
Get the SDNode which holds the desired result.
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, which starts a new call frame in which InSize bytes are set up inside ...
bool isConstantValueOfAnyType(SDValue N) const
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
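The SelectionDAG builder methods listed above are typically chained together when custom-lowering memory operations. A minimal sketch, not code from this file, assuming Chain, Ptr and PtrInfo are supplied by the caller; the i32 type, the add-by-one, and the 4-byte alignment are illustrative:
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

static SDValue loadIncStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
                            SDValue Ptr, MachinePointerInfo PtrInfo) {
  // Load an i32, add 1 to it, and store the result back to the same address.
  SDValue Load = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo);
  SDValue Inc = DAG.getNode(ISD::ADD, DL, MVT::i32, Load,
                            DAG.getConstant(1, DL, MVT::i32));
  // Use the load's output chain (value #1) so the store is ordered after it.
  return DAG.getStore(Load.getValue(1), DL, Inc, Ptr, PtrInfo, Align(4));
}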
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset=std::nullopt) const
Return true if it is profitable to reduce a load to a smaller type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp to/from int conversion the backend supports.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
SelectSupportKind
Enum that describes what type of support for selects the target has.
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL) const
Returns the type for the shift amount of a shift opcode.
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setSupportsUnalignedAtomics(bool UnalignedSupported)
Sets whether unaligned atomic operations are supported.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
unsigned GatherAllAliasesMaxDepth
Depth that GatherAllAliases should continue looking for chain dependencies when trying to find a more...
NegatibleCost
Enum that specifies when a float negation is beneficial.
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
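Before building a node, target code usually consults the legality queries listed above (isTypeLegal, isOperationLegal, getTargetLoweringInfo). A small self-contained sketch, not code from this file; the i64 multiply query is purely illustrative:
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

static bool canUseNativeMul64(const SelectionDAG &DAG) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT VT = MVT::i64;
  // Only worth forming an i64 MUL if the type is legal and the operation is
  // natively supported; otherwise the legalizer would just expand it again.
  return TLI.isTypeLegal(VT) && TLI.isOperationLegal(ISD::MUL, VT);
}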
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op and try to simplify it, given that only the DemandedBits bits of its result are ever used.
TargetLowering(const TargetLowering &)=delete
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
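The expansion helpers above are usually called from a target's custom lowering when a memory access is too slow or unsupported in its original form. A hedged sketch, not this file's code, that simply delegates an unaligned load to expandUnalignedLoad and repackages the value/chain pair:
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

static SDValue lowerSlowLoad(LoadSDNode *LD, SelectionDAG &DAG) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  // expandUnalignedLoad splits the access and returns {loaded value, new chain}.
  auto [Value, Chain] = TLI.expandUnalignedLoad(LD, DAG);
  return DAG.getMergeValues({Value, Chain}, SDLoc(LD));
}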
Primary interface to the complete machine description for the target machine.
TargetOptions Options
TargetSubtargetInfo - Generic base class for all target subtargets.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
LLVM Value Representation.
Definition Value.h:75
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
bool isIntrinsicAlwaysUniform(unsigned IntrID)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isUniformMMO(const MachineMemOperand *MMO)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition CallingConv.h:47
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:807
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:780
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:593
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:771
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:289
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:841
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:868
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:577
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:832
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:779
@ FCANONICALIZE
Returns the platform-specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:534
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:541
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:784
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:701
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:762
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:642
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:607
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition ISDOpcodes.h:48
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:569
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:838
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:799
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:876
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:724
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:793
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:914
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:736
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:299
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:558
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:947
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:844
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:527
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:549
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
initializer< Ty > init(const Ty &Val)
constexpr double ln2
constexpr double ln10
constexpr float log2ef
Definition MathExtras.h:51
constexpr double log2e
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:532
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
MaybeAlign getAlign(const CallInst &I, unsigned Index)
InstructionCost Cost
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
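ComputeValueVTs above is how an IR type (possibly an aggregate) is flattened into the EVTs that calling-convention code then assigns locations to. A small sketch under the assumption that TLI, DL and Ty describe the function being lowered:
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
using namespace llvm;

static size_t countValueParts(const TargetLowering &TLI, const DataLayout &DL,
                              Type *Ty) {
  SmallVector<EVT, 4> ValueVTs;
  ComputeValueVTs(TLI, DL, Ty, ValueVTs);
  // e.g. the IR type {i32, float} flattens to two EVTs: i32 and f32.
  return ValueVTs.size();
}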
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
int countr_zero(T Val)
Count the number of 0's from the least significant bit up to the most significant bit, stopping at the first 1.
Definition bit.h:202
int countl_zero(T Val)
Count the number of 0's from the most significant bit down to the least significant bit, stopping at the first 1.
Definition bit.h:236
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
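A trivial illustration (with an arbitrary constant) of the 64-bit splitting helpers above, as used whenever a 64-bit value must become two 32-bit halves:
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
using namespace llvm;

static void splitImm64() {
  uint64_t Imm = 0x123456789ABCDEF0ULL; // arbitrary example value
  uint32_t Lo = Lo_32(Imm);             // 0x9ABCDEF0
  uint32_t Hi = Hi_32(Imm);             // 0x12345678
  assert(((uint64_t(Hi) << 32) | Lo) == Imm);
}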
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
CombineLevel
Definition DAGCombine.h:15
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ BeforeLegalizeTypes
Definition DAGCombine.h:16
@ AfterLegalizeTypes
Definition DAGCombine.h:17
To bit_cast(const From &from) noexcept
Definition bit.h:90
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
DWARFExpression::Operation Op
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
APFloat neg(APFloat X)
Returns the negated value of the argument.
Definition APFloat.h:1551
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
static cl::opt< unsigned > CostThreshold("dfa-cost-threshold", cl::desc("Maximum cost accepted for the transformation"), cl::Hidden, cl::init(50))
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:180
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
MCRegister getRegister() const
unsigned getStackOffset() const
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
EVT getPow2VectorType(LLVMContext &Context) const
Widens the length of the given vector EVT up to the nearest power of 2 and returns that type.
Definition ValueTypes.h:477
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition ValueTypes.h:430
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
EVT getRoundIntegerType(LLVMContext &Context) const
Rounds the bit-width of the given integer EVT up to the nearest power of two (and at least to eight),...
Definition ValueTypes.h:419
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition ValueTypes.h:292
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isExtended() const
Test if the given EVT is extended (as opposed to being simple).
Definition ValueTypes.h:142
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition ValueTypes.h:308
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
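The EVT queries above compose naturally. A minimal sketch, not taken from this file, of rounding a possibly odd-sized scalar integer type up to a power-of-two width (e.g. i17 to i32):
#include "llvm/CodeGen/ValueTypes.h"
using namespace llvm;

static EVT getRoundedScalarType(LLVMContext &Ctx, EVT VT) {
  // Vectors are reduced to their element type first; FP types pass through.
  EVT ScalarVT = VT.getScalarType();
  return ScalarVT.isInteger() ? ScalarVT.getRoundIntegerType(Ctx) : ScalarVT;
}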
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition KnownBits.h:108
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:242
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:161
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:296
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:248
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:145
APInt getMinValue() const
Return the minimal unsigned value possible given these KnownBits.
Definition KnownBits.h:129
bool isStrictlyPositive() const
Returns true if this value is known to be positive.
Definition KnownBits.h:114
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:105
unsigned countMaxSignificantBits() const
Returns the maximum number of bits needed to represent all possible signed values with these known bi...
Definition KnownBits.h:269
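A small sketch of how the KnownBits queries above combine with SelectionDAG::computeKnownBits; the 16-bit threshold is illustrative, and Op is assumed to be an integer-typed SDValue:
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;

static bool fitsInUnsigned16(SDValue Op, const SelectionDAG &DAG) {
  KnownBits Known = DAG.computeKnownBits(Op);
  // countMaxActiveBits() is the width needed for the largest value still
  // consistent with the known zero/one bits.
  return Known.countMaxActiveBits() <= 16;
}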
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
These are IR-level optimization flags that may be propagated to SDNodes.
void setAllowContract(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...