LLVM 23.0.0git
AMDGPUISelLowering.cpp
Go to the documentation of this file.
1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
30
31using namespace llvm;
32
33#include "AMDGPUGenCallingConv.inc"
34
36 "amdgpu-bypass-slow-div",
37 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
38 cl::init(true));
39
40// Find a larger type to do a load / store of a vector with.
42 unsigned StoreSize = VT.getStoreSizeInBits();
43 if (StoreSize <= 32)
44 return EVT::getIntegerVT(Ctx, StoreSize);
45
46 if (StoreSize % 32 == 0)
47 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
48
49 return VT;
50}
51
55
57 // In order for this to be a signed 24-bit value, bit 23, must
58 // be a sign bit.
59 return DAG.ComputeMaxSignificantBits(Op);
60}
61
63 const TargetSubtargetInfo &STI,
64 const AMDGPUSubtarget &AMDGPUSTI)
65 : TargetLowering(TM, STI), Subtarget(&AMDGPUSTI) {
66 // Always lower memset, memcpy, and memmove intrinsics to load/store
67  // instructions, rather than generating calls to memset, memcpy or memmove.
71
72 // Enable ganging up loads and stores in the memcpy DAG lowering.
74
75 // Lower floating point store/load to integer store/load to reduce the number
76 // of patterns in tablegen.
78 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
79
81 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
82
84 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
85
87 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
88
90 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
91
93 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
94
96 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
97
99 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
100
102 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
103
104 setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
105 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
106
107 setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
108 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
109
110 setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
111 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
112
113 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
114 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
115
116 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
117 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
118
120 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
121
123 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
124
126 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
127
129 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
130
132 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
133
135 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
136
138 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
139
141 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
142
144 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
145
147 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
148
149 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
150 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
151
152 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
153 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
154
156 AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
157
158 // TODO: Would be better to consume as directly legal
160 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
161
163 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
164
166 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
167
169 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);
170
172 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::v2f32, MVT::i64);
173
175 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);
176
178 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);
179
181 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);
182
184 AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);
185
187 AddPromotedToType(ISD::ATOMIC_STORE, MVT::v2f32, MVT::i64);
188
189 // There are no 64-bit extloads. These should be done as a 32-bit extload and
190 // an extension to 64-bit.
191 for (MVT VT : MVT::integer_valuetypes())
193 Expand);
194
195 for (MVT VT : MVT::integer_valuetypes()) {
196 if (VT == MVT::i64)
197 continue;
198
199 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
200 setLoadExtAction(Op, VT, MVT::i1, Promote);
201 setLoadExtAction(Op, VT, MVT::i8, Legal);
202 setLoadExtAction(Op, VT, MVT::i16, Legal);
203 setLoadExtAction(Op, VT, MVT::i32, Expand);
204 }
205 }
206
208 for (auto MemVT :
209 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
211 Expand);
212
213 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
214 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
215 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
216 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
217 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
218 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
219 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
220 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
221 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
222 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
223 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
224 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
225 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
226 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);
227
228 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
229 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
230 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
231 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
232 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
233 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
234
235 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
236 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
237 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
238 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
239 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
240 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
241 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
242 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
243 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
244 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
245 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
246 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);
247
249 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
250
252 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
253
255 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
256
258 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
259
261 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
262
264 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
265
267 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
268
270 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
271
273 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
274
276 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
277
279 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
280
282 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
283
285 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
286
288 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
289
291 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
292
294 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
295
297 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
298
300 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
301
303 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
304
306 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
307
309 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
310
312 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
313
315 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
316
318 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
319
321 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
322
324 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
325
327 AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);
328
329 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
330 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
331 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
332 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
333
334 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
335 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
336 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
337 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
338
339 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
340 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
341 setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
342 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
343 setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
344 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
345 setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
346 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
347 setTruncStoreAction(MVT::v6f32, MVT::v6f16, Expand);
348 setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
349 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
350 setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
351 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
352 setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
353 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
354
355 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
356 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
357 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
358
359 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
360 setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
361 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
362
363 setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);
364
365 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
366 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
367 setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
368 setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
369 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
370 setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
371 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
372
373 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
374 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
375 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
376 setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
377 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
378
379 setTruncStoreAction(MVT::v5i32, MVT::v5i1, Expand);
380 setTruncStoreAction(MVT::v5i32, MVT::v5i8, Expand);
381 setTruncStoreAction(MVT::v5i32, MVT::v5i16, Expand);
382
383 setTruncStoreAction(MVT::v6i32, MVT::v6i1, Expand);
384 setTruncStoreAction(MVT::v6i32, MVT::v6i8, Expand);
385 setTruncStoreAction(MVT::v6i32, MVT::v6i16, Expand);
386
387 setTruncStoreAction(MVT::v7i32, MVT::v7i1, Expand);
388 setTruncStoreAction(MVT::v7i32, MVT::v7i8, Expand);
389 setTruncStoreAction(MVT::v7i32, MVT::v7i16, Expand);
390
391 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
392 setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
393 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
394
395 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
396 setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
397 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
398 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
399 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
400 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
401 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
402
403 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
404 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
405
407
408 // For R600, this is totally unsupported, just custom lower to produce an
409 // error.
411
412 // Library functions. These default to Expand, but we have instructions
413 // for them.
416 {MVT::f16, MVT::f32}, Legal);
418
420 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
422 {MVT::f16, MVT::f32, MVT::f64}, Expand);
423
426 Custom);
428
429 setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
430
431 setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
432
433 setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
434 Expand);
435
436 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Expand);
437 setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
439
441 Custom);
442
443 setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
444
445 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
446 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
447 // default unless marked custom/legal.
449 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
450 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
451 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
452 MVT::v16f64},
453 Custom);
454
455 // Expand to fneg + fadd.
457
459 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
460 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
461 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
462 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
463 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
464 Custom);
465
468 {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
469 MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
470 MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
471 MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
472 MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
473 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
474 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
475 Custom);
476
478 Expand);
479 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
480
481 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
482 for (MVT VT : ScalarIntVTs) {
483 // These should use [SU]DIVREM, so set them to expand
485 Expand);
486
487 // GPU does not have divrem function for signed or unsigned.
489
490 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
492
494
495 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
497 }
498
499 // The hardware supports 32-bit FSHR, but not FSHL.
501
502 setOperationAction({ISD::ROTL, ISD::ROTR}, {MVT::i32, MVT::i64}, Expand);
503
505
510 MVT::i64, Custom);
512
514 Legal);
515
518 MVT::i64, Custom);
519
520 for (auto VT : {MVT::i8, MVT::i16})
522
523 static const MVT::SimpleValueType VectorIntTypes[] = {
524 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
525 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
526
527 for (MVT VT : VectorIntTypes) {
528 // Expand the following operations for the current type by default.
529 // clang-format off
549 VT, Expand);
550 // clang-format on
551 }
552
553 static const MVT::SimpleValueType FloatVectorTypes[] = {
554 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
555 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
556
557 for (MVT VT : FloatVectorTypes) {
570 VT, Expand);
571 }
572
573 // This causes using an unrolled select operation rather than expansion with
574 // bit operations. This is in general better, but the alternative using BFI
575 // instructions may be better if the select sources are SGPRs.
577 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
578
580 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
581
583 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
584
586 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
587
589 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
590
592 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
593
595 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
596
598 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
599
601 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
602
604 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
605
607 setJumpIsExpensive(true);
608
611
613
614 // We want to find all load dependencies for long chains of stores to enable
615 // merging into very wide vectors. The problem is with vectors with > 4
616 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
617 // vectors are a legal type, even though we have to split the loads
618 // usually. When we can more precisely specify load legality per address
619 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
620 // smarter so that they can figure out what to do in 2 iterations without all
621 // N > 4 stores on the same chain.
623
624 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
625 // about these during lowering.
626 MaxStoresPerMemcpy = 0xffffffff;
627 MaxStoresPerMemmove = 0xffffffff;
628 MaxStoresPerMemset = 0xffffffff;
629
630 // The expansion for 64-bit division is enormous.
632 addBypassSlowDiv(64, 32);
633
644
648}
649
651 const auto Flags = Op.getNode()->getFlags();
652 if (Flags.hasNoSignedZeros())
653 return true;
654
655 return false;
656}
657
658//===----------------------------------------------------------------------===//
659// Target Information
660//===----------------------------------------------------------------------===//
661
663static bool fnegFoldsIntoOpcode(unsigned Opc) {
664 switch (Opc) {
665 case ISD::FADD:
666 case ISD::FSUB:
667 case ISD::FMUL:
668 case ISD::FMA:
669 case ISD::FMAD:
670 case ISD::FMINNUM:
671 case ISD::FMAXNUM:
674 case ISD::FMINIMUM:
675 case ISD::FMAXIMUM:
676 case ISD::FMINIMUMNUM:
677 case ISD::FMAXIMUMNUM:
678 case ISD::SELECT:
679 case ISD::FSIN:
680 case ISD::FTRUNC:
681 case ISD::FRINT:
682 case ISD::FNEARBYINT:
683 case ISD::FROUNDEVEN:
685 case AMDGPUISD::RCP:
686 case AMDGPUISD::RCP_LEGACY:
687 case AMDGPUISD::RCP_IFLAG:
688 case AMDGPUISD::SIN_HW:
689 case AMDGPUISD::FMUL_LEGACY:
690 case AMDGPUISD::FMIN_LEGACY:
691 case AMDGPUISD::FMAX_LEGACY:
692 case AMDGPUISD::FMED3:
693 // TODO: handle llvm.amdgcn.fma.legacy
694 return true;
695 case ISD::BITCAST:
696 llvm_unreachable("bitcast is special cased");
697 default:
698 return false;
699 }
700}
701
702static bool fnegFoldsIntoOp(const SDNode *N) {
703 unsigned Opc = N->getOpcode();
704 if (Opc == ISD::BITCAST) {
705 // TODO: Is there a benefit to checking the conditions performFNegCombine
706 // does? We don't for the other cases.
707 SDValue BCSrc = N->getOperand(0);
708 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
709 return BCSrc.getNumOperands() == 2 &&
710 BCSrc.getOperand(1).getValueSizeInBits() == 32;
711 }
712
713 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
714 }
715
716 return fnegFoldsIntoOpcode(Opc);
717}
718
719/// \returns true if the operation will definitely need to use a 64-bit
720/// encoding, and thus will use a VOP3 encoding regardless of the source
721/// modifiers.
723static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
724 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
725 VT == MVT::f64;
726}
727
728/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the
729/// type for ISD::SELECT.
731static bool selectSupportsSourceMods(const SDNode *N) {
732 // TODO: Only applies if select will be vector
733 return N->getValueType(0) == MVT::f32;
734}
735
736// Most FP instructions support source modifiers, but this could be refined
737// slightly.
739static bool hasSourceMods(const SDNode *N) {
740 if (isa<MemSDNode>(N))
741 return false;
742
743 switch (N->getOpcode()) {
744 case ISD::CopyToReg:
745 case ISD::FDIV:
746 case ISD::FREM:
747 case ISD::INLINEASM:
749 case AMDGPUISD::DIV_SCALE:
751
752 // TODO: Should really be looking at the users of the bitcast. These are
753 // problematic because bitcasts are used to legalize all stores to integer
754 // types.
755 case ISD::BITCAST:
756 return false;
758 switch (N->getConstantOperandVal(0)) {
759 case Intrinsic::amdgcn_interp_p1:
760 case Intrinsic::amdgcn_interp_p2:
761 case Intrinsic::amdgcn_interp_mov:
762 case Intrinsic::amdgcn_interp_p1_f16:
763 case Intrinsic::amdgcn_interp_p2_f16:
764 return false;
765 default:
766 return true;
767 }
768 }
769 case ISD::SELECT:
771 default:
772 return true;
773 }
774}
775
777 unsigned CostThreshold) {
778 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
779 // it is truly free to use a source modifier in all cases. If there are
780 // multiple users but for each one will necessitate using VOP3, there will be
781 // a code size increase. Try to avoid increasing code size unless we know it
782 // will save on the instruction count.
783 unsigned NumMayIncreaseSize = 0;
784 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
785
786 assert(!N->use_empty());
787
788 // XXX - Should this limit number of uses to check?
789 for (const SDNode *U : N->users()) {
790 if (!hasSourceMods(U))
791 return false;
792
793 if (!opMustUseVOP3Encoding(U, VT)) {
794 if (++NumMayIncreaseSize > CostThreshold)
795 return false;
796 }
797 }
798
799 return true;
800}
801
803 ISD::NodeType ExtendKind) const {
804 assert(!VT.isVector() && "only scalar expected");
805
806 // Round to the next multiple of 32-bits.
807 unsigned Size = VT.getSizeInBits();
808 if (Size <= 32)
809 return MVT::i32;
810 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
811}
812
814 return 32;
815}
816
818 return true;
819}
820
821// The backend supports 32 and 64 bit floating point immediates.
822// FIXME: Why are we reporting vectors of FP immediates as legal?
824 bool ForCodeSize) const {
825 return isTypeLegal(VT.getScalarType());
826}
827
828// We don't want to shrink f64 / f32 constants.
830 EVT ScalarVT = VT.getScalarType();
831 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
832}
833
835 SDNode *N, ISD::LoadExtType ExtTy, EVT NewVT,
836 std::optional<unsigned> ByteOffset) const {
837 // TODO: This may be worth removing. Check regression tests for diffs.
838 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT, ByteOffset))
839 return false;
840
841 unsigned NewSize = NewVT.getStoreSizeInBits();
842
843 // If we are reducing to a 32-bit load or a smaller multi-dword load,
844 // this is always better.
845 if (NewSize >= 32)
846 return true;
847
848 EVT OldVT = N->getValueType(0);
849 unsigned OldSize = OldVT.getStoreSizeInBits();
850
852 unsigned AS = MN->getAddressSpace();
853 // Do not shrink an aligned scalar load to sub-dword.
854 // Scalar engine cannot do sub-dword loads.
855 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
856 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
860 MN->isInvariant())) &&
862 return false;
863
864 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
865 // extloads, so doing one requires using a buffer_load. In cases where we
866 // still couldn't use a scalar load, using the wider load shouldn't really
867 // hurt anything.
868
869 // If the old size already had to be an extload, there's no harm in continuing
870 // to reduce the width.
871 return (OldSize < 32);
872}
873
875 const SelectionDAG &DAG,
876 const MachineMemOperand &MMO) const {
877
878 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
879
880 if (LoadTy.getScalarType() == MVT::i32)
881 return false;
882
883 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
884 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
885
886 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
887 return false;
888
889 unsigned Fast = 0;
891 CastTy, MMO, &Fast) &&
892 Fast;
893}
894
895// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
896// profitable with the expansion for 64-bit since it's generally good to
897// speculate things.
899 return true;
900}
901
903 return true;
904}
905
907 switch (N->getOpcode()) {
908 case ISD::EntryToken:
909 case ISD::TokenFactor:
910 return true;
912 unsigned IntrID = N->getConstantOperandVal(0);
914 }
916 unsigned IntrID = N->getConstantOperandVal(1);
918 }
919 case ISD::LOAD:
920 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
922 return true;
923 return false;
924 case AMDGPUISD::SETCC: // ballot-style instruction
925 return true;
926 }
927 return false;
928}
929
931 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
932 NegatibleCost &Cost, unsigned Depth) const {
933
934 switch (Op.getOpcode()) {
935 case ISD::FMA:
936 case ISD::FMAD: {
937 // Negating a fma is not free if it has users without source mods.
938 if (!allUsesHaveSourceMods(Op.getNode()))
939 return SDValue();
940 break;
941 }
942 case AMDGPUISD::RCP: {
943 SDValue Src = Op.getOperand(0);
944 EVT VT = Op.getValueType();
945 SDLoc SL(Op);
946
947 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
948 ForCodeSize, Cost, Depth + 1);
949 if (NegSrc)
950 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
951 return SDValue();
952 }
953 default:
954 break;
955 }
956
957 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
958 ForCodeSize, Cost, Depth);
959}
960
961//===---------------------------------------------------------------------===//
962// Target Properties
963//===---------------------------------------------------------------------===//
964
967
968 // Packed operations do not have a fabs modifier.
969 // Report this based on the end legalized type.
970 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
971}
972
975 // Report this based on the end legalized type.
976 VT = VT.getScalarType();
977 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
978}
979
981 unsigned NumElem,
982 unsigned AS) const {
983 return true;
984}
985
987 // There are few operations which truly have vector input operands. Any vector
988 // operation is going to involve operations on each component, and a
989 // build_vector will be a copy per element, so it always makes sense to use a
990 // build_vector input in place of the extracted element to avoid a copy into a
991 // super register.
992 //
993 // We should probably only do this if all users are extracts only, but this
994 // should be the common case.
995 return true;
996}
997
999 // Truncate is just accessing a subregister.
1000
1001 unsigned SrcSize = Source.getSizeInBits();
1002 unsigned DestSize = Dest.getSizeInBits();
1003
1004 return DestSize < SrcSize && DestSize % 32 == 0 ;
1005}
1006
1008 // Truncate is just accessing a subregister.
1009
1010 unsigned SrcSize = Source->getScalarSizeInBits();
1011 unsigned DestSize = Dest->getScalarSizeInBits();
1012
1013 if (DestSize== 16 && Subtarget->has16BitInsts())
1014 return SrcSize >= 32;
1015
1016 return DestSize < SrcSize && DestSize % 32 == 0;
1017}
1018
1020 unsigned SrcSize = Src->getScalarSizeInBits();
1021 unsigned DestSize = Dest->getScalarSizeInBits();
1022
1023 if (SrcSize == 16 && Subtarget->has16BitInsts())
1024 return DestSize >= 32;
1025
1026 return SrcSize == 32 && DestSize == 64;
1027}
1028
1030 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
1031 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
1032  // this will enable reducing 64-bit operations to 32-bit, which is always
1033 // good.
1034
1035 if (Src == MVT::i16)
1036 return Dest == MVT::i32 ||Dest == MVT::i64 ;
1037
1038 return Src == MVT::i32 && Dest == MVT::i64;
1039}
1040
1042 EVT DestVT) const {
1043 switch (N->getOpcode()) {
1044 case ISD::ADD:
1045 case ISD::SUB:
1046 case ISD::SHL:
1047 case ISD::SRL:
1048 case ISD::SRA:
1049 case ISD::AND:
1050 case ISD::OR:
1051 case ISD::XOR:
1052 case ISD::MUL:
1053 case ISD::SETCC:
1054 case ISD::SELECT:
1055 case ISD::SMIN:
1056 case ISD::SMAX:
1057 case ISD::UMIN:
1058 case ISD::UMAX:
1059 if (isTypeLegal(MVT::i16) &&
1060 (!DestVT.isVector() ||
1061 !isOperationLegal(ISD::ADD, MVT::v2i16))) { // Check if VOP3P
1062 // Don't narrow back down to i16 if promoted to i32 already.
1063 if (!N->isDivergent() && DestVT.isInteger() &&
1064 DestVT.getScalarSizeInBits() > 1 &&
1065 DestVT.getScalarSizeInBits() <= 16 &&
1066 SrcVT.getScalarSizeInBits() > 16) {
1067 return false;
1068 }
1069 }
1070 return true;
1071 default:
1072 break;
1073 }
1074
1075 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
1076 // limited number of native 64-bit operations. Shrinking an operation to fit
1077 // in a single 32-bit register should always be helpful. As currently used,
1078 // this is much less general than the name suggests, and is only used in
1079 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
1080 // not profitable, and may actually be harmful.
1081 if (isa<LoadSDNode>(N))
1082 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
1083
1084 return true;
1085}
1086
1088 const SDNode* N, CombineLevel Level) const {
1089 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
1090 N->getOpcode() == ISD::SRL) &&
1091 "Expected shift op");
1092
1093 SDValue ShiftLHS = N->getOperand(0);
1094 if (!ShiftLHS->hasOneUse())
1095 return false;
1096
1097 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
1098 !ShiftLHS.getOperand(0)->hasOneUse())
1099 return false;
1100
1101 // Always commute pre-type legalization and right shifts.
1102 // We're looking for shl(or(x,y),z) patterns.
1104 N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
1105 return true;
1106
1107 // If only user is a i32 right-shift, then don't destroy a BFE pattern.
1108 if (N->getValueType(0) == MVT::i32 && N->hasOneUse() &&
1109 (N->user_begin()->getOpcode() == ISD::SRA ||
1110 N->user_begin()->getOpcode() == ISD::SRL))
1111 return false;
1112
1113 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
1114 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
1115 if (LHS.getOpcode() != ISD::SHL)
1116 return false;
1117 auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
1118 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
1119 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
1120 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
1121 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
1122 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
1123 };
1124 SDValue LHS = N->getOperand(0).getOperand(0);
1125 SDValue RHS = N->getOperand(0).getOperand(1);
1126 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
1127}
1128
1129//===---------------------------------------------------------------------===//
1130// TargetLowering Callbacks
1131//===---------------------------------------------------------------------===//
1132
1134 bool IsVarArg) {
1135 switch (CC) {
1143 return CC_AMDGPU;
1146 return CC_AMDGPU_CS_CHAIN;
1147 case CallingConv::C:
1148 case CallingConv::Fast:
1149 case CallingConv::Cold:
1150 return CC_AMDGPU_Func;
1153 return CC_SI_Gfx;
1156 default:
1157 reportFatalUsageError("unsupported calling convention for call");
1158 }
1159}
1160
1162 bool IsVarArg) {
1163 switch (CC) {
1166 llvm_unreachable("kernels should not be handled here");
1176 return RetCC_SI_Shader;
1179 return RetCC_SI_Gfx;
1180 case CallingConv::C:
1181 case CallingConv::Fast:
1182 case CallingConv::Cold:
1183 return RetCC_AMDGPU_Func;
1184 default:
1185 reportFatalUsageError("unsupported calling convention");
1186 }
1187}
1188
1189/// The SelectionDAGBuilder will automatically promote function arguments
1190/// with illegal types. However, this does not work for the AMDGPU targets
1191/// since the function arguments are stored in memory as these illegal types.
1192/// In order to handle this properly we need to get the original types sizes
1193/// from the LLVM IR Function and fixup the ISD:InputArg values before
1194/// passing them to AnalyzeFormalArguments()
1195
1196/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1197/// input values across multiple registers. Each item in the Ins array
1198/// represents a single value that will be stored in registers. Ins[x].VT is
1199/// the value type of the value that will be stored in the register, so
1200/// whatever SDNode we lower the argument to needs to be this type.
1201///
1202/// In order to correctly lower the arguments we need to know the size of each
1203/// argument. Since Ins[x].VT gives us the size of the register that will
1204/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1205/// for the original function argument so that we can deduce the correct memory
1206/// type to use for Ins[x]. In most cases the correct memory type will be
1207/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1208/// we have a kernel argument of type v8i8, this argument will be split into
1209/// 8 parts and each part will be represented by its own item in the Ins array.
1210/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1211/// the argument before it was split. From this, we deduce that the memory type
1212/// for each individual part is i8. We pass the memory type as LocVT to the
1213/// calling convention analysis function and the register type (Ins[x].VT) as
1214/// the ValVT.
// AMDGPUTargetLowering::analyzeFormalArgumentsCompute — recomputes in-memory
// kernel-argument locations from the IR function signature (see the long
// comment above) and records one CCValAssign per register part.
// NOTE(review): HTML extraction dropped the signature start (original line
// 1215), the declaration on line 1222 (presumably
// `CallingConv::ID CC = Fn.getCallingConv();` — CC is used below), the
// Offsets declaration on line 1250, and the final addLoc argument on line
// 1322 (presumably `CCValAssign::Full));`). Code left byte-identical.
1216 CCState &State,
1217 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1218 const MachineFunction &MF = State.getMachineFunction();
1219 const Function &Fn = MF.getFunction();
1220 LLVMContext &Ctx = Fn.getContext();
1221 const unsigned ExplicitOffset = Subtarget->getExplicitKernelArgOffset();
1223
1224 Align MaxAlign = Align(1);
1225 uint64_t ExplicitArgOffset = 0;
1226 const DataLayout &DL = Fn.getDataLayout();
1227
1228 unsigned InIndex = 0;
1229
1230 for (const Argument &Arg : Fn.args()) {
// byref arguments describe their pointee; use the pointee type/alignment for
// the in-memory layout.
1231 const bool IsByRef = Arg.hasByRefAttr();
1232 Type *BaseArgTy = Arg.getType();
1233 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1234 Align Alignment = DL.getValueOrABITypeAlignment(
1235 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1236 MaxAlign = std::max(Alignment, MaxAlign);
1237 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1238
1239 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1240 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1241
1242 // We're basically throwing away everything passed into us and starting over
1243 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1244 // to us as computed in Ins.
1245 //
1246 // We also need to figure out what type legalization is trying to do to get
1247 // the correct memory offsets.
1248
1249 SmallVector<EVT, 16> ValueVTs;
1251 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, /*MemVTs=*/nullptr,
1252 &Offsets, ArgOffset);
1253
1254 for (unsigned Value = 0, NumValues = ValueVTs.size();
1255 Value != NumValues; ++Value) {
1256 uint64_t BasePartOffset = Offsets[Value];
1257
1258 EVT ArgVT = ValueVTs[Value];
1259 EVT MemVT = ArgVT;
1260 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1261 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1262
// Deduce the memory type for this part from how legalization splits ArgVT
// across NumRegs registers (see the function-level comment).
1263 if (NumRegs == 1) {
1264 // This argument is not split, so the IR type is the memory type.
1265 if (ArgVT.isExtended()) {
1266 // We have an extended type, like i24, so we should just use the
1267 // register type.
1268 MemVT = RegisterVT;
1269 } else {
1270 MemVT = ArgVT;
1271 }
1272 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1273 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1274 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1275 // We have a vector value which has been split into a vector with
1276 // the same scalar type, but fewer elements. This should handle
1277 // all the floating-point vector types.
1278 MemVT = RegisterVT;
1279 } else if (ArgVT.isVector() &&
1280 ArgVT.getVectorNumElements() == NumRegs) {
1281 // This arg has been split so that each element is stored in a separate
1282 // register.
1283 MemVT = ArgVT.getScalarType();
1284 } else if (ArgVT.isExtended()) {
1285 // We have an extended type, like i65.
1286 MemVT = RegisterVT;
1287 } else {
1288 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1289 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1290 if (RegisterVT.isInteger()) {
1291 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1292 } else if (RegisterVT.isVector()) {
1293 assert(!RegisterVT.getScalarType().isFloatingPoint());
1294 unsigned NumElements = RegisterVT.getVectorNumElements();
1295 assert(MemoryBits % NumElements == 0);
1296 // This vector type has been split into another vector type with
1297 // a different elements size.
1298 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1299 MemoryBits / NumElements);
1300 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1301 } else {
1302 llvm_unreachable("cannot deduce memory type.");
1303 }
1304 }
1305
1306 // Convert one element vectors to scalar.
1307 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1308 MemVT = MemVT.getScalarType();
1309
1310 // Round up vec3/vec5 argument.
1311 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1312 MemVT = MemVT.getPow2VectorType(State.getContext());
1313 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1314 MemVT = MemVT.getRoundIntegerType(State.getContext());
1315 }
1316
// Emit one location per register part, each offset by the part's store size.
1317 unsigned PartOffset = 0;
1318 for (unsigned i = 0; i != NumRegs; ++i) {
1319 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1320 BasePartOffset + PartOffset,
1321 MemVT.getSimpleVT(),
1323 PartOffset += MemVT.getStoreSize();
1324 }
1325 }
1326 }
1327}
1328
// AMDGPUTargetLowering::LowerReturn — lowers a return by emitting the ENDPGM
// terminator; return values are ignored here.
// NOTE(review): HTML extraction dropped the signature start (original line
// 1329) and the `Outs` parameter line (1332). Code left byte-identical.
1330 SDValue Chain, CallingConv::ID CallConv,
1331 bool isVarArg,
1333 const SmallVectorImpl<SDValue> &OutVals,
1334 const SDLoc &DL, SelectionDAG &DAG) const {
1335 // FIXME: Fails for r600 tests
1336 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1337 // "wave terminate should not have return values");
1338 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1339}
1340
1341//===---------------------------------------------------------------------===//
1342// Target specific lowering
1343//===---------------------------------------------------------------------===//
1344
1345/// Selects the correct CCAssignFn for a given CallingConvention value.
1350
1355
// AMDGPUTargetLowering::addTokenForArgument — builds a TokenFactor chaining
// the incoming chain with every load from a fixed stack object that overlaps
// the byte range of ClobberedFI, so those loads complete before the slot is
// overwritten for an outgoing call.
// NOTE(review): HTML extraction dropped the signature start (original line
// 1356). Code left byte-identical.
1357 SelectionDAG &DAG,
1358 MachineFrameInfo &MFI,
1359 int ClobberedFI) const {
1360 SmallVector<SDValue, 8> ArgChains;
1361 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1362 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1363
1364 // Include the original chain at the beginning of the list. When this is
1365 // used by target LowerCall hooks, this helps legalize find the
1366 // CALLSEQ_BEGIN node.
1367 ArgChains.push_back(Chain);
1368
1369 // Add a chain value for each stack argument corresponding
// Only fixed (incoming-argument) objects have negative frame indices.
1370 for (SDNode *U : DAG.getEntryNode().getNode()->users()) {
1371 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1372 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1373 if (FI->getIndex() < 0) {
1374 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1375 int64_t InLastByte = InFirstByte;
1376 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1377
// Interval-overlap test between [FirstByte, LastByte] and the load's range.
1378 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1379 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1380 ArgChains.push_back(SDValue(L, 1));
1381 }
1382 }
1383 }
1384 }
1385
1386 // Build a tokenfactor for all the chains.
1387 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1388}
1389
// AMDGPUTargetLowering::lowerUnhandledCall — diagnoses an unsupported call,
// fills InVals with poison so lowering can continue, and returns a trivial
// CALLSEQ pair (or just the chain on r600).
// NOTE(review): HTML extraction dropped the signature start (original lines
// 1390-1391) and line 1400, presumably
// `if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))`
// given the `G->getSymbol()` use below. Code left byte-identical.
1392 StringRef Reason) const {
1393 SDValue Callee = CLI.Callee;
1394 SelectionDAG &DAG = CLI.DAG;
1395
1396 const Function &Fn = DAG.getMachineFunction().getFunction();
1397
1398 StringRef FuncName("<unknown>");
1399
1401 FuncName = G->getSymbol();
1402 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1403 FuncName = G->getGlobal()->getName();
1404
1405 DAG.getContext()->diagnose(
1406 DiagnosticInfoUnsupported(Fn, Reason + FuncName, CLI.DL.getDebugLoc()));
1407
// Produce placeholder results so callers of LowerCall still get values.
1408 if (!CLI.IsTailCall) {
1409 for (ISD::InputArg &Arg : CLI.Ins)
1410 InVals.push_back(DAG.getPOISON(Arg.VT));
1411 }
1412
1413 // FIXME: Hack because R600 doesn't handle callseq pseudos yet.
1414 if (getTargetMachine().getTargetTriple().getArch() == Triple::r600)
1415 return CLI.Chain;
1416
1417 SDValue Chain = DAG.getCALLSEQ_START(CLI.Chain, 0, 0, CLI.DL);
1418 return DAG.getCALLSEQ_END(Chain, 0, 0, /*InGlue=*/SDValue(), CLI.DL);
1419}
1420
// AMDGPUTargetLowering::LowerCall — base implementation: all calls are
// unsupported at this level and are reported via lowerUnhandledCall.
// NOTE(review): the signature start (original line 1421) was lost in the
// HTML extraction. Code left byte-identical.
1422 SmallVectorImpl<SDValue> &InVals) const {
1423 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1424}
1425
// AMDGPUTargetLowering::lowerDYNAMIC_STACKALLOC — dynamic allocas are
// unsupported here: emit a diagnostic and return {0, chain} placeholders.
// NOTE(review): HTML extraction dropped the signature start (original line
// 1426) and line 1430, presumably the opening of
// `DAG.getContext()->diagnose(DiagnosticInfoUnsupported(` given the closing
// on the next line. Code left byte-identical.
1427 SelectionDAG &DAG) const {
1428 const Function &Fn = DAG.getMachineFunction().getFunction();
1429
1431 Fn, "unsupported dynamic alloca", SDLoc(Op).getDebugLoc()));
1432 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1433 return DAG.getMergeValues(Ops, SDLoc());
1434}
1435
// AMDGPUTargetLowering::LowerOperation — dispatches custom lowering for the
// opcodes this common AMDGPU lowering handles.
// NOTE(review): HTML extraction dropped the signature start (original line
// 1436) and several hyperlinked `case ISD::…` labels (original lines 1444,
// 1446, 1474-1475, 1478, 1480, 1484). Code left byte-identical.
1437 SelectionDAG &DAG) const {
1438 switch (Op.getOpcode()) {
1439 default:
// Dump the offending node before aborting to aid debugging.
1440 Op->print(errs(), &DAG);
1441 llvm_unreachable("Custom lowering code for this "
1442 "instruction is not implemented yet!");
1443 break;
1445 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1447 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1448 case ISD::SDIVREM:
1449 return LowerSDIVREM(Op, DAG);
1450 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1451 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1452 case ISD::FRINT: return LowerFRINT(Op, DAG);
1453 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1454 case ISD::FROUNDEVEN:
1455 return LowerFROUNDEVEN(Op, DAG);
1456 case ISD::FROUND: return LowerFROUND(Op, DAG);
1457 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1458 case ISD::FLOG2:
1459 return LowerFLOG2(Op, DAG);
1460 case ISD::FLOG:
1461 case ISD::FLOG10:
1462 return LowerFLOGCommon(Op, DAG);
1463 case ISD::FEXP:
1464 case ISD::FEXP10:
1465 return lowerFEXP(Op, DAG);
1466 case ISD::FEXP2:
1467 return lowerFEXP2(Op, DAG);
1468 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1469 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1470 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1471 case ISD::FP_TO_SINT:
1472 case ISD::FP_TO_UINT:
1473 return LowerFP_TO_INT(Op, DAG);
1476 return LowerFP_TO_INT_SAT(Op, DAG);
1477 case ISD::CTTZ:
1479 case ISD::CTLZ:
1481 return LowerCTLZ_CTTZ(Op, DAG);
1482 case ISD::CTLS:
1483 return LowerCTLS(Op, DAG);
1485 }
1486 return Op;
1487}
1488
// AMDGPUTargetLowering::ReplaceNodeResults — type-legalization hook: lowers
// nodes with illegal result types, pushing replacement values into Results
// (an empty Results means "let default legalization handle it").
// NOTE(review): HTML extraction dropped the signature start (original lines
// 1489-1490) and the hyperlinked case labels on original lines 1493
// (presumably ISD::SIGN_EXTEND_INREG, given the comment below) and 1520.
// Code left byte-identical.
1491 SelectionDAG &DAG) const {
1492 switch (N->getOpcode()) {
1494 // Different parts of legalization seem to interpret which type of
1495 // sign_extend_inreg is the one to check for custom lowering. The extended
1496 // from type is what really matters, but some places check for custom
1497 // lowering of the result type. This results in trying to use
1498 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1499 // nothing here and let the illegal result integer be handled normally.
1500 return;
1501 case ISD::FLOG2:
1502 if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
1503 Results.push_back(Lowered);
1504 return;
1505 case ISD::FLOG:
1506 case ISD::FLOG10:
1507 if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
1508 Results.push_back(Lowered);
1509 return;
1510 case ISD::FEXP2:
1511 if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
1512 Results.push_back(Lowered);
1513 return;
1514 case ISD::FEXP:
1515 case ISD::FEXP10:
1516 if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
1517 Results.push_back(Lowered);
1518 return;
1519 case ISD::CTLZ:
1521 if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
1522 Results.push_back(Lowered);
1523 return;
1524 default:
1525 return;
1526 }
1527}
1528
// AMDGPUTargetLowering::LowerGlobalAddress — lowers LDS/region global
// addresses to their assigned constant offsets; returns SDValue() for
// address spaces this common code does not handle.
// NOTE(review): HTML extraction dropped the signature start (original line
// 1529), line 1534 (presumably the GlobalAddressSDNode cast producing `G`),
// line 1540 (the call producing `Address`), line 1555 (the rest of the
// non-kernel-LDS condition) and line 1558 (the diagnose call opening).
// Code left byte-identical.
1530 SDValue Op,
1531 SelectionDAG &DAG) const {
1532
1533 const DataLayout &DL = DAG.getDataLayout();
1535 const GlobalValue *GV = G->getGlobal();
1536
// Non-entry functions can only reference LDS globals whose absolute address
// was already assigned.
1537 if (!MFI->isModuleEntryFunction()) {
1538 auto IsNamedBarrier = AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV));
1539 if (std::optional<uint32_t> Address =
1541 if (IsNamedBarrier) {
// Each named barrier occupies 16 bytes; record how many this global covers.
1542 unsigned BarCnt = cast<GlobalVariable>(GV)->getGlobalSize(DL) / 16;
1543 MFI->recordNumNamedBarriers(Address.value(), BarCnt);
1544 }
1545 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1546 } else if (IsNamedBarrier) {
1547 llvm_unreachable("named barrier should have an assigned address");
1548 }
1549 }
1550
1551 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1552 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1553 if (!MFI->isModuleEntryFunction() &&
1554 GV->getName() != "llvm.amdgcn.module.lds" &&
1556 SDLoc DL(Op);
1557 const Function &Fn = DAG.getMachineFunction().getFunction();
1559 Fn, "local memory global used by non-kernel function",
1560 DL.getDebugLoc(), DS_Warning));
1561
1562 // We currently don't have a way to correctly allocate LDS objects that
1563 // aren't directly associated with a kernel. We do force inlining of
1564 // functions that use local objects. However, if these dead functions are
1565 // not eliminated, we don't want a compile time error. Just emit a warning
1566 // and a trap, since there should be no callable path here.
1567 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1568 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1569 Trap, DAG.getRoot());
1570 DAG.setRoot(OutputChain);
1571 return DAG.getPOISON(Op.getValueType());
1572 }
1573
1574 // XXX: What does the value of G->getOffset() mean?
1575 assert(G->getOffset() == 0 &&
1576 "Do not know what to do with an non-zero offset");
1577
1578 // TODO: We could emit code to handle the initialization somewhere.
1579 // We ignore the initializer for now and legalize it to allow selection.
1580 // The initializer will anyway get errored out during assembly emission.
1581 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1582 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1583 }
1584 return SDValue();
1585}
1586
// AMDGPUTargetLowering::LowerCONCAT_VECTORS — lowers a vector concat either
// by bitcasting sub-32-bit-element operands to i32 pieces and rebuilding, or
// by extracting all elements into a BUILD_VECTOR.
// NOTE(review): HTML extraction dropped the signature start (original line
// 1587), line 1589 (presumably the `Args` SmallVector declaration used
// below), and line 1598 (the ':' arm constructing the multi-element i32
// vector type). Code left byte-identical.
1588 SelectionDAG &DAG) const {
1590 SDLoc SL(Op);
1591
1592 EVT VT = Op.getValueType();
// Prefer moving whole 32-bit registers when the element type is narrower
// than 32 bits and each operand is a multiple of 32 bits.
1593 if (VT.getVectorElementType().getSizeInBits() < 32) {
1594 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1595 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1596 unsigned NewNumElt = OpBitSize / 32;
1597 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1599 MVT::i32, NewNumElt);
1600 for (const SDUse &U : Op->ops()) {
1601 SDValue In = U.get();
1602 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
1603 if (NewNumElt > 1)
1604 DAG.ExtractVectorElements(NewIn, Args);
1605 else
1606 Args.push_back(NewIn);
1607 }
1608
1609 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1610 NewNumElt * Op.getNumOperands());
1611 SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1612 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1613 }
1614 }
1615
// Fallback: scalarize every operand and rebuild the result vector.
1616 for (const SDUse &U : Op->ops())
1617 DAG.ExtractVectorElements(U.get(), Args);
1618
1619 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1620}
1621
// AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR — lowers a subvector extract;
// for 16-bit elements at an even start index it extracts whole 32-bit
// registers via an i32 bitcast, otherwise it scalarizes.
// NOTE(review): HTML extraction dropped the signature start (original line
// 1622), line 1625 (presumably the `Args` SmallVector declaration), and line
// 1652 (presumably `VT.getVectorNumElements());` closing the extract call).
// Code left byte-identical.
1623 SelectionDAG &DAG) const {
1624 SDLoc SL(Op);
1626 unsigned Start = Op.getConstantOperandVal(1);
1627 EVT VT = Op.getValueType();
1628 EVT SrcVT = Op.getOperand(0).getValueType();
1629
1630 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1631 unsigned NumElt = VT.getVectorNumElements();
1632 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1633 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1634
1635 // Extract 32-bit registers at a time.
1636 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1637 EVT NewVT = NumElt == 2
1638 ? MVT::i32
1639 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1640 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1641
1642 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
1643 if (NumElt == 2)
1644 Tmp = Args[0];
1645 else
1646 Tmp = DAG.getBuildVector(NewVT, SL, Args);
1647
1648 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1649 }
1650
1651 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1653
1654 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1655}
1656
1657// TODO: Handle fabs too
1659 if (Val.getOpcode() == ISD::FNEG)
1660 return Val.getOperand(0);
1661
1662 return Val;
1663}
1664
1666 if (Val.getOpcode() == ISD::FNEG)
1667 Val = Val.getOperand(0);
1668 if (Val.getOpcode() == ISD::FABS)
1669 Val = Val.getOperand(0);
1670 if (Val.getOpcode() == ISD::FCOPYSIGN)
1671 Val = Val.getOperand(0);
1672 return Val;
1673}
1674
// AMDGPUTargetLowering::combineFMinMaxLegacyImpl — folds
// select(setcc(lhs, rhs), true, false) into FMIN_LEGACY/FMAX_LEGACY,
// permuting operands to match the hardware's NaN behavior.
// NOTE(review): HTML extraction dropped the function-name line (original
// 1675) and the first halves of the conditions on original lines 1708 and
// 1729 (presumably DCI.getDAGCombineLevel()-style checks, given the
// "after legalization" comments). Code left byte-identical.
1676 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1677 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1678 SelectionDAG &DAG = DCI.DAG;
1679 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1680 switch (CCOpcode) {
// Equality-style and constant predicates don't map to min/max: bail out.
1681 case ISD::SETOEQ:
1682 case ISD::SETONE:
1683 case ISD::SETUNE:
1684 case ISD::SETNE:
1685 case ISD::SETUEQ:
1686 case ISD::SETEQ:
1687 case ISD::SETFALSE:
1688 case ISD::SETFALSE2:
1689 case ISD::SETTRUE:
1690 case ISD::SETTRUE2:
1691 case ISD::SETUO:
1692 case ISD::SETO:
1693 break;
1694 case ISD::SETULE:
1695 case ISD::SETULT: {
1696 if (LHS == True)
1697 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1698 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1699 }
1700 case ISD::SETOLE:
1701 case ISD::SETOLT:
1702 case ISD::SETLE:
1703 case ISD::SETLT: {
1704 // Ordered. Assume ordered for undefined.
1705
1706 // Only do this after legalization to avoid interfering with other combines
1707 // which might occur.
1709 !DCI.isCalledByLegalizer())
1710 return SDValue();
1711
1712 // We need to permute the operands to get the correct NaN behavior. The
1713 // selected operand is the second one based on the failing compare with NaN,
1714 // so permute it based on the compare type the hardware uses.
1715 if (LHS == True)
1716 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1717 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1718 }
1719 case ISD::SETUGE:
1720 case ISD::SETUGT: {
1721 if (LHS == True)
1722 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1723 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1724 }
1725 case ISD::SETGT:
1726 case ISD::SETGE:
1727 case ISD::SETOGE:
1728 case ISD::SETOGT: {
1730 !DCI.isCalledByLegalizer())
1731 return SDValue();
1732
1733 if (LHS == True)
1734 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1735 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1736 }
1737 case ISD::SETCC_INVALID:
1738 llvm_unreachable("Invalid setcc condcode!");
1739 }
1740 return SDValue();
1741}
1742
1743/// Generate Min/Max node
1745 SDValue LHS, SDValue RHS,
1746 SDValue True, SDValue False,
1747 SDValue CC,
1748 DAGCombinerInfo &DCI) const {
1749 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1750 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1751
1752 SelectionDAG &DAG = DCI.DAG;
1753
1754 // If we can't directly match this, try to see if we can fold an fneg to
1755 // match.
1756
1759 SDValue NegTrue = peekFNeg(True);
1760
1761 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1762 // fmin/fmax.
1763 //
1764 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1765 // -> fneg (fmin_legacy lhs, K)
1766 //
1767 // TODO: Use getNegatedExpression
1768 if (LHS == NegTrue && CFalse && CRHS) {
1769 APFloat NegRHS = neg(CRHS->getValueAPF());
1770 if (NegRHS == CFalse->getValueAPF()) {
1771 SDValue Combined =
1772 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1773 if (Combined)
1774 return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1775 return SDValue();
1776 }
1777 }
1778
1779 return SDValue();
1780}
1781
1782std::pair<SDValue, SDValue>
1784 SDLoc SL(Op);
1785
1786 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1787
1788 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1789 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1790
1791 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1792 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1793
1794 return std::pair(Lo, Hi);
1795}
1796
1798 SDLoc SL(Op);
1799
1800 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1801 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1802 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1803}
1804
1806 SDLoc SL(Op);
1807
1808 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1809 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1810 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1811}
1812
1813// Split a vector type into two parts. The first part is a power of two vector.
1814// The second part is whatever is left over, and is a scalar if it would
1815// otherwise be a 1-vector.
1816std::pair<EVT, EVT>
1818 EVT LoVT, HiVT;
1819 EVT EltVT = VT.getVectorElementType();
1820 unsigned NumElts = VT.getVectorNumElements();
1821 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1822 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1823 HiVT = NumElts - LoNumElts == 1
1824 ? EltVT
1825 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1826 return std::pair(LoVT, HiVT);
1827}
1828
1829// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1830// scalar.
// AMDGPUTargetLowering::splitVector — splits vector value N into a LoVT part
// and a HiVT part (HiVT may be scalar); avoids EXTRACT_SUBVECTOR with a
// misaligned index by scalarizing the high part when needed.
// NOTE(review): HTML extraction dropped the function-name line (original
// 1832) and several hyperlinked lines: 1836 (assert opening), 1840/1850/1862
// (the EXTRACT_SUBVECTOR/ELT calls producing Lo/Hi), and 1855 (the Elts
// SmallVector declaration). Code left byte-identical.
std::pair<SDValue, SDValue>
1833 const EVT &LoVT, const EVT &HiVT,
1834 SelectionDAG &DAG) const {
1835 EVT VT = N.getValueType();
1837 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1838 VT.getVectorNumElements() &&
1839 "More vector elements requested than available!");
1841 DAG.getVectorIdxConstant(0, DL));
1842
1843 unsigned LoNumElts = LoVT.getVectorNumElements();
1844
1845 if (HiVT.isVector()) {
1846 unsigned HiNumElts = HiVT.getVectorNumElements();
1847 if ((VT.getVectorNumElements() % HiNumElts) == 0) {
1848 // Avoid creating an extract_subvector with an index that isn't a multiple
1849 // of the result type.
1851 DAG.getConstant(LoNumElts, DL, MVT::i32));
1852 return {Lo, Hi};
1853 }
1854
// Otherwise rebuild the high part element-by-element.
1856 DAG.ExtractVectorElements(N, Elts, /*Start=*/LoNumElts,
1857 /*Count=*/HiNumElts);
1858 SDValue Hi = DAG.getBuildVector(HiVT, DL, Elts);
1859 return {Lo, Hi};
1860 }
1861
1863 DAG.getVectorIdxConstant(LoNumElts, DL));
1864 return {Lo, Hi};
1865}
1866
// AMDGPUTargetLowering::SplitVectorLoad — splits a wide vector load into two
// half loads (scalarizing 2-element vectors instead) and rejoins the results
// with CONCAT_VECTORS or INSERT_SUBVECTOR.
// NOTE(review): HTML extraction dropped the function-name line (original
// 1867), line 1869 (presumably the LoadSDNode cast defining `Load`), and
// lines 1916/1918 inside the INSERT_SUBVECTOR join (opcode and index
// operands). Code left byte-identical.
1868 SelectionDAG &DAG) const {
1870 EVT VT = Op.getValueType();
1871 SDLoc SL(Op);
1872
1873
1874 // If this is a 2 element vector, we really want to scalarize and not create
1875 // weird 1 element vectors.
1876 if (VT.getVectorNumElements() == 2) {
1877 SDValue Ops[2];
1878 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1879 return DAG.getMergeValues(Ops, SL);
1880 }
1881
1882 SDValue BasePtr = Load->getBasePtr();
1883 EVT MemVT = Load->getMemoryVT();
1884
1885 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1886
1887 EVT LoVT, HiVT;
1888 EVT LoMemVT, HiMemVT;
1889 SDValue Lo, Hi;
1890
1891 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1892 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1893 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1894
// The high half's alignment is whatever the base alignment guarantees at
// the low half's byte offset.
1895 unsigned Size = LoMemVT.getStoreSize();
1896 Align BaseAlign = Load->getAlign();
1897 Align HiAlign = commonAlignment(BaseAlign, Size);
1898
1899 SDValue LoLoad = DAG.getExtLoad(
1900 Load->getExtensionType(), SL, LoVT, Load->getChain(), BasePtr, SrcValue,
1901 LoMemVT, BaseAlign, Load->getMemOperand()->getFlags(), Load->getAAInfo());
1902 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
1903 SDValue HiLoad = DAG.getExtLoad(
1904 Load->getExtensionType(), SL, HiVT, Load->getChain(), HiPtr,
1905 SrcValue.getWithOffset(LoMemVT.getStoreSize()), HiMemVT, HiAlign,
1906 Load->getMemOperand()->getFlags(), Load->getAAInfo());
1907
1908 SDValue Join;
1909 if (LoVT == HiVT) {
1910 // This is the case that the vector is power of two so was evenly split.
1911 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1912 } else {
1913 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getPOISON(VT), LoLoad,
1914 DAG.getVectorIdxConstant(0, SL));
1915 Join = DAG.getNode(
1917 VT, Join, HiLoad,
1919 }
1920
1921 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1922 LoLoad.getValue(1), HiLoad.getValue(1))};
1923
1924 return DAG.getMergeValues(Ops, SL);
1925}
1926
// AMDGPUTargetLowering::WidenOrSplitVectorLoad — widens a vec3 load to vec4
// when alignment/dereferenceability permits, otherwise falls back to
// SplitVectorLoad; the extra lane is discarded via EXTRACT_SUBVECTOR.
// NOTE(review): HTML extraction dropped the function-name line (original
// 1927), line 1929 (presumably the LoadSDNode cast defining `Load`), and
// lines 1948/1950 (the widened EVT constructions for WideVT/WideMemVT).
// Code left byte-identical.
1928 SelectionDAG &DAG) const {
1930 EVT VT = Op.getValueType();
1931 SDValue BasePtr = Load->getBasePtr();
1932 EVT MemVT = Load->getMemoryVT();
1933 SDLoc SL(Op);
1934 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1935 Align BaseAlign = Load->getAlign();
1936 unsigned NumElements = MemVT.getVectorNumElements();
1937
1938 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1939 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1940 if (NumElements != 3 ||
1941 (BaseAlign < Align(8) &&
1942 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1943 return SplitVectorLoad(Op, DAG);
1944
1945 assert(NumElements == 3);
1946
1947 EVT WideVT =
1949 EVT WideMemVT =
1951 SDValue WideLoad = DAG.getExtLoad(
1952 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1953 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1954 return DAG.getMergeValues(
1955 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1956 DAG.getVectorIdxConstant(0, SL)),
1957 WideLoad.getValue(1)},
1958 SL);
1959}
1960
// AMDGPUTargetLowering::SplitVectorStore — splits a wide vector store into
// two truncating half stores (scalarizing 2-element vectors instead) and
// chains them with a TokenFactor.
// NOTE(review): HTML extraction dropped the function-name line (original
// 1961) and line 1963 (presumably the StoreSDNode cast defining `Store`).
// Code left byte-identical.
1962 SelectionDAG &DAG) const {
1964 SDValue Val = Store->getValue();
1965 EVT VT = Val.getValueType();
1966
1967 // If this is a 2 element vector, we really want to scalarize and not create
1968 // weird 1 element vectors.
1969 if (VT.getVectorNumElements() == 2)
1970 return scalarizeVectorStore(Store, DAG);
1971
1972 EVT MemVT = Store->getMemoryVT();
1973 SDValue Chain = Store->getChain();
1974 SDValue BasePtr = Store->getBasePtr();
1975 SDLoc SL(Op);
1976
1977 EVT LoVT, HiVT;
1978 EVT LoMemVT, HiMemVT;
1979 SDValue Lo, Hi;
1980
1981 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1982 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1983 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1984
1985 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1986
// The high half's alignment is the base alignment reduced modulo the low
// half's byte size.
1987 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1988 Align BaseAlign = Store->getAlign();
1989 unsigned Size = LoMemVT.getStoreSize();
1990 Align HiAlign = commonAlignment(BaseAlign, Size);
1991
1992 SDValue LoStore =
1993 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1994 Store->getMemOperand()->getFlags(), Store->getAAInfo());
1995 SDValue HiStore = DAG.getTruncStore(
1996 Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size), HiMemVT, HiAlign,
1997 Store->getMemOperand()->getFlags(), Store->getAAInfo());
1998
1999 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
2000}
2001
2002// This is a shortcut for integer division because we have fast i32<->f32
2003// conversions, and fast f32 reciprocal instructions. The fractional part of a
2004// float is enough to accurately represent up to a 24-bit signed integer.
2006 bool Sign) const {
2007 SDLoc DL(Op);
2008 EVT VT = Op.getValueType();
2009 SDValue LHS = Op.getOperand(0);
2010 SDValue RHS = Op.getOperand(1);
2011 MVT IntVT = MVT::i32;
2012 MVT FltVT = MVT::f32;
2013
2014 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
2015 if (LHSSignBits < 9)
2016 return SDValue();
2017
2018 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
2019 if (RHSSignBits < 9)
2020 return SDValue();
2021
2022 unsigned BitSize = VT.getSizeInBits();
2023 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
2024 unsigned DivBits = BitSize - SignBits;
2025 if (Sign)
2026 ++DivBits;
2027
2030
2031 SDValue jq = DAG.getConstant(1, DL, IntVT);
2032
2033 if (Sign) {
2034 // char|short jq = ia ^ ib;
2035 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
2036
2037 // jq = jq >> (bitsize - 2)
2038 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
2039 DAG.getConstant(BitSize - 2, DL, VT));
2040
2041 // jq = jq | 0x1
2042 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
2043 }
2044
2045 // int ia = (int)LHS;
2046 SDValue ia = LHS;
2047
2048 // int ib, (int)RHS;
2049 SDValue ib = RHS;
2050
2051 // float fa = (float)ia;
2052 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
2053
2054 // float fb = (float)ib;
2055 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
2056
2057 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
2058 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
2059
2060 // fq = trunc(fq);
2061 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
2062
2063 // float fqneg = -fq;
2064 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
2065
2067
2068 bool UseFmadFtz = false;
2069 if (Subtarget->isGCN()) {
2071 UseFmadFtz =
2073 }
2074
2075 // float fr = mad(fqneg, fb, fa);
2076 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2077 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
2079 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
2080
2081 // int iq = (int)fq;
2082 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
2083
2084 // fr = fabs(fr);
2085 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
2086
2087 // fb = fabs(fb);
2088 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
2089
2090 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2091
2092 // int cv = fr >= fb;
2093 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
2094
2095 // jq = (cv ? jq : 0);
2096 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
2097
2098 // dst = iq + jq;
2099 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
2100
2101 // Rem needs compensation, it's easier to recompute it
2102 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
2103 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
2104
2105 // Truncate to number of bits this divide really is.
2106 if (Sign) {
2107 SDValue InRegSize
2108 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
2109 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
2110 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
2111 } else {
2112 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
2113 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
2114 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
2115 }
2116
2117 return DAG.getMergeValues({ Div, Rem }, DL);
2118}
2119
2121 SelectionDAG &DAG,
2123 SDLoc DL(Op);
2124 EVT VT = Op.getValueType();
2125
2126 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2127
2128 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2129
2130 SDValue One = DAG.getConstant(1, DL, HalfVT);
2131 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
2132
2133 //HiLo split
2134 SDValue LHS_Lo, LHS_Hi;
2135 SDValue LHS = Op.getOperand(0);
2136 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
2137
2138 SDValue RHS_Lo, RHS_Hi;
2139 SDValue RHS = Op.getOperand(1);
2140 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2141
2142 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2143 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
2144
2145 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2146 LHS_Lo, RHS_Lo);
2147
2148 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2149 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2150
2151 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2152 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2153 return;
2154 }
2155
2156 if (isTypeLegal(MVT::i64)) {
2157 // The algorithm here is based on ideas from "Software Integer Division",
2158 // Tom Rodeheffer, August 2008.
2159
2162
2163 // Compute denominator reciprocal.
2164 unsigned FMAD =
2165 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2168 : (unsigned)AMDGPUISD::FMAD_FTZ;
2169
2170 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2171 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2172 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2173 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2174 Cvt_Lo);
2175 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2176 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2177 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2178 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2179 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2180 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2181 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2182 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2183 Mul1);
2184 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2185 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2186 SDValue Rcp64 = DAG.getBitcast(VT,
2187 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2188
2189 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2190 SDValue One64 = DAG.getConstant(1, DL, VT);
2191 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2192 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2193
2194 // First round of UNR (Unsigned integer Newton-Raphson).
2195 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2196 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2197 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2198 SDValue Mulhi1_Lo, Mulhi1_Hi;
2199 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2200 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2201 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2202 Mulhi1_Lo, Zero1);
2203 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2204 Mulhi1_Hi, Add1_Lo.getValue(1));
2205 SDValue Add1 = DAG.getBitcast(VT,
2206 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2207
2208 // Second round of UNR.
2209 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2210 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2211 SDValue Mulhi2_Lo, Mulhi2_Hi;
2212 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2213 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2214 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2215 Mulhi2_Lo, Zero1);
2216 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2217 Mulhi2_Hi, Add2_Lo.getValue(1));
2218 SDValue Add2 = DAG.getBitcast(VT,
2219 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2220
2221 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2222
2223 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2224
2225 SDValue Mul3_Lo, Mul3_Hi;
2226 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2227 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2228 Mul3_Lo, Zero1);
2229 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2230 Mul3_Hi, Sub1_Lo.getValue(1));
2231 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2232 SDValue Sub1 = DAG.getBitcast(VT,
2233 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2234
2235 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2236 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2237 ISD::SETUGE);
2238 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2239 ISD::SETUGE);
2240 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2241
2242 // TODO: Here and below portions of the code can be enclosed into if/endif.
2243 // Currently control flow is unconditional and we have 4 selects after
2244 // potential endif to substitute PHIs.
2245
2246 // if C3 != 0 ...
2247 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2248 RHS_Lo, Zero1);
2249 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2250 RHS_Hi, Sub1_Lo.getValue(1));
2251 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2252 Zero, Sub2_Lo.getValue(1));
2253 SDValue Sub2 = DAG.getBitcast(VT,
2254 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2255
2256 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2257
2258 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2259 ISD::SETUGE);
2260 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2261 ISD::SETUGE);
2262 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2263
2264 // if (C6 != 0)
2265 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2266
2267 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2268 RHS_Lo, Zero1);
2269 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2270 RHS_Hi, Sub2_Lo.getValue(1));
2271 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2272 Zero, Sub3_Lo.getValue(1));
2273 SDValue Sub3 = DAG.getBitcast(VT,
2274 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2275
2276 // endif C6
2277 // endif C3
2278
2279 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2280 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2281
2282 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2283 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2284
2285 Results.push_back(Div);
2286 Results.push_back(Rem);
2287
2288 return;
2289 }
2290
2291 // r600 expandion.
2292 // Get Speculative values
2293 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2294 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2295
2296 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2297 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2298 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2299
2300 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2301 SDValue DIV_Lo = Zero;
2302
2303 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2304
2305 for (unsigned i = 0; i < halfBitWidth; ++i) {
2306 const unsigned bitPos = halfBitWidth - i - 1;
2307 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2308 // Get value of high bit
2309 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2310 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2311 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2312
2313 // Shift
2314 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2315 // Add LHS high bit
2316 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2317
2318 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2319 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2320
2321 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2322
2323 // Update REM
2324 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2325 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2326 }
2327
2328 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2329 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2330 Results.push_back(DIV);
2331 Results.push_back(REM);
2332}
2333
2335 SelectionDAG &DAG) const {
2336 SDLoc DL(Op);
2337 EVT VT = Op.getValueType();
2338
2339 if (VT == MVT::i64) {
2341 LowerUDIVREM64(Op, DAG, Results);
2342 return DAG.getMergeValues(Results, DL);
2343 }
2344
2345 if (VT == MVT::i32) {
2346 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2347 return Res;
2348 }
2349
2350 SDValue X = Op.getOperand(0);
2351 SDValue Y = Op.getOperand(1);
2352
2353 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2354 // algorithm used here.
2355
2356 // Initial estimate of inv(y).
2357 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2358
2359 // One round of UNR.
2360 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2361 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2362 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2363 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2364
2365 // Quotient/remainder estimate.
2366 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2367 SDValue R =
2368 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2369
2370 // First quotient/remainder refinement.
2371 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2372 SDValue One = DAG.getConstant(1, DL, VT);
2373 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2374 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2375 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2376 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2377 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2378
2379 // Second quotient/remainder refinement.
2380 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2381 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2382 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2383 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2384 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2385
2386 return DAG.getMergeValues({Q, R}, DL);
2387}
2388
2390 SelectionDAG &DAG) const {
2391 SDLoc DL(Op);
2392 EVT VT = Op.getValueType();
2393
2394 SDValue LHS = Op.getOperand(0);
2395 SDValue RHS = Op.getOperand(1);
2396
2397 SDValue Zero = DAG.getConstant(0, DL, VT);
2398 SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
2399
2400 if (VT == MVT::i32) {
2401 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2402 return Res;
2403 }
2404
2405 if (VT == MVT::i64 &&
2406 DAG.ComputeNumSignBits(LHS) > 32 &&
2407 DAG.ComputeNumSignBits(RHS) > 32) {
2408 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2409
2410 //HiLo split
2411 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2412 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2413 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2414 LHS_Lo, RHS_Lo);
2415 SDValue Res[2] = {
2416 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2417 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2418 };
2419 return DAG.getMergeValues(Res, DL);
2420 }
2421
2422 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2423 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2424 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2425 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2426
2427 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2428 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2429
2430 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2431 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2432
2433 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2434 SDValue Rem = Div.getValue(1);
2435
2436 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2437 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2438
2439 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2440 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2441
2442 SDValue Res[2] = {
2443 Div,
2444 Rem
2445 };
2446 return DAG.getMergeValues(Res, DL);
2447}
2448
2450 SDLoc SL(Op);
2451 SDValue Src = Op.getOperand(0);
2452
2453 // result = trunc(src)
2454 // if (src > 0.0 && src != result)
2455 // result += 1.0
2456
2457 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2458
2459 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2460 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2461
2462 EVT SetCCVT =
2463 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2464
2465 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2466 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2467 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2468
2469 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2470 // TODO: Should this propagate fast-math-flags?
2471 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2472}
2473
2475 SelectionDAG &DAG) {
2476 const unsigned FractBits = 52;
2477 const unsigned ExpBits = 11;
2478
2479 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2480 Hi,
2481 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2482 DAG.getConstant(ExpBits, SL, MVT::i32));
2483 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2484 DAG.getConstant(1023, SL, MVT::i32));
2485
2486 return Exp;
2487}
2488
2490 SDLoc SL(Op);
2491 SDValue Src = Op.getOperand(0);
2492
2493 assert(Op.getValueType() == MVT::f64);
2494
2495 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2496
2497 // Extract the upper half, since this is where we will find the sign and
2498 // exponent.
2499 SDValue Hi = getHiHalf64(Src, DAG);
2500
2501 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2502
2503 const unsigned FractBits = 52;
2504
2505 // Extract the sign bit.
2506 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2507 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2508
2509 // Extend back to 64-bits.
2510 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2511 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2512
2513 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2514 const SDValue FractMask
2515 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2516
2517 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2518 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2519 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2520
2521 EVT SetCCVT =
2522 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2523
2524 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2525
2526 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2527 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2528
2529 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2530 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2531
2532 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2533}
2534
2536 SelectionDAG &DAG) const {
2537 SDLoc SL(Op);
2538 SDValue Src = Op.getOperand(0);
2539
2540 assert(Op.getValueType() == MVT::f64);
2541
2542 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2543 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2544 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2545
2546 // TODO: Should this propagate fast-math-flags?
2547
2548 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2549 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2550
2551 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2552
2553 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2554 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2555
2556 EVT SetCCVT =
2557 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2558 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2559
2560 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2561}
2562
2564 SelectionDAG &DAG) const {
2565 // FNEARBYINT and FRINT are the same, except in their handling of FP
2566 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2567 // rint, so just treat them as equivalent.
2568 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2569 Op.getOperand(0));
2570}
2571
2573 auto VT = Op.getValueType();
2574 auto Arg = Op.getOperand(0u);
2575 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2576}
2577
2578// XXX - May require not supporting f32 denormals?
2579
2580// Don't handle v2f16. The extra instructions to scalarize and repack around the
2581// compare and vselect end up producing worse code than scalarizing the whole
2582// operation.
2584 SDLoc SL(Op);
2585 SDValue X = Op.getOperand(0);
2586 EVT VT = Op.getValueType();
2587
2588 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2589
2590 // TODO: Should this propagate fast-math-flags?
2591
2592 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2593
2594 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2595
2596 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2597 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2598
2599 EVT SetCCVT =
2600 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2601
2602 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2603 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2604 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2605
2606 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2607 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2608}
2609
2611 SDLoc SL(Op);
2612 SDValue Src = Op.getOperand(0);
2613
2614 // result = trunc(src);
2615 // if (src < 0.0 && src != result)
2616 // result += -1.0.
2617
2618 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2619
2620 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2621 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2622
2623 EVT SetCCVT =
2624 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2625
2626 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2627 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2628 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2629
2630 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2631 // TODO: Should this propagate fast-math-flags?
2632 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2633}
2634
2635/// Return true if it's known that \p Src can never be an f32 denormal value.
2637 switch (Src.getOpcode()) {
2638 case ISD::FP_EXTEND:
2639 return Src.getOperand(0).getValueType() == MVT::f16;
2640 case ISD::FP16_TO_FP:
2641 case ISD::FFREXP:
2642 case ISD::FSQRT:
2643 case AMDGPUISD::LOG:
2644 case AMDGPUISD::EXP:
2645 return true;
2647 unsigned IntrinsicID = Src.getConstantOperandVal(0);
2648 switch (IntrinsicID) {
2649 case Intrinsic::amdgcn_frexp_mant:
2650 case Intrinsic::amdgcn_log:
2651 case Intrinsic::amdgcn_log_clamp:
2652 case Intrinsic::amdgcn_exp2:
2653 case Intrinsic::amdgcn_sqrt:
2654 return true;
2655 default:
2656 return false;
2657 }
2658 }
2659 default:
2660 return false;
2661 }
2662
2663 llvm_unreachable("covered opcode switch");
2664}
2665
2667 SDNodeFlags Flags) {
2668 return Flags.hasApproximateFuncs();
2669}
2670
2679
2681 SDValue Src,
2682 SDNodeFlags Flags) const {
2683 SDLoc SL(Src);
2684 EVT VT = Src.getValueType();
2685 const fltSemantics &Semantics = VT.getFltSemantics();
2686 SDValue SmallestNormal =
2687 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2688
2689 // Want to scale denormals up, but negatives and 0 work just as well on the
2690 // scaled path.
2691 SDValue IsLtSmallestNormal = DAG.getSetCC(
2692 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2693 SmallestNormal, ISD::SETOLT);
2694
2695 return IsLtSmallestNormal;
2696}
2697
2699 SDNodeFlags Flags) const {
2700 SDLoc SL(Src);
2701 EVT VT = Src.getValueType();
2702 const fltSemantics &Semantics = VT.getFltSemantics();
2703 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2704
2705 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2706 SDValue IsFinite = DAG.getSetCC(
2707 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2708 Inf, ISD::SETOLT);
2709 return IsFinite;
2710}
2711
2712/// If denormal handling is required return the scaled input to FLOG2, and the
2713/// check for denormal range. Otherwise, return null values.
2714std::pair<SDValue, SDValue>
2716 SDValue Src, SDNodeFlags Flags) const {
2717 if (!needsDenormHandlingF32(DAG, Src, Flags))
2718 return {};
2719
2720 MVT VT = MVT::f32;
2721 const fltSemantics &Semantics = APFloat::IEEEsingle();
2722 SDValue SmallestNormal =
2723 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2724
2725 SDValue IsLtSmallestNormal = DAG.getSetCC(
2726 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2727 SmallestNormal, ISD::SETOLT);
2728
2729 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2730 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2731 SDValue ScaleFactor =
2732 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2733
2734 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2735 return {ScaledInput, IsLtSmallestNormal};
2736}
2737
2739 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2740 // If we have to handle denormals, scale up the input and adjust the result.
2741
2742 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2743 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
2744
2745 SDLoc SL(Op);
2746 EVT VT = Op.getValueType();
2747 SDValue Src = Op.getOperand(0);
2748 SDNodeFlags Flags = Op->getFlags();
2749
2750 if (VT == MVT::f16) {
2751 // Nothing in half is a denormal when promoted to f32.
2752 assert(!isTypeLegal(VT));
2753 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2754 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2755 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2756 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2757 }
2758
2759 auto [ScaledInput, IsLtSmallestNormal] =
2760 getScaledLogInput(DAG, SL, Src, Flags);
2761 if (!ScaledInput)
2762 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2763
2764 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2765
2766 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2767 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2768 SDValue ResultOffset =
2769 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2770 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2771}
2772
2773static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2774 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2775 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2776 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2777}
2778
2780 SelectionDAG &DAG) const {
2781 SDValue X = Op.getOperand(0);
2782 EVT VT = Op.getValueType();
2783 SDNodeFlags Flags = Op->getFlags();
2784 SDLoc DL(Op);
2785 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2786 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2787
2788 if (VT == MVT::f16 || Flags.hasApproximateFuncs()) {
2789 // TODO: The direct f16 path is 1.79 ulp for f16. This should be used
2790 // depending on !fpmath metadata.
2791
2792 bool PromoteToF32 = VT == MVT::f16 && (!Flags.hasApproximateFuncs() ||
2793 !isTypeLegal(MVT::f16));
2794
2795 if (PromoteToF32) {
2796 // Log and multiply in f32 is always good enough for f16.
2797 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2798 }
2799
2800 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2801 if (PromoteToF32) {
2802 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2803 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2804 }
2805
2806 return Lowered;
2807 }
2808
2809 SDValue ScaledInput, IsScaled;
2810 if (VT == MVT::f16)
2811 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2812 else {
2813 std::tie(ScaledInput, IsScaled) = getScaledLogInput(DAG, DL, X, Flags);
2814 if (ScaledInput)
2815 X = ScaledInput;
2816 }
2817
2818 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2819
2820 SDValue R;
2821 if (Subtarget->hasFastFMAF32()) {
2822 // c+cc are ln(2)/ln(10) to more than 49 bits
2823 const float c_log10 = 0x1.344134p-2f;
2824 const float cc_log10 = 0x1.09f79ep-26f;
2825
2826 // c + cc is ln(2) to more than 49 bits
2827 const float c_log = 0x1.62e42ep-1f;
2828 const float cc_log = 0x1.efa39ep-25f;
2829
2830 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2831 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2832 // This adds correction terms for which contraction may lead to an increase
2833 // in the error of the approximation, so disable it.
2834 Flags.setAllowContract(false);
2835 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2836 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2837 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2838 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2839 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2840 } else {
2841 // ch+ct is ln(2)/ln(10) to more than 36 bits
2842 const float ch_log10 = 0x1.344000p-2f;
2843 const float ct_log10 = 0x1.3509f6p-18f;
2844
2845 // ch + ct is ln(2) to more than 36 bits
2846 const float ch_log = 0x1.62e000p-1f;
2847 const float ct_log = 0x1.0bfbe8p-15f;
2848
2849 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2850 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2851
2852 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2853 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2854 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2855 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2856 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2857 // This adds correction terms for which contraction may lead to an increase
2858 // in the error of the approximation, so disable it.
2859 Flags.setAllowContract(false);
2860 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2861 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2862 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
2863 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2864 }
2865
2866 const bool IsFiniteOnly = Flags.hasNoNaNs() && Flags.hasNoInfs();
2867
2868 // TODO: Check if known finite from source value.
2869 if (!IsFiniteOnly) {
2870 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2871 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2872 }
2873
2874 if (IsScaled) {
2875 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2876 SDValue ShiftK =
2877 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2878 SDValue Shift =
2879 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2880 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2881 }
2882
2883 return R;
2884}
2885
2889
2890// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a
2891// promote f16 operation.
2893 SelectionDAG &DAG, bool IsLog10,
2894 SDNodeFlags Flags) const {
2895 EVT VT = Src.getValueType();
2896 unsigned LogOp =
2897 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2898
2899 double Log2BaseInverted =
2901
2902 if (VT == MVT::f32) {
2903 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2904 if (ScaledInput) {
2905 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2906 SDValue ScaledResultOffset =
2907 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2908
2909 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2910
2911 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2912 ScaledResultOffset, Zero, Flags);
2913
2914 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2915
2916 if (Subtarget->hasFastFMAF32())
2917 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2918 Flags);
2919 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
2920 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2921 }
2922 }
2923
2924 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2925 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2926
2927 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2928 Flags);
2929}
2930
2931// This expansion gives a result slightly better than 1ulp.
2933 SelectionDAG &DAG) const {
2934 SDLoc DL(Op);
2935 SDValue X = Op.getOperand(0);
2936
2937 // TODO: Check if reassoc is safe. There is an output change in exp2 and
2938 // exp10, which slightly increases ulp.
2939 SDNodeFlags Flags = Op->getFlags() & ~SDNodeFlags::AllowReassociation;
2940
2941 SDValue DN, F, T;
2942
2943 if (Op.getOpcode() == ISD::FEXP2) {
2944 // dn = rint(x)
2945 DN = DAG.getNode(ISD::FRINT, DL, MVT::f64, X, Flags);
2946 // f = x - dn
2947 F = DAG.getNode(ISD::FSUB, DL, MVT::f64, X, DN, Flags);
2948 // t = f*C1 + f*C2
2949 SDValue C1 = DAG.getConstantFP(0x1.62e42fefa39efp-1, DL, MVT::f64);
2950 SDValue C2 = DAG.getConstantFP(0x1.abc9e3b39803fp-56, DL, MVT::f64);
2951 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f64, F, C2, Flags);
2952 T = DAG.getNode(ISD::FMA, DL, MVT::f64, F, C1, Mul2, Flags);
2953 } else if (Op.getOpcode() == ISD::FEXP10) {
2954 // dn = rint(x * C1)
2955 SDValue C1 = DAG.getConstantFP(0x1.a934f0979a371p+1, DL, MVT::f64);
2956 SDValue Mul = DAG.getNode(ISD::FMUL, DL, MVT::f64, X, C1, Flags);
2957 DN = DAG.getNode(ISD::FRINT, DL, MVT::f64, Mul, Flags);
2958
2959 // f = FMA(-dn, C2, FMA(-dn, C3, x))
2960 SDValue NegDN = DAG.getNode(ISD::FNEG, DL, MVT::f64, DN, Flags);
2961 SDValue C2 = DAG.getConstantFP(-0x1.9dc1da994fd21p-59, DL, MVT::f64);
2962 SDValue C3 = DAG.getConstantFP(0x1.34413509f79ffp-2, DL, MVT::f64);
2963 SDValue Inner = DAG.getNode(ISD::FMA, DL, MVT::f64, NegDN, C3, X, Flags);
2964 F = DAG.getNode(ISD::FMA, DL, MVT::f64, NegDN, C2, Inner, Flags);
2965
2966 // t = FMA(f, C4, f*C5)
2967 SDValue C4 = DAG.getConstantFP(0x1.26bb1bbb55516p+1, DL, MVT::f64);
2968 SDValue C5 = DAG.getConstantFP(-0x1.f48ad494ea3e9p-53, DL, MVT::f64);
2969 SDValue MulF = DAG.getNode(ISD::FMUL, DL, MVT::f64, F, C5, Flags);
2970 T = DAG.getNode(ISD::FMA, DL, MVT::f64, F, C4, MulF, Flags);
2971 } else { // ISD::FEXP
2972 // dn = rint(x * C1)
2973 SDValue C1 = DAG.getConstantFP(0x1.71547652b82fep+0, DL, MVT::f64);
2974 SDValue Mul = DAG.getNode(ISD::FMUL, DL, MVT::f64, X, C1, Flags);
2975 DN = DAG.getNode(ISD::FRINT, DL, MVT::f64, Mul, Flags);
2976
2977 // t = FMA(-dn, C2, FMA(-dn, C3, x))
2978 SDValue NegDN = DAG.getNode(ISD::FNEG, DL, MVT::f64, DN, Flags);
2979 SDValue C2 = DAG.getConstantFP(0x1.abc9e3b39803fp-56, DL, MVT::f64);
2980 SDValue C3 = DAG.getConstantFP(0x1.62e42fefa39efp-1, DL, MVT::f64);
2981 SDValue Inner = DAG.getNode(ISD::FMA, DL, MVT::f64, NegDN, C3, X, Flags);
2982 T = DAG.getNode(ISD::FMA, DL, MVT::f64, NegDN, C2, Inner, Flags);
2983 }
2984
2985 // Polynomial expansion for p
2986 SDValue P = DAG.getConstantFP(0x1.ade156a5dcb37p-26, DL, MVT::f64);
2987 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
2988 DAG.getConstantFP(0x1.28af3fca7ab0cp-22, DL, MVT::f64),
2989 Flags);
2990 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
2991 DAG.getConstantFP(0x1.71dee623fde64p-19, DL, MVT::f64),
2992 Flags);
2993 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
2994 DAG.getConstantFP(0x1.a01997c89e6b0p-16, DL, MVT::f64),
2995 Flags);
2996 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
2997 DAG.getConstantFP(0x1.a01a014761f6ep-13, DL, MVT::f64),
2998 Flags);
2999 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3000 DAG.getConstantFP(0x1.6c16c1852b7b0p-10, DL, MVT::f64),
3001 Flags);
3002 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3003 DAG.getConstantFP(0x1.1111111122322p-7, DL, MVT::f64), Flags);
3004 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3005 DAG.getConstantFP(0x1.55555555502a1p-5, DL, MVT::f64), Flags);
3006 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3007 DAG.getConstantFP(0x1.5555555555511p-3, DL, MVT::f64), Flags);
3008 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3009 DAG.getConstantFP(0x1.000000000000bp-1, DL, MVT::f64), Flags);
3010
3011 SDValue One = DAG.getConstantFP(1.0, DL, MVT::f64);
3012
3013 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P, One, Flags);
3014 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P, One, Flags);
3015
3016 // z = ldexp(p, (int)dn)
3017 SDValue DNInt = DAG.getNode(ISD::FP_TO_SINT, DL, MVT::i32, DN);
3018 SDValue Z = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, P, DNInt, Flags);
3019
3020 // Overflow/underflow guards
3021 SDValue CondHi = DAG.getSetCC(
3022 DL, MVT::i1, X, DAG.getConstantFP(1024.0, DL, MVT::f64), ISD::SETULE);
3023
3024 if (!Flags.hasNoInfs()) {
3025 SDValue PInf = DAG.getConstantFP(std::numeric_limits<double>::infinity(),
3026 DL, MVT::f64);
3027 Z = DAG.getSelect(DL, MVT::f64, CondHi, Z, PInf, Flags);
3028 }
3029
3030 SDValue CondLo = DAG.getSetCC(
3031 DL, MVT::i1, X, DAG.getConstantFP(-1075.0, DL, MVT::f64), ISD::SETUGE);
3032 SDValue Zero = DAG.getConstantFP(0.0, DL, MVT::f64);
3033 Z = DAG.getSelect(DL, MVT::f64, CondLo, Z, Zero, Flags);
3034
3035 return Z;
3036}
3037
3039 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3040 // If we have to handle denormals, scale up the input and adjust the result.
3041
3042 EVT VT = Op.getValueType();
3043 if (VT == MVT::f64)
3044 return lowerFEXPF64(Op, DAG);
3045
3046 SDLoc SL(Op);
3047 SDValue Src = Op.getOperand(0);
3048 SDNodeFlags Flags = Op->getFlags();
3049
3050 if (VT == MVT::f16) {
3051 // Nothing in half is a denormal when promoted to f32.
3052 assert(!isTypeLegal(MVT::f16));
3053 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
3054 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
3055 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
3056 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
3057 }
3058
3059 assert(VT == MVT::f32);
3060
3061 if (!needsDenormHandlingF32(DAG, Src, Flags))
3062 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
3063
3064 // bool needs_scaling = x < -0x1.f80000p+6f;
3065 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3066
3067 // -nextafter(128.0, -1)
3068 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
3069
3070 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3071
3072 SDValue NeedsScaling =
3073 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
3074
3075 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
3076 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3077
3078 SDValue AddOffset =
3079 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
3080
3081 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
3082 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
3083
3084 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
3085 SDValue One = DAG.getConstantFP(1.0, SL, VT);
3086 SDValue ResultScale =
3087 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
3088
3089 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
3090}
3091
3093 SelectionDAG &DAG,
3094 SDNodeFlags Flags,
3095 bool IsExp10) const {
3096 // exp(x) -> exp2(M_LOG2E_F * x);
3097 // exp10(x) -> exp2(log2(10) * x);
3098 EVT VT = X.getValueType();
3099 SDValue Const =
3100 DAG.getConstantFP(IsExp10 ? 0x1.a934f0p+1f : numbers::log2e, SL, VT);
3101
3102 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Const, Flags);
3103 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
3104 : (unsigned)ISD::FEXP2,
3105 SL, VT, Mul, Flags);
3106}
3107
3109 SelectionDAG &DAG,
3110 SDNodeFlags Flags) const {
3111 EVT VT = X.getValueType();
3112 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags))
3113 return lowerFEXPUnsafeImpl(X, SL, DAG, Flags, /*IsExp10=*/false);
3114
3115 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3116
3117 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
3118 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
3119
3120 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
3121
3122 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
3123
3124 SDValue AdjustedX =
3125 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
3126
3127 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
3128 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
3129
3130 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
3131
3132 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
3133 SDValue AdjustedResult =
3134 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
3135
3136 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
3137 Flags);
3138}
3139
3140/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
3141/// handled correctly.
3143 SelectionDAG &DAG,
3144 SDNodeFlags Flags) const {
3145 const EVT VT = X.getValueType();
3146
3147 const unsigned Exp2Op = VT == MVT::f32 ? static_cast<unsigned>(AMDGPUISD::EXP)
3148 : static_cast<unsigned>(ISD::FEXP2);
3149
3150 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
3151 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3152 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3153 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3154
3155 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
3156 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3157 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
3158 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3159 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
3160 }
3161
3162 // bool s = x < -0x1.2f7030p+5f;
3163 // x += s ? 0x1.0p+5f : 0.0f;
3164 // exp10 = exp2(x * 0x1.a92000p+1f) *
3165 // exp2(x * 0x1.4f0978p-11f) *
3166 // (s ? 0x1.9f623ep-107f : 1.0f);
3167
3168 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3169
3170 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
3171 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
3172
3173 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
3174 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
3175 SDValue AdjustedX =
3176 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
3177
3178 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3179 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3180
3181 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
3182 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3183 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
3184 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3185
3186 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
3187
3188 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
3189 SDValue AdjustedResult =
3190 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
3191
3192 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
3193 Flags);
3194}
3195
3197 EVT VT = Op.getValueType();
3198
3199 if (VT == MVT::f64)
3200 return lowerFEXPF64(Op, DAG);
3201
3202 SDLoc SL(Op);
3203 SDValue X = Op.getOperand(0);
3204 SDNodeFlags Flags = Op->getFlags();
3205 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
3206
3207 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3208 // library behavior. Also, is known-not-daz source sufficient?
3209 if (allowApproxFunc(DAG, Flags)) { // TODO: Does this really require fast?
3210 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
3211 : lowerFEXPUnsafe(X, SL, DAG, Flags);
3212 }
3213
3214 if (VT.getScalarType() == MVT::f16) {
3215 if (VT.isVector())
3216 return SDValue();
3217
3218 // Nothing in half is a denormal when promoted to f32.
3219 //
3220 // exp(f16 x) ->
3221 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3222 //
3223 // exp10(f16 x) ->
3224 // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
3225 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
3226 SDValue Lowered = lowerFEXPUnsafeImpl(Ext, SL, DAG, Flags, IsExp10);
3227 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
3228 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
3229 }
3230
3231 assert(VT == MVT::f32);
3232
3233 // Algorithm:
3234 //
3235 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3236 //
3237 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3238 // n = 64*m + j, 0 <= j < 64
3239 //
3240 // e^x = 2^((64*m + j + f)/64)
3241 // = (2^m) * (2^(j/64)) * 2^(f/64)
3242 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3243 //
3244 // f = x*(64/ln(2)) - n
3245 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3246 //
3247 // e^x = (2^m) * (2^(j/64)) * e^r
3248 //
3249 // (2^(j/64)) is precomputed
3250 //
3251 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3252 // e^r = 1 + q
3253 //
3254 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3255 //
3256 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3257 SDNodeFlags FlagsNoContract = Flags;
3258 FlagsNoContract.setAllowContract(false);
3259
3260 SDValue PH, PL;
3261 if (Subtarget->hasFastFMAF32()) {
3262 const float c_exp = numbers::log2ef;
3263 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3264 const float c_exp10 = 0x1.a934f0p+1f;
3265 const float cc_exp10 = 0x1.2f346ep-24f;
3266
3267 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3268 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3269
3270 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3271 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
3272 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3273 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3274 } else {
3275 const float ch_exp = 0x1.714000p+0f;
3276 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3277
3278 const float ch_exp10 = 0x1.a92000p+1f;
3279 const float cl_exp10 = 0x1.4f0978p-11f;
3280
3281 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3282 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3283
3284 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3285 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3286 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3287 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3288 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3289
3290 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3291
3292 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3293 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3294 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3295 }
3296
3297 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3298
3299 // It is unsafe to contract this fsub into the PH multiply.
3300 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3301
3302 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3303 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3304 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3305
3306 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3307
3308 SDValue UnderflowCheckConst =
3309 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
3310
3311 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3312 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3313 SDValue Underflow =
3314 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3315
3316 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3317
3318 if (!Flags.hasNoInfs()) {
3319 SDValue OverflowCheckConst =
3320 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
3321 SDValue Overflow =
3322 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
3323 SDValue Inf =
3325 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3326 }
3327
3328 return R;
3329}
3330
3331static bool isCtlzOpc(unsigned Opc) {
3332 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_POISON;
3333}
3334
3335static bool isCttzOpc(unsigned Opc) {
3336 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_POISON;
3337}
3338
3340 SelectionDAG &DAG) const {
3341 auto SL = SDLoc(Op);
3342 auto Opc = Op.getOpcode();
3343 auto Arg = Op.getOperand(0u);
3344 auto ResultVT = Op.getValueType();
3345
3346 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3347 return {};
3348
3350 assert(ResultVT == Arg.getValueType());
3351
3352 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3353 SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
3354 SDValue NewOp;
3355
3356 if (Opc == ISD::CTLZ_ZERO_POISON) {
3357 NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
3358 NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
3359 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3360 } else {
3361 NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3362 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3363 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
3364 }
3365
3366 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3367}
3368
3370 SDLoc SL(Op);
3371 SDValue Src = Op.getOperand(0);
3372
3373 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3374 bool Ctlz = isCtlzOpc(Op.getOpcode());
3375 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3376
3377 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_POISON ||
3378 Op.getOpcode() == ISD::CTTZ_ZERO_POISON;
3379 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3380
3381 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3382 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3383 // (cttz hi:lo) -> (umin (ffbl src), 32)
3384 // (ctlz_zero_poison src) -> (ffbh src)
3385 // (cttz_zero_poison src) -> (ffbl src)
3386
3387 // 64-bit scalar version produce 32-bit result
3388 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3389 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3390 // (ctlz_zero_poison src) -> (S_FLBIT_I32_B64 src)
3391 // (cttz_zero_poison src) -> (S_FF1_I32_B64 src)
3392 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3393 if (!ZeroUndef) {
3394 const SDValue ConstVal = DAG.getConstant(
3395 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3396 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3397 }
3398 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3399 }
3400
3401 SDValue Lo, Hi;
3402 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3403
3404 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3405 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3406
3407 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3408 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3409 // (ctlz_zero_poison hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3410 // (cttz_zero_poison hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
3411
3412 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3413 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3414 if (Ctlz)
3415 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3416 else
3417 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3418
3419 SDValue NewOpr;
3420 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3421 if (!ZeroUndef) {
3422 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3423 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3424 }
3425
3426 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3427}
3428
3430 SDLoc SL(Op);
3431 SDValue Src = Op.getOperand(0);
3432 assert(Src.getValueType() == MVT::i32 && "LowerCTLS only supports i32");
3433 SDValue Ffbh = DAG.getNode(
3434 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
3435 DAG.getTargetConstant(Intrinsic::amdgcn_sffbh, SL, MVT::i32), Src);
3436 SDValue Clamped = DAG.getNode(ISD::UMIN, SL, MVT::i32, Ffbh,
3437 DAG.getConstant(32, SL, MVT::i32));
3438 return DAG.getNode(ISD::ADD, SL, MVT::i32, Clamped,
3439 DAG.getAllOnesConstant(SL, MVT::i32));
3440}
3441
3443 bool Signed) const {
3444 // The regular method converting a 64-bit integer to float roughly consists of
3445 // 2 steps: normalization and rounding. In fact, after normalization, the
3446 // conversion from a 64-bit integer to a float is essentially the same as the
3447 // one from a 32-bit integer. The only difference is that it has more
3448 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3449 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3450 // converted into the correct float number. The basic steps for the unsigned
3451 // conversion are illustrated in the following pseudo code:
3452 //
3453 // f32 uitofp(i64 u) {
3454 // i32 hi, lo = split(u);
3455 // // Only count the leading zeros in hi as we have native support of the
3456 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3457 // // reduced to a 32-bit one automatically.
3458 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3459 // u <<= shamt;
3460 // hi, lo = split(u);
3461 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3462 // // convert it as a 32-bit integer and scale the result back.
3463 // return uitofp(hi) * 2^(32 - shamt);
3464 // }
3465 //
3466 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3467 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
3468 // converted instead followed by negation based its sign bit.
3469
3470 SDLoc SL(Op);
3471 SDValue Src = Op.getOperand(0);
3472
3473 SDValue Lo, Hi;
3474 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3475 SDValue Sign;
3476 SDValue ShAmt;
3477 if (Signed && Subtarget->isGCN()) {
3478 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3479 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3480 // account. That is, the maximal shift is
3481 // - 32 if Lo and Hi have opposite signs;
3482 // - 33 if Lo and Hi have the same sign.
3483 //
3484 // Or, MaxShAmt = 33 + OppositeSign, where
3485 //
3486 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3487 // - -1 if Lo and Hi have opposite signs; and
3488 // - 0 otherwise.
3489 //
3490 // All in all, ShAmt is calculated as
3491 //
3492 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3493 //
3494 // or
3495 //
3496 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3497 //
3498 // to reduce the critical path.
3499 SDValue OppositeSign = DAG.getNode(
3500 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3501 DAG.getConstant(31, SL, MVT::i32));
3502 SDValue MaxShAmt =
3503 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3504 OppositeSign);
3505 // Count the leading sign bits.
3506 ShAmt = DAG.getNode(
3507 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
3508 DAG.getTargetConstant(Intrinsic::amdgcn_sffbh, SL, MVT::i32), Hi);
3509 // Different from unsigned conversion, the shift should be one bit less to
3510 // preserve the sign bit.
3511 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3512 DAG.getConstant(1, SL, MVT::i32));
3513 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3514 } else {
3515 if (Signed) {
3516 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3517 // absolute value first.
3518 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3519 DAG.getConstant(63, SL, MVT::i64));
3520 SDValue Abs =
3521 DAG.getNode(ISD::XOR, SL, MVT::i64,
3522 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3523 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3524 }
3525 // Count the leading zeros.
3526 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3527 // The shift amount for signed integers is [0, 32].
3528 }
3529 // Normalize the given 64-bit integer.
3530 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3531 // Split it again.
3532 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3533 // Calculate the adjust bit for rounding.
3534 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3535 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3536 DAG.getConstant(1, SL, MVT::i32), Lo);
3537 // Get the 32-bit normalized integer.
3538 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3539 // Convert the normalized 32-bit integer into f32.
3540
3541 bool UseLDEXP = isOperationLegal(ISD::FLDEXP, MVT::f32);
3542 unsigned Opc = Signed && UseLDEXP ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3543 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3544
3545 // Finally, need to scale back the converted floating number as the original
3546 // 64-bit integer is converted as a 32-bit one.
3547 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3548 ShAmt);
3549 // On GCN, use LDEXP directly.
3550 if (UseLDEXP)
3551 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3552
3553 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3554 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3555 // exponent is enough to avoid overflowing into the sign bit.
3556 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3557 DAG.getConstant(23, SL, MVT::i32));
3558 SDValue IVal =
3559 DAG.getNode(ISD::ADD, SL, MVT::i32,
3560 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3561 if (Signed) {
3562 // Set the sign bit.
3563 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3564 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3565 DAG.getConstant(31, SL, MVT::i32));
3566 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3567 }
3568 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3569}
3570
3572 bool Signed) const {
3573 SDLoc SL(Op);
3574 SDValue Src = Op.getOperand(0);
3575
3576 SDValue Lo, Hi;
3577 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3578
3580 SL, MVT::f64, Hi);
3581
3582 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3583
3584 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3585 DAG.getConstant(32, SL, MVT::i32));
3586 // TODO: Should this propagate fast-math-flags?
3587 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3588}
3589
3591 SelectionDAG &DAG) const {
3592 // TODO: Factor out code common with LowerSINT_TO_FP.
3593 EVT DestVT = Op.getValueType();
3594 SDValue Src = Op.getOperand(0);
3595 EVT SrcVT = Src.getValueType();
3596
3597 if (SrcVT == MVT::i16) {
3598 if (DestVT == MVT::f16)
3599 return Op;
3600 SDLoc DL(Op);
3601
3602 // Promote src to i32
3603 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3604 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3605 }
3606
3607 if (DestVT == MVT::bf16) {
3608 SDLoc SL(Op);
3609 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3610 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3611 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3612 }
3613
3614 if (SrcVT != MVT::i64)
3615 return Op;
3616
3617 if (DestVT == MVT::f16 && isTypeLegal(MVT::f16)) {
3618 SDLoc DL(Op);
3619
3620 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3621 SDValue FPRoundFlag =
3622 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3623 SDValue FPRound =
3624 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3625
3626 return FPRound;
3627 }
3628
3629 if (DestVT == MVT::f32)
3630 return LowerINT_TO_FP32(Op, DAG, false);
3631
3632 assert(DestVT == MVT::f64);
3633 return LowerINT_TO_FP64(Op, DAG, false);
3634}
3635
3637 SelectionDAG &DAG) const {
3638 EVT DestVT = Op.getValueType();
3639
3640 SDValue Src = Op.getOperand(0);
3641 EVT SrcVT = Src.getValueType();
3642
3643 if (SrcVT == MVT::i16) {
3644 if (DestVT == MVT::f16)
3645 return Op;
3646
3647 SDLoc DL(Op);
3648 // Promote src to i32
3649 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3650 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3651 }
3652
3653 if (DestVT == MVT::bf16) {
3654 SDLoc SL(Op);
3655 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3656 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3657 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3658 }
3659
3660 if (SrcVT != MVT::i64)
3661 return Op;
3662
3663 // TODO: Factor out code common with LowerUINT_TO_FP.
3664
3665 if (DestVT == MVT::f16 && isTypeLegal(MVT::f16)) {
3666 SDLoc DL(Op);
3667 SDValue Src = Op.getOperand(0);
3668
3669 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3670 SDValue FPRoundFlag =
3671 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3672 SDValue FPRound =
3673 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3674
3675 return FPRound;
3676 }
3677
3678 if (DestVT == MVT::f32)
3679 return LowerINT_TO_FP32(Op, DAG, true);
3680
3681 assert(DestVT == MVT::f64);
3682 return LowerINT_TO_FP64(Op, DAG, true);
3683}
3684
3686 bool Signed) const {
3687 SDLoc SL(Op);
3688
3689 SDValue Src = Op.getOperand(0);
3690 EVT SrcVT = Src.getValueType();
3691
3692 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3693
3694 // The basic idea of converting a floating point number into a pair of 32-bit
3695 // integers is illustrated as follows:
3696 //
3697 // tf := trunc(val);
3698 // hif := floor(tf * 2^-32);
3699 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3700 // hi := fptoi(hif);
3701 // lo := fptoi(lof);
3702 //
3703 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3704 SDValue Sign;
3705 if (Signed && SrcVT == MVT::f32) {
3706 // However, a 32-bit floating point number has only 23 bits mantissa and
3707 // it's not enough to hold all the significant bits of `lof` if val is
3708 // negative. To avoid the loss of precision, We need to take the absolute
3709 // value after truncating and flip the result back based on the original
3710 // signedness.
3711 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3712 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3713 DAG.getConstant(31, SL, MVT::i32));
3714 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3715 }
3716
3717 SDValue K0, K1;
3718 if (SrcVT == MVT::f64) {
3719 K0 = DAG.getConstantFP(
3720 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3721 SrcVT);
3722 K1 = DAG.getConstantFP(
3723 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3724 SrcVT);
3725 } else {
3726 K0 = DAG.getConstantFP(
3727 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3728 K1 = DAG.getConstantFP(
3729 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3730 }
3731 // TODO: Should this propagate fast-math-flags?
3732 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3733
3734 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3735
3736 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3737
3738 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3740 SL, MVT::i32, FloorMul);
3741 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3742
3743 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3744 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3745
3746 if (Signed && SrcVT == MVT::f32) {
3747 assert(Sign);
3748 // Flip the result based on the signedness, which is either all 0s or 1s.
3749 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3750 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3751 // r := xor(r, sign) - sign;
3752 Result =
3753 DAG.getNode(ISD::SUB, SL, MVT::i64,
3754 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3755 }
3756
3757 return Result;
3758}
3759
3761 SDLoc DL(Op);
3762 SDValue N0 = Op.getOperand(0);
3763
3764 // Convert to target node to get known bits
3765 if (N0.getValueType() == MVT::f32)
3766 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3767
3768 if (Op->getFlags().hasApproximateFuncs()) {
3769 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3770 return SDValue();
3771 }
3772
3773 return LowerF64ToF16Safe(N0, DL, DAG);
3774}
3775
3776// return node in i32
3778 SelectionDAG &DAG) const {
3779 assert(Src.getSimpleValueType() == MVT::f64);
3780
3781 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3782 // TODO: We can generate better code for True16.
3783 const unsigned ExpMask = 0x7ff;
3784 const unsigned ExpBiasf64 = 1023;
3785 const unsigned ExpBiasf16 = 15;
3786 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3787 SDValue One = DAG.getConstant(1, DL, MVT::i32);
3788 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Src);
3789 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3790 DAG.getConstant(32, DL, MVT::i64));
3791 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3792 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3793 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3794 DAG.getConstant(20, DL, MVT::i64));
3795 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3796 DAG.getConstant(ExpMask, DL, MVT::i32));
3797 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3798 // add the f16 bias (15) to get the biased exponent for the f16 format.
3799 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3800 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
3801
3802 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3803 DAG.getConstant(8, DL, MVT::i32));
3804 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3805 DAG.getConstant(0xffe, DL, MVT::i32));
3806
3807 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3808 DAG.getConstant(0x1ff, DL, MVT::i32));
3809 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3810
3811 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3812 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3813
3814 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3815 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3816 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3817 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
3818
3819 // N = M | (E << 12);
3820 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3821 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3822 DAG.getConstant(12, DL, MVT::i32)));
3823
3824 // B = clamp(1-E, 0, 13);
3825 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3826 One, E);
3827 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3828 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3829 DAG.getConstant(13, DL, MVT::i32));
3830
3831 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3832 DAG.getConstant(0x1000, DL, MVT::i32));
3833
3834 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3835 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3836 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3837 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3838
3839 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3840 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3841 DAG.getConstant(0x7, DL, MVT::i32));
3842 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3843 DAG.getConstant(2, DL, MVT::i32));
3844 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3845 One, Zero, ISD::SETEQ);
3846 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3847 One, Zero, ISD::SETGT);
3848 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3849 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3850
3851 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3852 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3853 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3854 I, V, ISD::SETEQ);
3855
3856 // Extract the sign bit.
3857 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3858 DAG.getConstant(16, DL, MVT::i32));
3859 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3860 DAG.getConstant(0x8000, DL, MVT::i32));
3861
3862 return DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3863}
3864
3866 SelectionDAG &DAG) const {
3867 SDValue Src = Op.getOperand(0);
3868 unsigned OpOpcode = Op.getOpcode();
3869 EVT SrcVT = Src.getValueType();
3870 EVT DestVT = Op.getValueType();
3871
3872 // Will be selected natively
3873 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3874 return Op;
3875
3876 if (SrcVT == MVT::bf16) {
3877 SDLoc DL(Op);
3878 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3879 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
3880 }
3881
3882 // Promote i16 to i32
3883 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3884 SDLoc DL(Op);
3885
3886 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3887 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3888 }
3889
3890 if (DestVT != MVT::i64)
3891 return Op;
3892
3893 if (SrcVT == MVT::f16 ||
3894 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3895 SDLoc DL(Op);
3896
3897 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3898 unsigned Ext =
3900 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3901 }
3902
3903 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3904 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3905
3906 return SDValue();
3907}
3908
3910 SelectionDAG &DAG) const {
3911 SDValue Src = Op.getOperand(0);
3912 unsigned OpOpcode = Op.getOpcode();
3913 EVT SrcVT = Src.getValueType();
3914 EVT DstVT = Op.getValueType();
3915 SDValue SatVTOp = Op.getNode()->getOperand(1);
3916 EVT SatVT = cast<VTSDNode>(SatVTOp)->getVT();
3917 SDLoc DL(Op);
3918
3919 uint64_t DstWidth = DstVT.getScalarSizeInBits();
3920 uint64_t SatWidth = SatVT.getScalarSizeInBits();
3921 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
3922
3923 // Will be selected natively
3924 if (DstVT == MVT::i32 && SatWidth == DstWidth &&
3925 (SrcVT == MVT::f32 || SrcVT == MVT::f64))
3926 return Op;
3927
3928 if (DstVT == MVT::i16 && SatWidth == DstWidth && SrcVT == MVT::f16)
3929 return Op;
3930
3931 // Perform all saturation at selected width (i16 or i32) and truncate
3932 if (SatWidth < DstWidth && SatWidth <= 32) {
3933 // For f16 conversion with sub-i16 saturation perform saturation
3934 // at i16, if available in the target. This removes the need for extra f16
3935 // to f32 conversion. For all the others use i32.
3936 MVT ResultVT =
3937 Subtarget->has16BitInsts() && SrcVT == MVT::f16 && SatWidth < 16
3938 ? MVT::i16
3939 : MVT::i32;
3940
3941 const SDValue ResultVTOp = DAG.getValueType(ResultVT);
3942 const uint64_t ResultWidth = ResultVT.getScalarSizeInBits();
3943
3944 // First, convert input float into selected integer (i16 or i32)
3945 SDValue FpToInt = DAG.getNode(OpOpcode, DL, ResultVT, Src, ResultVTOp);
3946 SDValue IntSatVal;
3947
3948 // Then, clamp at the saturation width using either i16 or i32 instructions
3949 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
3950 SDValue MinConst = DAG.getConstant(
3951 APInt::getSignedMaxValue(SatWidth).sext(ResultWidth), DL, ResultVT);
3952 SDValue MaxConst = DAG.getConstant(
3953 APInt::getSignedMinValue(SatWidth).sext(ResultWidth), DL, ResultVT);
3954 SDValue MinVal = DAG.getNode(ISD::SMIN, DL, ResultVT, FpToInt, MinConst);
3955 IntSatVal = DAG.getNode(ISD::SMAX, DL, ResultVT, MinVal, MaxConst);
3956 } else {
3957 SDValue MinConst = DAG.getConstant(
3958 APInt::getMaxValue(SatWidth).zext(ResultWidth), DL, ResultVT);
3959 IntSatVal = DAG.getNode(ISD::UMIN, DL, ResultVT, FpToInt, MinConst);
3960 }
3961
3962 // Finally, after saturating at i16 or i32 fit into the destination type
3963 return DAG.getExtOrTrunc(OpOpcode == ISD::FP_TO_SINT_SAT, IntSatVal, DL,
3964 DstVT);
3965 }
3966
3967 // SatWidth == DstWidth
3968
3969 // Saturate at i32 for i64 dst and f16/bf16 src (will invoke f16 promotion
3970 // below)
3971 if (DstVT == MVT::i64 &&
3972 (SrcVT == MVT::f16 || SrcVT == MVT::bf16 ||
3973 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP))) {
3974 const SDValue Int32VTOp = DAG.getValueType(MVT::i32);
3975 return DAG.getNode(OpOpcode, DL, DstVT, Src, Int32VTOp);
3976 }
3977
3978 // Promote f16/bf16 src to f32 for i32 conversion
3979 if (DstVT == MVT::i32 && (SrcVT == MVT::f16 || SrcVT == MVT::bf16)) {
3980 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3981 return DAG.getNode(Op.getOpcode(), DL, DstVT, PromotedSrc, SatVTOp);
3982 }
3983
3984 // For DstWidth < 16, promote i1 and i8 dst to i16 (if legal) with sub-i16
3985 // saturation. For DstWidth == 16, promote i16 dst to i32 with sub-i32
3986 // saturation; this covers i16.f32 and i16.f64
3987 if (DstWidth < 32) {
3988 // Note: this triggers SatWidth < DstWidth above to generate saturated
3989 // truncate by requesting MVT::i16/i32 destination with SatWidth < 16/32.
3990 MVT PromoteVT =
3991 (DstWidth < 16 && Subtarget->has16BitInsts()) ? MVT::i16 : MVT::i32;
3992 SDValue FpToInt = DAG.getNode(OpOpcode, DL, PromoteVT, Src, SatVTOp);
3993 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, FpToInt);
3994 }
3995
3996 // TODO: can we implement i64 dst for f32/f64?
3997
3998 return SDValue();
3999}
4000
4002 SelectionDAG &DAG) const {
4003 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4004 MVT VT = Op.getSimpleValueType();
4005 MVT ScalarVT = VT.getScalarType();
4006
4007 assert(VT.isVector());
4008
4009 SDValue Src = Op.getOperand(0);
4010 SDLoc DL(Op);
4011
4012 // TODO: Don't scalarize on Evergreen?
4013 unsigned NElts = VT.getVectorNumElements();
4015 DAG.ExtractVectorElements(Src, Args, 0, NElts);
4016
4017 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
4018 for (unsigned I = 0; I < NElts; ++I)
4019 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
4020
4021 return DAG.getBuildVector(VT, DL, Args);
4022}
4023
4024//===----------------------------------------------------------------------===//
4025// Custom DAG optimizations
4026//===----------------------------------------------------------------------===//
4027
4028static bool isU24(SDValue Op, SelectionDAG &DAG) {
4029 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
4030}
4031
4032static bool isI24(SDValue Op, SelectionDAG &DAG) {
4033 EVT VT = Op.getValueType();
4034 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
4035 // as unsigned 24-bit values.
4037}
4038
4041 SelectionDAG &DAG = DCI.DAG;
4042 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4043 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
4044
4045 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
4046 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
4047 unsigned NewOpcode = Node24->getOpcode();
4048 if (IsIntrin) {
4049 unsigned IID = Node24->getConstantOperandVal(0);
4050 switch (IID) {
4051 case Intrinsic::amdgcn_mul_i24:
4052 NewOpcode = AMDGPUISD::MUL_I24;
4053 break;
4054 case Intrinsic::amdgcn_mul_u24:
4055 NewOpcode = AMDGPUISD::MUL_U24;
4056 break;
4057 case Intrinsic::amdgcn_mulhi_i24:
4058 NewOpcode = AMDGPUISD::MULHI_I24;
4059 break;
4060 case Intrinsic::amdgcn_mulhi_u24:
4061 NewOpcode = AMDGPUISD::MULHI_U24;
4062 break;
4063 default:
4064 llvm_unreachable("Expected 24-bit mul intrinsic");
4065 }
4066 }
4067
4068 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
4069
4070 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
4071 // the operands to have other uses, but will only perform simplifications that
4072 // involve bypassing some nodes for this user.
4073 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
4074 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
4075 if (DemandedLHS || DemandedRHS)
4076 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
4077 DemandedLHS ? DemandedLHS : LHS,
4078 DemandedRHS ? DemandedRHS : RHS);
4079
4080 // Now try SimplifyDemandedBits which can simplify the nodes used by our
4081 // operands if this node is the only user.
4082 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
4083 return SDValue(Node24, 0);
4084 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
4085 return SDValue(Node24, 0);
4086
4087 return SDValue();
4088}
4089
4090template <typename IntTy>
4092 uint32_t Width, const SDLoc &DL) {
4093 if (Width + Offset < 32) {
4094 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
4095 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
4096 if constexpr (std::is_signed_v<IntTy>) {
4097 return DAG.getSignedConstant(Result, DL, MVT::i32);
4098 } else {
4099 return DAG.getConstant(Result, DL, MVT::i32);
4100 }
4101 }
4102
4103 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
4104}
4105
4106static bool hasVolatileUser(SDNode *Val) {
4107 for (SDNode *U : Val->users()) {
4108 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
4109 if (M->isVolatile())
4110 return true;
4111 }
4112 }
4113
4114 return false;
4115}
4116
4118 // i32 vectors are the canonical memory type.
4119 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
4120 return false;
4121
4122 if (!VT.isByteSized())
4123 return false;
4124
4125 unsigned Size = VT.getStoreSize();
4126
4127 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
4128 return false;
4129
4130 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
4131 return false;
4132
4133 return true;
4134}
4135
4136// Replace load of an illegal type with a bitcast from a load of a friendlier
4137// type.
4139 DAGCombinerInfo &DCI) const {
4140 if (!DCI.isBeforeLegalize())
4141 return SDValue();
4142
4144 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
4145 return SDValue();
4146
4147 SDLoc SL(N);
4148 SelectionDAG &DAG = DCI.DAG;
4149 EVT VT = LN->getMemoryVT();
4150
4151 unsigned Size = VT.getStoreSize();
4152 Align Alignment = LN->getAlign();
4153 if (Alignment < Size && isTypeLegal(VT)) {
4154 unsigned IsFast;
4155 unsigned AS = LN->getAddressSpace();
4156
4157 // Expand unaligned loads earlier than legalization. Due to visitation order
4158 // problems during legalization, the emitted instructions to pack and unpack
4159 // the bytes again are not eliminated in the case of an unaligned copy.
4161 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
4162 if (VT.isVector())
4163 return SplitVectorLoad(SDValue(LN, 0), DAG);
4164
4165 SDValue Ops[2];
4166 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
4167
4168 return DAG.getMergeValues(Ops, SDLoc(N));
4169 }
4170
4171 if (!IsFast)
4172 return SDValue();
4173 }
4174
4175 if (!shouldCombineMemoryType(VT))
4176 return SDValue();
4177
4178 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
4179
4180 SDValue NewLoad
4181 = DAG.getLoad(NewVT, SL, LN->getChain(),
4182 LN->getBasePtr(), LN->getMemOperand());
4183
4184 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
4185 DCI.CombineTo(N, BC, NewLoad.getValue(1));
4186 return SDValue(N, 0);
4187}
4188
4189// Replace store of an illegal type with a store of a bitcast to a friendlier
4190// type.
4192 DAGCombinerInfo &DCI) const {
4193 if (!DCI.isBeforeLegalize())
4194 return SDValue();
4195
4197 if (!SN->isSimple() || !ISD::isNormalStore(SN))
4198 return SDValue();
4199
4200 EVT VT = SN->getMemoryVT();
4201 unsigned Size = VT.getStoreSize();
4202
4203 SDLoc SL(N);
4204 SelectionDAG &DAG = DCI.DAG;
4205 Align Alignment = SN->getAlign();
4206 if (Alignment < Size && isTypeLegal(VT)) {
4207 unsigned IsFast;
4208 unsigned AS = SN->getAddressSpace();
4209
4210 // Expand unaligned stores earlier than legalization. Due to visitation
4211 // order problems during legalization, the emitted instructions to pack and
4212 // unpack the bytes again are not eliminated in the case of an unaligned
4213 // copy.
4215 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
4216 if (VT.isVector())
4217 return SplitVectorStore(SDValue(SN, 0), DAG);
4218
4219 return expandUnalignedStore(SN, DAG);
4220 }
4221
4222 if (!IsFast)
4223 return SDValue();
4224 }
4225
4226 if (!shouldCombineMemoryType(VT))
4227 return SDValue();
4228
4229 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
4230 SDValue Val = SN->getValue();
4231
4232 // DCI.AddToWorklist(Val.getNode());
4233
4234 bool OtherUses = !Val.hasOneUse();
4235 SDValue CastVal = DAG.getBitcast(NewVT, Val);
4236 if (OtherUses) {
4237 SDValue CastBack = DAG.getBitcast(VT, CastVal);
4238 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
4239 }
4240
4241 return DAG.getStore(SN->getChain(), SL, CastVal,
4242 SN->getBasePtr(), SN->getMemOperand());
4243}
4244
4245// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
4246// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
4247// issues.
4249 DAGCombinerInfo &DCI) const {
4250 SelectionDAG &DAG = DCI.DAG;
4251 SDValue N0 = N->getOperand(0);
4252
4253 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
4254 // (vt2 (truncate (assertzext vt0:x, vt1)))
4255 if (N0.getOpcode() == ISD::TRUNCATE) {
4256 SDValue N1 = N->getOperand(1);
4257 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
4258 SDLoc SL(N);
4259
4260 SDValue Src = N0.getOperand(0);
4261 EVT SrcVT = Src.getValueType();
4262 if (SrcVT.bitsGE(ExtVT)) {
4263 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
4264 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
4265 }
4266 }
4267
4268 return SDValue();
4269}
4270
4272 SDNode *N, DAGCombinerInfo &DCI) const {
4273 unsigned IID = N->getConstantOperandVal(0);
4274 switch (IID) {
4275 case Intrinsic::amdgcn_mul_i24:
4276 case Intrinsic::amdgcn_mul_u24:
4277 case Intrinsic::amdgcn_mulhi_i24:
4278 case Intrinsic::amdgcn_mulhi_u24:
4279 return simplifyMul24(N, DCI);
4280 case Intrinsic::amdgcn_fract:
4281 case Intrinsic::amdgcn_rsq:
4282 case Intrinsic::amdgcn_rcp_legacy:
4283 case Intrinsic::amdgcn_rsq_legacy:
4284 case Intrinsic::amdgcn_rsq_clamp:
4285 case Intrinsic::amdgcn_tanh:
4286 case Intrinsic::amdgcn_prng_b32: {
4287 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
4288 SDValue Src = N->getOperand(1);
4289 return Src.isUndef() ? Src : SDValue();
4290 }
4291 case Intrinsic::amdgcn_frexp_exp: {
4292 // frexp_exp (fneg x) -> frexp_exp x
4293 // frexp_exp (fabs x) -> frexp_exp x
4294 // frexp_exp (fneg (fabs x)) -> frexp_exp x
4295 SDValue Src = N->getOperand(1);
4296 SDValue PeekSign = peekFPSignOps(Src);
4297 if (PeekSign == Src)
4298 return SDValue();
4299 return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
4300 0);
4301 }
4302 default:
4303 return SDValue();
4304 }
4305}
4306
4307/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
4308/// binary operation \p Opc to it with the corresponding constant operands.
4310 DAGCombinerInfo &DCI, const SDLoc &SL,
4311 unsigned Opc, SDValue LHS,
4312 uint32_t ValLo, uint32_t ValHi) const {
4313 SelectionDAG &DAG = DCI.DAG;
4314 SDValue Lo, Hi;
4315 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
4316
4317 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
4318 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
4319
4320 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
4321 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
4322
4323 // Re-visit the ands. It's possible we eliminated one of them and it could
4324 // simplify the vector.
4325 DCI.AddToWorklist(Lo.getNode());
4326 DCI.AddToWorklist(Hi.getNode());
4327
4328 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
4329 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4330}
4331
4333 DAGCombinerInfo &DCI) const {
4334 EVT VT = N->getValueType(0);
4335 SDValue LHS = N->getOperand(0);
4336 SDValue RHS = N->getOperand(1);
4338 SDLoc SL(N);
4339 SelectionDAG &DAG = DCI.DAG;
4340
4341 unsigned RHSVal;
4342 if (CRHS) {
4343 RHSVal = CRHS->getZExtValue();
4344 if (!RHSVal)
4345 return LHS;
4346
4347 switch (LHS->getOpcode()) {
4348 default:
4349 break;
4350 case ISD::ZERO_EXTEND:
4351 case ISD::SIGN_EXTEND:
4352 case ISD::ANY_EXTEND: {
4353 SDValue X = LHS->getOperand(0);
4354
4355 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
4356 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
4357 // Prefer build_vector as the canonical form if packed types are legal.
4358 // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
4359 SDValue Vec = DAG.getBuildVector(
4360 MVT::v2i16, SL,
4361 {DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0)});
4362 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
4363 }
4364
4365 // shl (ext x) => zext (shl x), if shift does not overflow int
4366 if (VT != MVT::i64)
4367 break;
4368 KnownBits Known = DAG.computeKnownBits(X);
4369 unsigned LZ = Known.countMinLeadingZeros();
4370 if (LZ < RHSVal)
4371 break;
4372 EVT XVT = X.getValueType();
4373 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(CRHS, 0));
4374 return DAG.getZExtOrTrunc(Shl, SL, VT);
4375 }
4376 }
4377 }
4378
4379 if (VT.getScalarType() != MVT::i64)
4380 return SDValue();
4381
4382 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4383 // common case, splitting this into a move and a 32-bit shift is faster and
4384 // the same code size.
4385 KnownBits Known = DAG.computeKnownBits(RHS);
4386
4387 EVT ElementType = VT.getScalarType();
4388 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4389 EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);
4390
4391 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4392 return SDValue();
4393 SDValue ShiftAmt;
4394
4395 if (CRHS) {
4396 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4397 TargetType);
4398 } else {
4399 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4400 const SDValue ShiftMask =
4401 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4402 // This AND instruction will clamp out of bounds shift values.
4403 // It will also be removed during later instruction selection.
4404 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4405 }
4406
4407 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, LHS);
4408 SDValue NewShift =
4409 DAG.getNode(ISD::SHL, SL, TargetType, Lo, ShiftAmt, N->getFlags());
4410
4411 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4412 SDValue Vec;
4413
4414 if (VT.isVector()) {
4415 EVT ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4416 unsigned NElts = TargetType.getVectorNumElements();
4418 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4419
4420 DAG.ExtractVectorElements(NewShift, HiOps, 0, NElts);
4421 for (unsigned I = 0; I != NElts; ++I)
4422 HiAndLoOps[2 * I + 1] = HiOps[I];
4423 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4424 } else {
4425 EVT ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4426 Vec = DAG.getBuildVector(ConcatType, SL, {Zero, NewShift});
4427 }
4428 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4429}
4430
4432 DAGCombinerInfo &DCI) const {
4433 SDValue RHS = N->getOperand(1);
4435 EVT VT = N->getValueType(0);
4436 SDValue LHS = N->getOperand(0);
4437 SelectionDAG &DAG = DCI.DAG;
4438 SDLoc SL(N);
4439
4440 if (VT.getScalarType() != MVT::i64)
4441 return SDValue();
4442
4443 // For C >= 32
4444 // i64 (sra x, C) -> (build_pair (sra hi_32(x), C - 32), sra hi_32(x), 31))
4445
4446 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4447 // common case, splitting this into a move and a 32-bit shift is faster and
4448 // the same code size.
4449 KnownBits Known = DAG.computeKnownBits(RHS);
4450
4451 EVT ElementType = VT.getScalarType();
4452 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4453 EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);
4454
4455 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4456 return SDValue();
4457
4458 SDValue ShiftFullAmt =
4459 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4460 SDValue ShiftAmt;
4461 if (CRHS) {
4462 unsigned RHSVal = CRHS->getZExtValue();
4463 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4464 TargetType);
4465 } else if (Known.getMinValue().getZExtValue() ==
4466 (ElementType.getSizeInBits() - 1)) {
4467 ShiftAmt = ShiftFullAmt;
4468 } else {
4469 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4470 const SDValue ShiftMask =
4471 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4472 // This AND instruction will clamp out of bounds shift values.
4473 // It will also be removed during later instruction selection.
4474 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4475 }
4476
4477 EVT ConcatType;
4478 SDValue Hi;
4479 SDLoc LHSSL(LHS);
4480 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4481 if (VT.isVector()) {
4482 unsigned NElts = TargetType.getVectorNumElements();
4483 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4484 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4485 SmallVector<SDValue, 8> HiOps(NElts);
4486 SmallVector<SDValue, 16> HiAndLoOps;
4487
4488 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, 0, NElts * 2);
4489 for (unsigned I = 0; I != NElts; ++I) {
4490 HiOps[I] = HiAndLoOps[2 * I + 1];
4491 }
4492 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
4493 } else {
4494 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
4495 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4496 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4497 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
4498 }
4499
4500 KnownBits KnownLHS = DAG.computeKnownBits(LHS);
4501 SDValue NewShift, HiShift;
4502 if (KnownLHS.isNegative()) {
4503 HiShift = DAG.getAllOnesConstant(SL, TargetType);
4504 NewShift =
4505 DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4506 } else if (CRHS &&
4507 CRHS->getZExtValue() == (ElementType.getSizeInBits() - 1)) {
4508 NewShift = HiShift =
4509 DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4510 } else {
4511 Hi = DAG.getFreeze(Hi);
4512 HiShift = DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftFullAmt);
4513 NewShift =
4514 DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4515 }
4516
4517 SDValue Vec;
4518 if (VT.isVector()) {
4519 unsigned NElts = TargetType.getVectorNumElements();
4522 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2);
4523
4524 DAG.ExtractVectorElements(HiShift, HiOps, 0, NElts);
4525 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
4526 for (unsigned I = 0; I != NElts; ++I) {
4527 HiAndLoOps[2 * I + 1] = HiOps[I];
4528 HiAndLoOps[2 * I] = LoOps[I];
4529 }
4530 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4531 } else {
4532 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, HiShift});
4533 }
4534 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4535}
4536
4538 DAGCombinerInfo &DCI) const {
4539 SDValue RHS = N->getOperand(1);
4541 EVT VT = N->getValueType(0);
4542 SDValue LHS = N->getOperand(0);
4543 SelectionDAG &DAG = DCI.DAG;
4544 SDLoc SL(N);
4545 unsigned RHSVal;
4546
4547 if (CRHS) {
4548 RHSVal = CRHS->getZExtValue();
4549
4550 // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
4551 // this improves the ability to match BFE patterns in isel.
4552 if (LHS.getOpcode() == ISD::AND) {
4553 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
4554 unsigned MaskIdx, MaskLen;
4555 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4556 MaskIdx == RHSVal) {
4557 return DAG.getNode(ISD::AND, SL, VT,
4558 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0),
4559 N->getOperand(1)),
4560 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1),
4561 N->getOperand(1)));
4562 }
4563 }
4564 }
4565 }
4566
4567 if (VT.getScalarType() != MVT::i64)
4568 return SDValue();
4569
4570 // for C >= 32
4571 // i64 (srl x, C) -> (build_pair (srl hi_32(x), C - 32), 0)
4572
4573 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4574 // common case, splitting this into a move and a 32-bit shift is faster and
4575 // the same code size.
4576 KnownBits Known = DAG.computeKnownBits(RHS);
4577
4578 EVT ElementType = VT.getScalarType();
4579 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4580 EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);
4581
4582 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4583 return SDValue();
4584
4585 SDValue ShiftAmt;
4586 if (CRHS) {
4587 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4588 TargetType);
4589 } else {
4590 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4591 const SDValue ShiftMask =
4592 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4593 // This AND instruction will clamp out of bounds shift values.
4594 // It will also be removed during later instruction selection.
4595 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4596 }
4597
4598 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4599 EVT ConcatType;
4600 SDValue Hi;
4601 SDLoc LHSSL(LHS);
4602 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4603 if (VT.isVector()) {
4604 unsigned NElts = TargetType.getVectorNumElements();
4605 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4606 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4607 SmallVector<SDValue, 8> HiOps(NElts);
4608 SmallVector<SDValue, 16> HiAndLoOps;
4609
4610 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, /*Start=*/0, NElts * 2);
4611 for (unsigned I = 0; I != NElts; ++I)
4612 HiOps[I] = HiAndLoOps[2 * I + 1];
4613 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
4614 } else {
4615 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
4616 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4617 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4618 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
4619 }
4620
4621 SDValue NewShift =
4622 DAG.getNode(ISD::SRL, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4623
4624 SDValue Vec;
4625 if (VT.isVector()) {
4626 unsigned NElts = TargetType.getVectorNumElements();
4628 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4629
4630 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
4631 for (unsigned I = 0; I != NElts; ++I)
4632 HiAndLoOps[2 * I] = LoOps[I];
4633 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4634 } else {
4635 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, Zero});
4636 }
4637 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4638}
4639
4641 SDNode *N, DAGCombinerInfo &DCI) const {
4642 SDLoc SL(N);
4643 SelectionDAG &DAG = DCI.DAG;
4644 EVT VT = N->getValueType(0);
4645 SDValue Src = N->getOperand(0);
4646
4647 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4648 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4649 SDValue Vec = Src.getOperand(0);
4650 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4651 SDValue Elt0 = Vec.getOperand(0);
4652 EVT EltVT = Elt0.getValueType();
4653 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4654 if (EltVT.isFloatingPoint()) {
4655 Elt0 = DAG.getNode(ISD::BITCAST, SL,
4656 EltVT.changeTypeToInteger(), Elt0);
4657 }
4658
4659 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
4660 }
4661 }
4662 }
4663
4664 // Equivalent of above for accessing the high element of a vector as an
4665 // integer operation.
4666 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
4667 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4668 if (auto *K = isConstOrConstSplat(Src.getOperand(1))) {
4669 SDValue BV = stripBitcast(Src.getOperand(0));
4670 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
4671 EVT SrcEltVT = BV.getOperand(0).getValueType();
4672 unsigned SrcEltSize = SrcEltVT.getSizeInBits();
4673 unsigned BitIndex = K->getZExtValue();
4674 unsigned PartIndex = BitIndex / SrcEltSize;
4675
4676 if (PartIndex * SrcEltSize == BitIndex &&
4677 PartIndex < BV.getNumOperands()) {
4678 if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
4679 SDValue SrcElt =
4680 DAG.getNode(ISD::BITCAST, SL, SrcEltVT.changeTypeToInteger(),
4681 BV.getOperand(PartIndex));
4682 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
4683 }
4684 }
4685 }
4686 }
4687 }
4688
4689 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4690 //
4691 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4692 // i16 (trunc (srl (i32 (trunc x), K)))
4693 if (VT.getScalarSizeInBits() < 32) {
4694 EVT SrcVT = Src.getValueType();
4695 if (SrcVT.getScalarSizeInBits() > 32 &&
4696 (Src.getOpcode() == ISD::SRL ||
4697 Src.getOpcode() == ISD::SRA ||
4698 Src.getOpcode() == ISD::SHL)) {
4699 SDValue Amt = Src.getOperand(1);
4700 KnownBits Known = DAG.computeKnownBits(Amt);
4701
4702 // - For left shifts, do the transform as long as the shift
4703 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4704 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4705 // losing information stored in the high bits when truncating.
4706 const unsigned MaxCstSize =
4707 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4708 if (Known.getMaxValue().ule(MaxCstSize)) {
4709 EVT MidVT = VT.isVector() ?
4710 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4711 VT.getVectorNumElements()) : MVT::i32;
4712
4713 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
4714 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
4715 Src.getOperand(0));
4716 DCI.AddToWorklist(Trunc.getNode());
4717
4718 if (Amt.getValueType() != NewShiftVT) {
4719 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
4720 DCI.AddToWorklist(Amt.getNode());
4721 }
4722
4723 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
4724 Trunc, Amt);
4725 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
4726 }
4727 }
4728 }
4729
4730 return SDValue();
4731}
4732
4733// We need to specifically handle i64 mul here to avoid unnecessary conversion
4734// instructions. If we only match on the legalized i64 mul expansion,
4735// SimplifyDemandedBits will be unable to remove them because there will be
4736// multiple uses due to the separate mul + mulh[su].
4737static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4738 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4739 if (Size <= 32) {
4740 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4741 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
4742 }
4743
4744 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4745 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4746
4747 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
4748 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
4749
4750 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
4751}
4752
4753/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4754/// return SDValue().
4755static SDValue getAddOneOp(const SDNode *V) {
4756 if (V->getOpcode() != ISD::ADD)
4757 return SDValue();
4758
4759 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
4760}
4761
// NOTE(review): the opening signature line of this definition
// ('...performMulCombine(SDNode *N,') is missing from this rendering of the
// source; the remainder of the body appears intact.
//
// MUL combine: undoes the InstCombine canonicalization
// mul x, (add y, 1) -> add (mul x, y), x so mad patterns can match, and
// shrinks scalar muls whose operands fit in 24 bits into the fast
// MUL_[IU]24 forms. Returns an empty SDValue when no combine applies.
4763 DAGCombinerInfo &DCI) const {
4764 assert(N->getOpcode() == ISD::MUL);
4765 EVT VT = N->getValueType(0);
4766
4767 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4768 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4769 // unnecessarily). isDivergent() is used as an approximation of whether the
4770 // value is in an SGPR.
4771 if (!N->isDivergent())
4772 return SDValue();
4773
// Only scalar types up to 64 bits are handled below.
4774 unsigned Size = VT.getSizeInBits();
4775 if (VT.isVector() || Size > 64)
4776 return SDValue();
4777
4778 SelectionDAG &DAG = DCI.DAG;
4779 SDLoc DL(N);
4780
4781 SDValue N0 = N->getOperand(0);
4782 SDValue N1 = N->getOperand(1);
4783
4784 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
4785 // matching.
4786
4787 // mul x, (add y, 1) -> add (mul x, y), x
// Only fold when the add is single-use or every user is another MUL, so we
// don't duplicate work.
4788 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4789 SDValue AddOp = getAddOneOp(V.getNode());
4790 if (!AddOp)
4791 return SDValue();
4792
4793 if (V.hasOneUse() || all_of(V->users(), [](const SDNode *U) -> bool {
4794 return U->getOpcode() == ISD::MUL;
4795 }))
4796 return AddOp;
4797
4798 return SDValue();
4799 };
4800
4801 // FIXME: The selection pattern is not properly checking for commuted
4802 // operands, so we have to place the mul in the LHS
4803 if (SDValue MulOper = IsFoldableAdd(N0)) {
4804 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
4805 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
4806 }
4807
4808 if (SDValue MulOper = IsFoldableAdd(N1)) {
4809 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
4810 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
4811 }
4812
4813 // There are i16 integer mul/mad.
4814 if (isTypeLegal(MVT::i16) && VT.getScalarType().bitsLE(MVT::i16))
4815 return SDValue();
4816
4817 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4818 // in the source into any_extends if the result of the mul is truncated. Since
4819 // we can assume the high bits are whatever we want, use the underlying value
4820 // to avoid the unknown high bits from interfering.
4821 if (N0.getOpcode() == ISD::ANY_EXTEND)
4822 N0 = N0.getOperand(0);
4823
4824 if (N1.getOpcode() == ISD::ANY_EXTEND)
4825 N1 = N1.getOperand(0);
4826
4827 SDValue Mul;
4828
// Prefer the unsigned 24-bit form when both operands are provably u24;
// otherwise try the signed i24 form.
4829 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4830 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4831 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4832 Mul = getMul24(DAG, DL, N0, N1, Size, false);
4833 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4834 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4835 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4836 Mul = getMul24(DAG, DL, N0, N1, Size, true);
4837 } else {
4838 return SDValue();
4839 }
4840
4841 // We need to use sext even for MUL_U24, because MUL_U24 is used
4842 // for signed multiply of 8 and 16-bit types.
4843 return DAG.getSExtOrTrunc(Mul, DL, VT);
4844}
4845
4846SDValue
// NOTE(review): the '...performMulLoHiCombine(SDNode *N,' signature line is
// missing from this rendering of the source; the body below appears intact.
//
// [SU]MUL_LOHI combine: when both operands fit in 24 bits, replace the
// extending multiply with a MUL_[IU]24 / MULHI_[IU]24 pair (one node per
// result half) and rewrite both results of N via CombineTo.
4848 DAGCombinerInfo &DCI) const {
4849 if (N->getValueType(0) != MVT::i32)
4850 return SDValue();
4851
4852 SelectionDAG &DAG = DCI.DAG;
4853 SDLoc DL(N);
4854
4855 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
4856 SDValue N0 = N->getOperand(0);
4857 SDValue N1 = N->getOperand(1);
4858
4859 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4860 // in the source into any_extends if the result of the mul is truncated. Since
4861 // we can assume the high bits are whatever we want, use the underlying value
4862 // to avoid the unknown high bits from interfering.
4863 if (N0.getOpcode() == ISD::ANY_EXTEND)
4864 N0 = N0.getOperand(0);
4865 if (N1.getOpcode() == ISD::ANY_EXTEND)
4866 N1 = N1.getOperand(0);
4867
4868 // Try to use two fast 24-bit multiplies (one for each half of the result)
4869 // instead of one slow extending multiply.
4870 unsigned LoOpcode = 0;
4871 unsigned HiOpcode = 0;
4872 if (Signed) {
4873 if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4874 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4875 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4876 LoOpcode = AMDGPUISD::MUL_I24;
4877 HiOpcode = AMDGPUISD::MULHI_I24;
4878 }
4879 } else {
4880 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4881 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4882 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4883 LoOpcode = AMDGPUISD::MUL_U24;
4884 HiOpcode = AMDGPUISD::MULHI_U24;
4885 }
4886 }
// LoOpcode == 0 means neither 24-bit form applied.
4887 if (!LoOpcode)
4888 return SDValue();
4889
4890 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
4891 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
4892 DCI.CombineTo(N, Lo, Hi);
4893 return SDValue(N, 0);
4894}
4895
// NOTE(review): the '...performMulhsCombine(SDNode *N,' signature line is
// missing from this rendering of the source; the body below appears intact.
//
// MULHS combine: fold a signed high-multiply of 24-bit values into
// MULHI_I24.
4897 DAGCombinerInfo &DCI) const {
4898 EVT VT = N->getValueType(0);
4899
4900 if (!Subtarget->hasMulI24() || VT.isVector())
4901 return SDValue();
4902
4903 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4904 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4905 // unnecessarily). isDivergent() is used as an approximation of whether the
4906 // value is in an SGPR.
4907 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4908 // valu op anyway)
4909 if (Subtarget->hasSMulHi() && !N->isDivergent())
4910 return SDValue();
4911
4912 SelectionDAG &DAG = DCI.DAG;
4913 SDLoc DL(N);
4914
4915 SDValue N0 = N->getOperand(0);
4916 SDValue N1 = N->getOperand(1);
4917
// Both operands must be provably representable in 24 signed bits.
4918 if (!isI24(N0, DAG) || !isI24(N1, DAG))
4919 return SDValue();
4920
4921 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4922 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4923
4924 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
4925 DCI.AddToWorklist(Mulhi.getNode());
4926 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
4927}
4928
// NOTE(review): the '...performMulhuCombine(SDNode *N,' signature line is
// missing from this rendering of the source; the body below appears intact.
//
// MULHU combine: fold an unsigned high-multiply of 24-bit values into
// MULHI_U24.
4930 DAGCombinerInfo &DCI) const {
4931 EVT VT = N->getValueType(0);
4932
4933 if (VT.isVector() || VT.getSizeInBits() > 32 || !Subtarget->hasMulU24())
4934 return SDValue();
4935
4936 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4937 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4938 // unnecessarily). isDivergent() is used as an approximation of whether the
4939 // value is in an SGPR.
4940 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4941 // valu op anyway)
4942 if (!N->isDivergent() && Subtarget->hasSMulHi())
4943 return SDValue();
4944
4945 SelectionDAG &DAG = DCI.DAG;
4946 SDLoc DL(N);
4947
4948 SDValue N0 = N->getOperand(0);
4949 SDValue N1 = N->getOperand(1);
4950
// Both operands must be provably representable in 24 unsigned bits.
4951 if (!isU24(N0, DAG) || !isU24(N1, DAG))
4952 return SDValue();
4953
4954 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4955 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4956
4957 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
4958 DCI.AddToWorklist(Mulhi.getNode());
4959 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
4960}
4961
4962SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4963 SDValue Op,
4964 const SDLoc &DL,
4965 unsigned Opc) const {
4966 EVT VT = Op.getValueType();
4967 if (VT.bitsGT(MVT::i32))
4968 return SDValue();
4969
4970 if (VT != MVT::i32)
4971 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
4972
4973 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
4974 if (VT != MVT::i32)
4975 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
4976
4977 return FFBX;
4978}
4979
4980// The native instructions return -1 on 0 input. Optimize out a select that
4981// produces -1 on 0.
4982//
4983// TODO: If zero is not undef, we could also do this if the output is compared
4984// against the bitwidth.
4985//
4986// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
// NOTE(review): the first signature line of this definition
// ('...performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,' or similar)
// is missing from this rendering of the source.
4988 SDValue LHS, SDValue RHS,
4989 DAGCombinerInfo &DCI) const {
// Only selects whose condition compares against zero are of interest.
4990 if (!isNullConstant(Cond.getOperand(1)))
4991 return SDValue();
4992
4993 SelectionDAG &DAG = DCI.DAG;
4994 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
4995 SDValue CmpLHS = Cond.getOperand(0);
4996
4997 // select (setcc x, 0, eq), -1, (ctlz_zero_poison x) -> ffbh_u32 x
4998 // select (setcc x, 0, eq), -1, (cttz_zero_poison x) -> ffbl_u32 x
4999 if (CCOpcode == ISD::SETEQ &&
5000 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
5001 RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
5002 unsigned Opc =
5003 isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
5004 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
5005 }
5006
5007 // select (setcc x, 0, ne), (ctlz_zero_poison x), -1 -> ffbh_u32 x
5008 // select (setcc x, 0, ne), (cttz_zero_poison x), -1 -> ffbl_u32 x
5009 if (CCOpcode == ISD::SETNE &&
5010 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
5011 LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
5012 unsigned Opc =
5013 isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
5014
5015 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
5016 }
5017
5018 return SDValue();
5019}
5020
// NOTE(review): the first signature line of this definition
// ('static SDValue distributeOpThroughSelect(...DAGCombinerInfo &DCI,' or
// similar) is missing from this rendering of the source.
//
// Rebuild select(c, op(x), op(y)) as op(select(c, x, y)): create the new
// select over the operands' sources, queue it for revisiting, and wrap it
// in a single \p Op node. Both N1 and N2 must have an operand 0 to strip.
5022 unsigned Op,
5023 const SDLoc &SL,
5024 SDValue Cond,
5025 SDValue N1,
5026 SDValue N2) {
5027 SelectionDAG &DAG = DCI.DAG;
5028 EVT VT = N1.getValueType();
5029
5030 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
5031 N1.getOperand(0), N2.getOperand(0));
5032 DCI.AddToWorklist(NewSelect.getNode());
5033 return DAG.getNode(Op, SL, VT, NewSelect);
5034}
5035
5036// Pull a free FP operation out of a select so it may fold into uses.
5037//
5038// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
5039// select c, (fneg x), k -> fneg (select c, x, (fneg k))
5040//
5041// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
5042// select c, (fabs x), +k -> fabs (select c, x, k)
5043SDValue
// NOTE(review): the '...foldFreeOpFromSelect(DAGCombinerInfo &DCI,' line is
// missing from this rendering of the source; \p N is the SELECT node.
5045 SDValue N) const {
5046 SelectionDAG &DAG = DCI.DAG;
5047 SDValue Cond = N.getOperand(0);
5048 SDValue LHS = N.getOperand(1);
5049 SDValue RHS = N.getOperand(2);
5050
// Case 1: both select arms carry the same free op (fneg/fneg or fabs/fabs).
5051 EVT VT = N.getValueType();
5052 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
5053 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
// NOTE(review): original line 5054 (the condition guarding this early
// return) is missing from this rendering of the source.
5055 return SDValue();
5056
5057 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
5058 SDLoc(N), Cond, LHS, RHS);
5059 }
5060
// Case 2: one arm is fneg/fabs, the other a constant. Normalize the op to
// the LHS, remembering the swap so it can be undone below.
5061 bool Inv = false;
5062 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
5063 std::swap(LHS, RHS);
5064 Inv = true;
5065 }
5066
5067 // TODO: Support vector constants.
// NOTE(review): original line 5068 (presumably the definition of CRHS as a
// dyn_cast of RHS to ConstantFPSDNode) is missing from this rendering.
5069 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
5070 !selectSupportsSourceMods(N.getNode())) {
5071 SDLoc SL(N);
5072 // If one side is an fneg/fabs and the other is a constant, we can push the
5073 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
5074 SDValue NewLHS = LHS.getOperand(0);
5075 SDValue NewRHS = RHS;
5076
5077 // Careful: if the neg can be folded up, don't try to pull it back down.
5078 bool ShouldFoldNeg = true;
5079
5080 if (NewLHS.hasOneUse()) {
5081 unsigned Opc = NewLHS.getOpcode();
5082 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
5083 ShouldFoldNeg = false;
5084 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
5085 ShouldFoldNeg = false;
5086 }
5087
5088 if (ShouldFoldNeg) {
5089 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
5090 return SDValue();
5091
5092 // We're going to be forced to use a source modifier anyway, there's no
5093 // point to pulling the negate out unless we can get a size reduction by
5094 // negating the constant.
5095 //
5096 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
5097 // about cheaper constants.
5098 if (NewLHS.getOpcode() == ISD::FABS &&
// NOTE(review): original line 5099 (the second half of this condition) is
// missing from this rendering of the source.
5100 return SDValue();
5101
// NOTE(review): original line 5102 (the condition for this early return) is
// missing from this rendering of the source.
5103 return SDValue();
5104
5105 if (LHS.getOpcode() == ISD::FNEG)
5106 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5107
5108 if (Inv)
5109 std::swap(NewLHS, NewRHS);
5110
5111 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
5112 Cond, NewLHS, NewRHS);
5113 DCI.AddToWorklist(NewSelect.getNode());
5114 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
5115 }
5116 }
5117
5118 return SDValue();
5119}
5120
// NOTE(review): the '...performSelectCombine(SDNode *N,' signature line is
// missing from this rendering of the source; the body below appears intact
// except where noted.
//
// SELECT combine: tries, in order, to (1) pull a free fneg/fabs out of the
// select, (2) invert the compare so the constant lands in the false input
// (better VOPC cndmask usage), (3) form legacy fmin/fmax on f32, and
// (4) fold select-of-ctlz/cttz into FFBH/FFBL.
5122 DAGCombinerInfo &DCI) const {
5123 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
5124 return Folded;
5125
5126 SDValue Cond = N->getOperand(0);
5127 if (Cond.getOpcode() != ISD::SETCC)
5128 return SDValue();
5129
5130 EVT VT = N->getValueType(0);
5131 SDValue LHS = Cond.getOperand(0);
5132 SDValue RHS = Cond.getOperand(1);
5133 SDValue CC = Cond.getOperand(2);
5134
5135 SDValue True = N->getOperand(1);
5136 SDValue False = N->getOperand(2);
5137
5138 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
5139 SelectionDAG &DAG = DCI.DAG;
5140 if (DAG.isConstantValueOfAnyType(True) &&
5141 !DAG.isConstantValueOfAnyType(False)) {
5142 // Swap cmp + select pair to move constant to false input.
5143 // This will allow using VOPC cndmasks more often.
5144 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
5145
5146 SDLoc SL(N);
5147 ISD::CondCode NewCC =
5148 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
5149
5150 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
5151 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
5152 }
5153
5154 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
// NOTE(review): original line 5155 (presumably 'SDValue MinMax' binding the
// result of combineFMinMaxLegacy) is missing from this rendering.
5156 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
5157 // Revisit this node so we can catch min3/max3/med3 patterns.
5158 //DCI.AddToWorklist(MinMax.getNode());
5159 return MinMax;
5160 }
5161 }
5162
5163 // There's no reason to not do this if the condition has other uses.
5164 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
5165}
5166
5167static bool isInv2Pi(const APFloat &APF) {
5168 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
5169 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
5170 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
5171
5172 return APF.bitwiseIsEqual(KF16) ||
5173 APF.bitwiseIsEqual(KF32) ||
5174 APF.bitwiseIsEqual(KF64);
5175}
5176
5177// 0 and 1.0 / (0.5 * pi) do not have inline immmediates, so there is an
5178// additional cost to negate them.
// NOTE(review): original lines 5179-5180 (the
// '...getConstantNegateCost(const ConstantFPSDNode *C) const {' signature)
// are missing from this rendering of the source.
//
// Negating -0.0 or -1/(2*pi) yields the inline-immediate form (Cheaper);
// negating the positive form loses it (Expensive).
5181 if (C->isZero())
5182 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
5183
5184 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
5185 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
5186
// NOTE(review): original line 5187 (the fall-through return, presumably
// 'return NegatibleCost::Neutral;') is missing from this rendering.
5188}
5189
5195
5201
5202static unsigned inverseMinMax(unsigned Opc) {
5203 switch (Opc) {
5204 case ISD::FMAXNUM:
5205 return ISD::FMINNUM;
5206 case ISD::FMINNUM:
5207 return ISD::FMAXNUM;
5208 case ISD::FMAXNUM_IEEE:
5209 return ISD::FMINNUM_IEEE;
5210 case ISD::FMINNUM_IEEE:
5211 return ISD::FMAXNUM_IEEE;
5212 case ISD::FMAXIMUM:
5213 return ISD::FMINIMUM;
5214 case ISD::FMINIMUM:
5215 return ISD::FMAXIMUM;
5216 case ISD::FMAXIMUMNUM:
5217 return ISD::FMINIMUMNUM;
5218 case ISD::FMINIMUMNUM:
5219 return ISD::FMAXIMUMNUM;
5220 case AMDGPUISD::FMAX_LEGACY:
5221 return AMDGPUISD::FMIN_LEGACY;
5222 case AMDGPUISD::FMIN_LEGACY:
5223 return AMDGPUISD::FMAX_LEGACY;
5224 default:
5225 llvm_unreachable("invalid min/max opcode");
5226 }
5227}
5228
5229/// \return true if it's profitable to try to push an fneg into its source
5230/// instruction.
// NOTE(review): original line 5231 (the 'static bool
// shouldFoldFNegIntoSrc(SDNode *N, SDValue N0)' signature, presumably) is
// missing from this rendering of the source.
5232 // If the input has multiple uses and we can either fold the negate down, or
5233 // the other uses cannot, give up. This both prevents unprofitable
5234 // transformations and infinite loops: we won't repeatedly try to fold around
5235 // a negate that has no 'good' form.
5236 if (N0.hasOneUse()) {
5237 // This may be able to fold into the source, but at a code size cost. Don't
5238 // fold if the fold into the user is free.
5239 if (allUsesHaveSourceMods(N, 0))
5240 return false;
5241 } else {
5242 if (fnegFoldsIntoOp(N0.getNode()) &&
// NOTE(review): original line 5243 (the second half of this condition) is
// missing from this rendering of the source.
5244 return false;
5245 }
5246
5247 return true;
5248}
5249
// NOTE(review): the '...performFNegCombine(SDNode *N,' signature line is
// missing from this rendering of the source; the body below appears intact
// except where noted.
//
// FNEG combine: push the negate into its source node when profitable,
// exploiting free source modifiers. Each case below handles one source
// opcode; all return an empty SDValue when the fold does not apply, and
// re-negate the intermediate result for any remaining users of N0.
5251 DAGCombinerInfo &DCI) const {
5252 SelectionDAG &DAG = DCI.DAG;
5253 SDValue N0 = N->getOperand(0);
5254 EVT VT = N->getValueType(0);
5255
5256 unsigned Opc = N0.getOpcode();
5257
5258 if (!shouldFoldFNegIntoSrc(N, N0))
5259 return SDValue();
5260
5261 SDLoc SL(N);
5262 switch (Opc) {
5263 case ISD::FADD: {
5264 if (!mayIgnoreSignedZero(N0) && !N->getFlags().hasNoSignedZeros())
5265 return SDValue();
5266
5267 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
5268 SDValue LHS = N0.getOperand(0);
5269 SDValue RHS = N0.getOperand(1);
5270
5271 if (LHS.getOpcode() != ISD::FNEG)
5272 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
5273 else
5274 LHS = LHS.getOperand(0);
5275
5276 if (RHS.getOpcode() != ISD::FNEG)
5277 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5278 else
5279 RHS = RHS.getOperand(0);
5280
5281 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
5282 if (Res.getOpcode() != ISD::FADD)
5283 return SDValue(); // Op got folded away.
5284 if (!N0.hasOneUse())
5285 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5286 return Res;
5287 }
5288 case ISD::FMUL:
5289 case AMDGPUISD::FMUL_LEGACY: {
5290 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
5291 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
5292 SDValue LHS = N0.getOperand(0);
5293 SDValue RHS = N0.getOperand(1);
5294
// Strip an existing fneg from either operand; otherwise negate the RHS.
5295 if (LHS.getOpcode() == ISD::FNEG)
5296 LHS = LHS.getOperand(0);
5297 else if (RHS.getOpcode() == ISD::FNEG)
5298 RHS = RHS.getOperand(0);
5299 else
5300 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5301
5302 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
5303 if (Res.getOpcode() != Opc)
5304 return SDValue(); // Op got folded away.
5305 if (!N0.hasOneUse())
5306 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5307 return Res;
5308 }
5309 case ISD::FMA:
5310 case ISD::FMAD: {
5311 // TODO: handle llvm.amdgcn.fma.legacy
5312 if (!mayIgnoreSignedZero(N0) && !N->getFlags().hasNoSignedZeros())
5313 return SDValue();
5314
5315 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
5316 SDValue LHS = N0.getOperand(0);
5317 SDValue MHS = N0.getOperand(1);
5318 SDValue RHS = N0.getOperand(2);
5319
5320 if (LHS.getOpcode() == ISD::FNEG)
5321 LHS = LHS.getOperand(0);
5322 else if (MHS.getOpcode() == ISD::FNEG)
5323 MHS = MHS.getOperand(0);
5324 else
5325 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
5326
5327 if (RHS.getOpcode() != ISD::FNEG)
5328 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5329 else
5330 RHS = RHS.getOperand(0);
5331
5332 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
5333 if (Res.getOpcode() != Opc)
5334 return SDValue(); // Op got folded away.
5335 if (!N0.hasOneUse())
5336 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5337 return Res;
5338 }
5339 case ISD::FMAXNUM:
5340 case ISD::FMINNUM:
5341 case ISD::FMAXNUM_IEEE:
5342 case ISD::FMINNUM_IEEE:
5343 case ISD::FMINIMUM:
5344 case ISD::FMAXIMUM:
5345 case ISD::FMINIMUMNUM:
5346 case ISD::FMAXIMUMNUM:
5347 case AMDGPUISD::FMAX_LEGACY:
5348 case AMDGPUISD::FMIN_LEGACY: {
5349 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
5350 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
5351 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
5352 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
5353
5354 SDValue LHS = N0.getOperand(0);
5355 SDValue RHS = N0.getOperand(1);
5356
5357 // 0 doesn't have a negated inline immediate.
5358 // TODO: This constant check should be generalized to other operations.
// NOTE(review): original line 5359 (the condition guarding this early
// return) is missing from this rendering of the source.
5360 return SDValue();
5361
5362 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
5363 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5364 unsigned Opposite = inverseMinMax(Opc);
5365
5366 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
5367 if (Res.getOpcode() != Opposite)
5368 return SDValue(); // Op got folded away.
5369 if (!N0.hasOneUse())
5370 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5371 return Res;
5372 }
5373 case AMDGPUISD::FMED3: {
// fneg(med3(a, b, c)) -> med3(fneg a, fneg b, fneg c): negate all three
// operands and rebuild the med3.
5374 SDValue Ops[3];
5375 for (unsigned I = 0; I < 3; ++I)
5376 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
5377
5378 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
5379 if (Res.getOpcode() != AMDGPUISD::FMED3)
5380 return SDValue(); // Op got folded away.
5381
5382 if (!N0.hasOneUse()) {
5383 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
5384 DAG.ReplaceAllUsesWith(N0, Neg);
5385
5386 for (SDNode *U : Neg->users())
5387 DCI.AddToWorklist(U);
5388 }
5389
5390 return Res;
5391 }
5392 case ISD::FP_EXTEND:
5393 case ISD::FTRUNC:
5394 case ISD::FRINT:
5395 case ISD::FNEARBYINT: // XXX - Should fround be handled?
5396 case ISD::FROUNDEVEN:
5397 case ISD::FSIN:
5398 case ISD::FCANONICALIZE:
5399 case AMDGPUISD::RCP:
5400 case AMDGPUISD::RCP_LEGACY:
5401 case AMDGPUISD::RCP_IFLAG:
5402 case AMDGPUISD::SIN_HW: {
5403 SDValue CvtSrc = N0.getOperand(0);
5404 if (CvtSrc.getOpcode() == ISD::FNEG) {
5405 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
5406 // (fneg (rcp (fneg x))) -> (rcp x)
5407 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
5408 }
5409
5410 if (!N0.hasOneUse())
5411 return SDValue();
5412
5413 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
5414 // (fneg (rcp x)) -> (rcp (fneg x))
5415 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
5416 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
5417 }
5418 case ISD::FP_ROUND: {
5419 SDValue CvtSrc = N0.getOperand(0);
5420
5421 if (CvtSrc.getOpcode() == ISD::FNEG) {
5422 // (fneg (fp_round (fneg x))) -> (fp_round x)
5423 return DAG.getNode(ISD::FP_ROUND, SL, VT,
5424 CvtSrc.getOperand(0), N0.getOperand(1));
5425 }
5426
5427 if (!N0.hasOneUse())
5428 return SDValue();
5429
5430 // (fneg (fp_round x)) -> (fp_round (fneg x))
5431 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
5432 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
5433 }
5434 case ISD::FP16_TO_FP: {
5435 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
5436 // f16, but legalization of f16 fneg ends up pulling it out of the source.
5437 // Put the fneg back as a legal source operation that can be matched later.
5438 SDLoc SL(N);
5439
5440 SDValue Src = N0.getOperand(0);
5441 EVT SrcVT = Src.getValueType();
5442
5443 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
5444 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
5445 DAG.getConstant(0x8000, SL, SrcVT));
5446 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
5447 }
5448 case ISD::SELECT: {
5449 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
5450 // TODO: Invert conditions of foldFreeOpFromSelect
5451 return SDValue();
5452 }
5453 case ISD::BITCAST: {
5454 SDLoc SL(N);
5455 SDValue BCSrc = N0.getOperand(0);
5456 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
5457 SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
5458 if (HighBits.getValueType().getSizeInBits() != 32 ||
5459 !fnegFoldsIntoOp(HighBits.getNode()))
5460 return SDValue();
5461
5462 // f64 fneg only really needs to operate on the high half of of the
5463 // register, so try to force it to an f32 operation to help make use of
5464 // source modifiers.
5465 //
5466 //
5467 // fneg (f64 (bitcast (build_vector x, y))) ->
5468 // f64 (bitcast (build_vector (bitcast i32:x to f32),
5469 // (fneg (bitcast i32:y to f32)))
5470
5471 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
5472 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
5473 SDValue CastBack =
5474 DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);
5475
// NOTE(review): original line 5476 (presumably the declaration of 'Ops' as
// a copy of BCSrc's operand list) is missing from this rendering.
5477 Ops.back() = CastBack;
5478 DCI.AddToWorklist(NegHi.getNode());
5479 SDValue Build =
5480 DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
5481 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);
5482
5483 if (!N0.hasOneUse())
5484 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
5485 return Result;
5486 }
5487
5488 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
5489 BCSrc.hasOneUse()) {
5490 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
5491 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
5492
5493 // TODO: Cast back result for multiple uses is beneficial in some cases.
5494
5495 SDValue LHS =
5496 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
5497 SDValue RHS =
5498 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
5499
5500 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
5501 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
5502
5503 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
5504 NegRHS);
5505 }
5506
5507 return SDValue();
5508 }
5509 default:
5510 return SDValue();
5511 }
5512}
5513
// NOTE(review): the '...performFAbsCombine(SDNode *N,' signature line is
// missing from this rendering of the source; the body below appears intact.
//
// FABS combine: when f16 is illegal, fold fabs (fp16_to_fp x) into
// fp16_to_fp (and x, 0x7fff), clearing the f16 sign bit in the integer
// domain. Only applies to a single-use source.
5515 DAGCombinerInfo &DCI) const {
5516 SelectionDAG &DAG = DCI.DAG;
5517 SDValue N0 = N->getOperand(0);
5518
5519 if (!N0.hasOneUse())
5520 return SDValue();
5521
5522 switch (N0.getOpcode()) {
5523 case ISD::FP16_TO_FP: {
5524 assert(!isTypeLegal(MVT::f16) && "should only see if f16 is illegal");
5525 SDLoc SL(N);
5526 SDValue Src = N0.getOperand(0);
5527 EVT SrcVT = Src.getValueType();
5528
5529 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
5530 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
5531 DAG.getConstant(0x7fff, SL, SrcVT));
5532 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
5533 }
5534 default:
5535 return SDValue();
5536 }
5537}
5538
// NOTE(review): the '...performRcpCombine(SDNode *N,' signature line is
// missing from this rendering of the source; the body below appears intact.
//
// RCP combine: constant-fold rcp(k) to the FP constant 1.0/k when the
// operand is a ConstantFP.
5540 DAGCombinerInfo &DCI) const {
5541 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
5542 if (!CFP)
5543 return SDValue();
5544
5545 // XXX - Should this flush denormals?
5546 const APFloat &Val = CFP->getValueAPF();
5547 APFloat One(Val.getSemantics(), "1.0");
5548 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
5549}
5550
// NOTE(review): the opening signature lines of this definition (original
// lines 5550-5551) are missing from this rendering; from the call sites
// below (isInt64ImmLegal(C, DAG)) this appears to be
// AMDGPUTargetLowering::isInt64ImmLegal(SDNode *N, SelectionDAG &DAG) —
// TODO confirm against the upstream source.
//
// Decide whether a 64-bit integer/FP constant can be materialized directly
// (v_mov_b64 with an inline constant or a value fitting in 32 bits, or any
// value when 64-bit literals are supported).
5552 if (!Subtarget->isGCN())
5553 return false;
5554
// NOTE(review): original lines 5555-5556 (presumably the dyn_casts defining
// SDConstant and SDFPConstant from N) are missing from this rendering.
5557 auto &ST = DAG.getSubtarget<GCNSubtarget>();
5558 const auto *TII = ST.getInstrInfo();
5559
5560 if (!ST.hasVMovB64Inst() || (!SDConstant && !SDFPConstant))
5561 return false;
5562
5563 if (ST.has64BitLiterals())
5564 return true;
5565
5566 if (SDConstant) {
5567 const APInt &APVal = SDConstant->getAPIntValue();
5568 return isUInt<32>(APVal.getZExtValue()) || TII->isInlineConstant(APVal);
5569 }
5570
// FP constants are checked on their raw bit pattern.
5571 APInt Val = SDFPConstant->getValueAPF().bitcastToAPInt();
5572 return isUInt<32>(Val.getZExtValue()) || TII->isInlineConstant(Val);
5573}
5574
5576 DAGCombinerInfo &DCI) const {
5577 SelectionDAG &DAG = DCI.DAG;
5578 SDLoc DL(N);
5579
5580 switch(N->getOpcode()) {
5581 default:
5582 break;
5583 case ISD::BITCAST: {
5584 EVT DestVT = N->getValueType(0);
5585
5586 // Push casts through vector builds. This helps avoid emitting a large
5587 // number of copies when materializing floating point vector constants.
5588 //
5589 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
5590 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5591 if (DestVT.isVector()) {
5592 SDValue Src = N->getOperand(0);
5593 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5596 EVT SrcVT = Src.getValueType();
5597 unsigned NElts = DestVT.getVectorNumElements();
5598
5599 if (SrcVT.getVectorNumElements() == NElts) {
5600 EVT DestEltVT = DestVT.getVectorElementType();
5601
5602 SmallVector<SDValue, 8> CastedElts;
5603 SDLoc SL(N);
5604 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5605 SDValue Elt = Src.getOperand(I);
5606 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
5607 }
5608
5609 return DAG.getBuildVector(DestVT, SL, CastedElts);
5610 }
5611 }
5612 }
5613
5614 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5615 break;
5616
5617 // Fold bitcasts of constants.
5618 //
5619 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5620 // TODO: Generalize and move to DAGCombiner
5621 SDValue Src = N->getOperand(0);
5623 SDLoc SL(N);
5624 if (isInt64ImmLegal(C, DAG))
5625 break;
5626 uint64_t CVal = C->getZExtValue();
5627 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5628 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5629 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5630 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
5631 }
5632
5634 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5635 SDLoc SL(N);
5636 if (isInt64ImmLegal(C, DAG))
5637 break;
5638 uint64_t CVal = Val.getZExtValue();
5639 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5640 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5641 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5642
5643 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
5644 }
5645
5646 break;
5647 }
5648 case ISD::SHL:
5649 case ISD::SRA:
5650 case ISD::SRL: {
5651 // Range metadata can be invalidated when loads are converted to legal types
5652 // (e.g. v2i64 -> v4i32).
5653 // Try to convert vector shl/sra/srl before type legalization so that range
5654 // metadata can be utilized.
5655 if (!(N->getValueType(0).isVector() &&
5658 break;
5659 if (N->getOpcode() == ISD::SHL)
5660 return performShlCombine(N, DCI);
5661 if (N->getOpcode() == ISD::SRA)
5662 return performSraCombine(N, DCI);
5663 return performSrlCombine(N, DCI);
5664 }
5665 case ISD::TRUNCATE:
5666 return performTruncateCombine(N, DCI);
5667 case ISD::MUL:
5668 return performMulCombine(N, DCI);
5669 case AMDGPUISD::MUL_U24:
5670 case AMDGPUISD::MUL_I24: {
5671 if (SDValue Simplified = simplifyMul24(N, DCI))
5672 return Simplified;
5673 break;
5674 }
5675 case AMDGPUISD::MULHI_I24:
5676 case AMDGPUISD::MULHI_U24:
5677 return simplifyMul24(N, DCI);
5678 case ISD::SMUL_LOHI:
5679 case ISD::UMUL_LOHI:
5680 return performMulLoHiCombine(N, DCI);
5681 case ISD::MULHS:
5682 return performMulhsCombine(N, DCI);
5683 case ISD::MULHU:
5684 return performMulhuCombine(N, DCI);
5685 case ISD::SELECT:
5686 return performSelectCombine(N, DCI);
5687 case ISD::FNEG:
5688 return performFNegCombine(N, DCI);
5689 case ISD::FABS:
5690 return performFAbsCombine(N, DCI);
5691 case AMDGPUISD::BFE_I32:
5692 case AMDGPUISD::BFE_U32: {
5693 assert(!N->getValueType(0).isVector() &&
5694 "Vector handling of BFE not implemented");
5695 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
5696 if (!Width)
5697 break;
5698
5699 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5700 if (WidthVal == 0)
5701 return DAG.getConstant(0, DL, MVT::i32);
5702
5704 if (!Offset)
5705 break;
5706
5707 SDValue BitsFrom = N->getOperand(0);
5708 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5709
5710 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5711
5712 if (OffsetVal == 0) {
5713 // This is already sign / zero extended, so try to fold away extra BFEs.
5714 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
5715
5716 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
5717 if (OpSignBits >= SignBits)
5718 return BitsFrom;
5719
5720 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
5721 if (Signed) {
5722 // This is a sign_extend_inreg. Replace it to take advantage of existing
5723 // DAG Combines. If not eliminated, we will match back to BFE during
5724 // selection.
5725
5726 // TODO: The sext_inreg of extended types ends, although we can could
5727 // handle them in a single BFE.
5728 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
5729 DAG.getValueType(SmallVT));
5730 }
5731
5732 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
5733 }
5734
5735 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
5736 if (Signed) {
5737 return constantFoldBFE<int32_t>(DAG,
5738 CVal->getSExtValue(),
5739 OffsetVal,
5740 WidthVal,
5741 DL);
5742 }
5743
5744 return constantFoldBFE<uint32_t>(DAG,
5745 CVal->getZExtValue(),
5746 OffsetVal,
5747 WidthVal,
5748 DL);
5749 }
5750
5751 if ((OffsetVal + WidthVal) >= 32 &&
5752 !(OffsetVal == 16 && WidthVal == 16 && Subtarget->hasSDWA())) {
5753 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
5754 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
5755 BitsFrom, ShiftVal);
5756 }
5757
5758 if (BitsFrom.hasOneUse()) {
5759 APInt Demanded = APInt::getBitsSet(32,
5760 OffsetVal,
5761 OffsetVal + WidthVal);
5762
5763 KnownBits Known;
5765 !DCI.isBeforeLegalizeOps());
5766 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5767 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
5768 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
5769 DCI.CommitTargetLoweringOpt(TLO);
5770 }
5771 }
5772
5773 break;
5774 }
5775 case ISD::LOAD:
5776 return performLoadCombine(N, DCI);
5777 case ISD::STORE:
5778 return performStoreCombine(N, DCI);
5779 case AMDGPUISD::RCP:
5780 case AMDGPUISD::RCP_IFLAG:
5781 return performRcpCombine(N, DCI);
5782 case ISD::AssertZext:
5783 case ISD::AssertSext:
5784 return performAssertSZExtCombine(N, DCI);
5786 return performIntrinsicWOChainCombine(N, DCI);
5787 case AMDGPUISD::FMAD_FTZ: {
5788 SDValue N0 = N->getOperand(0);
5789 SDValue N1 = N->getOperand(1);
5790 SDValue N2 = N->getOperand(2);
5791 EVT VT = N->getValueType(0);
5792
5793 // FMAD_FTZ is a FMAD + flush denormals to zero.
5794 // We flush the inputs, the intermediate step, and the output.
5798 if (N0CFP && N1CFP && N2CFP) {
5799 const auto FTZ = [](const APFloat &V) {
5800 if (V.isDenormal()) {
5801 APFloat Zero(V.getSemantics(), 0);
5802 return V.isNegative() ? -Zero : Zero;
5803 }
5804 return V;
5805 };
5806
5807 APFloat V0 = FTZ(N0CFP->getValueAPF());
5808 APFloat V1 = FTZ(N1CFP->getValueAPF());
5809 APFloat V2 = FTZ(N2CFP->getValueAPF());
5811 V0 = FTZ(V0);
5813 return DAG.getConstantFP(FTZ(V0), DL, VT);
5814 }
5815 break;
5816 }
5817 }
5818 return SDValue();
5819}
5820
5822 SDValue Op, const APInt &OriginalDemandedBits,
5823 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
5824 unsigned Depth) const {
5825 switch (Op.getOpcode()) {
5827 switch (Op.getConstantOperandVal(0)) {
5828 case Intrinsic::amdgcn_readfirstlane:
5829 case Intrinsic::amdgcn_readlane:
5830 case Intrinsic::amdgcn_set_inactive:
5831 case Intrinsic::amdgcn_wwm: {
5832 if (SimplifyDemandedBits(Op.getOperand(1), OriginalDemandedBits,
5833 OriginalDemandedElts, Known, TLO, Depth + 1))
5834 return true;
5835 break;
5836 }
5837 default:
5838 break;
5839 }
5840 break;
5841 }
5842 default:
5843 break;
5844 }
5845
5846 return false;
5847}
5848
5849//===----------------------------------------------------------------------===//
5850// Helper functions
5851//===----------------------------------------------------------------------===//
5852
5854 const TargetRegisterClass *RC,
5855 Register Reg, EVT VT,
5856 const SDLoc &SL,
5857 bool RawReg) const {
5859 MachineRegisterInfo &MRI = MF.getRegInfo();
5860 Register VReg;
5861
5862 if (!MRI.isLiveIn(Reg)) {
5863 VReg = MRI.createVirtualRegister(RC);
5864 MRI.addLiveIn(Reg, VReg);
5865 } else {
5866 VReg = MRI.getLiveInVirtReg(Reg);
5867 }
5868
5869 if (RawReg)
5870 return DAG.getRegister(VReg, VT);
5871
5872 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
5873}
5874
5875// This may be called multiple times, and nothing prevents creating multiple
5876// objects at the same offset. See if we already defined this object.
5878 int64_t Offset) {
5879 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5880 if (MFI.getObjectOffset(I) == Offset) {
5881 assert(MFI.getObjectSize(I) == Size);
5882 return I;
5883 }
5884 }
5885
5886 return MFI.CreateFixedObject(Size, Offset, true);
5887}
5888
5890 EVT VT,
5891 const SDLoc &SL,
5892 int64_t Offset) const {
5894 MachineFrameInfo &MFI = MF.getFrameInfo();
5895 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
5896
5897 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5898 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
5899
5900 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
5903}
5904
5906 const SDLoc &SL,
5907 SDValue Chain,
5908 SDValue ArgVal,
5909 int64_t Offset) const {
5913
5914 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
5915 // Stores to the argument stack area are relative to the stack pointer.
5916 SDValue SP =
5917 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
5918 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
5919 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
5921 return Store;
5922}
5923
5925 const TargetRegisterClass *RC,
5926 EVT VT, const SDLoc &SL,
5927 const ArgDescriptor &Arg) const {
5928 assert(Arg && "Attempting to load missing argument");
5929
5930 SDValue V = Arg.isRegister() ?
5931 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
5932 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
5933
5934 if (!Arg.isMasked())
5935 return V;
5936
5937 unsigned Mask = Arg.getMask();
5938 unsigned Shift = llvm::countr_zero<unsigned>(Mask);
5939 V = DAG.getNode(ISD::SRL, SL, VT, V,
5940 DAG.getShiftAmountConstant(Shift, VT, SL));
5941 return DAG.getNode(ISD::AND, SL, VT, V,
5942 DAG.getConstant(Mask >> Shift, SL, VT));
5943}
5944
5946 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5947 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5948 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5949 uint64_t ArgOffset =
5950 alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
5951 switch (Param) {
5952 case FIRST_IMPLICIT:
5953 return ArgOffset;
5954 case PRIVATE_BASE:
5956 case SHARED_BASE:
5957 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5958 case QUEUE_PTR:
5959 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5960 }
5961 llvm_unreachable("unexpected implicit parameter type");
5962}
5963
5970
5972 SelectionDAG &DAG, int Enabled,
5973 int &RefinementSteps,
5974 bool &UseOneConstNR,
5975 bool Reciprocal) const {
5976 EVT VT = Operand.getValueType();
5977
5978 if (VT == MVT::f32) {
5979 RefinementSteps = 0;
5980 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
5981 }
5982
5983 // TODO: There is also f64 rsq instruction, but the documentation is less
5984 // clear on its precision.
5985
5986 return SDValue();
5987}
5988
5990 SelectionDAG &DAG, int Enabled,
5991 int &RefinementSteps) const {
5992 EVT VT = Operand.getValueType();
5993
5994 if (VT == MVT::f32) {
5995 // Reciprocal, < 1 ulp error.
5996 //
5997 // This reciprocal approximation converges to < 0.5 ulp error with one
5998 // newton rhapson performed with two fused multiple adds (FMAs).
5999
6000 RefinementSteps = 0;
6001 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
6002 }
6003
6004 // TODO: There is also f64 rcp instruction, but the documentation is less
6005 // clear on its precision.
6006
6007 return SDValue();
6008}
6009
6010static unsigned workitemIntrinsicDim(unsigned ID) {
6011 switch (ID) {
6012 case Intrinsic::amdgcn_workitem_id_x:
6013 return 0;
6014 case Intrinsic::amdgcn_workitem_id_y:
6015 return 1;
6016 case Intrinsic::amdgcn_workitem_id_z:
6017 return 2;
6018 default:
6019 llvm_unreachable("not a workitem intrinsic");
6020 }
6021}
6022
6024 const SDValue Op, KnownBits &Known,
6025 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
6026
6027 Known.resetAll(); // Don't know anything.
6028
6029 unsigned Opc = Op.getOpcode();
6030
6031 switch (Opc) {
6032 default:
6033 break;
6034 case AMDGPUISD::CARRY:
6035 case AMDGPUISD::BORROW: {
6036 Known.Zero = APInt::getHighBitsSet(32, 31);
6037 break;
6038 }
6039
6040 case AMDGPUISD::BFE_I32:
6041 case AMDGPUISD::BFE_U32: {
6042 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6043 if (!CWidth)
6044 return;
6045
6046 uint32_t Width = CWidth->getZExtValue() & 0x1f;
6047
6048 if (Opc == AMDGPUISD::BFE_U32)
6049 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
6050
6051 break;
6052 }
6053 case AMDGPUISD::FP_TO_FP16: {
6054 unsigned BitWidth = Known.getBitWidth();
6055
6056 // High bits are zero.
6058 break;
6059 }
6060 case AMDGPUISD::MUL_U24:
6061 case AMDGPUISD::MUL_I24: {
6062 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
6063 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
6064 unsigned BitWidth = Op.getScalarValueSizeInBits();
6065
6066 // Sign/Zero extend from 24 bits.
6067 if (Opc == AMDGPUISD::MUL_I24) {
6068 LHSKnown = LHSKnown.trunc(24).sext(BitWidth);
6069 RHSKnown = RHSKnown.trunc(24).sext(BitWidth);
6070 } else {
6071 LHSKnown = LHSKnown.trunc(24).zext(BitWidth);
6072 RHSKnown = RHSKnown.trunc(24).zext(BitWidth);
6073 }
6074
6075 // TODO: SelfMultiply can be poison, but not undef.
6076 bool SelfMultiply = Op.getOperand(0) == Op.getOperand(1);
6077 if (SelfMultiply)
6078 SelfMultiply &= DAG.isGuaranteedNotToBeUndefOrPoison(
6079 Op.getOperand(0), DemandedElts, UndefPoisonKind::UndefOrPoison,
6080 Depth + 1);
6081
6082 Known = KnownBits::mul(LHSKnown, RHSKnown, SelfMultiply);
6083 break;
6084 }
6085 case AMDGPUISD::PERM: {
6086 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6087 if (!CMask)
6088 return;
6089
6090 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
6091 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
6092 unsigned Sel = CMask->getZExtValue();
6093
6094 for (unsigned I = 0; I < 32; I += 8) {
6095 unsigned SelBits = Sel & 0xff;
6096 if (SelBits < 4) {
6097 SelBits *= 8;
6098 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
6099 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
6100 } else if (SelBits < 7) {
6101 SelBits = (SelBits & 3) * 8;
6102 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
6103 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
6104 } else if (SelBits == 0x0c) {
6105 Known.Zero |= 0xFFull << I;
6106 } else if (SelBits > 0x0c) {
6107 Known.One |= 0xFFull << I;
6108 }
6109 Sel >>= 8;
6110 }
6111 break;
6112 }
6113 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
6114 Known.Zero.setHighBits(24);
6115 break;
6116 }
6117 case AMDGPUISD::BUFFER_LOAD_USHORT: {
6118 Known.Zero.setHighBits(16);
6119 break;
6120 }
6121 case AMDGPUISD::LDS: {
6122 auto *GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
6123 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
6124
6125 Known.Zero.setHighBits(16);
6126 Known.Zero.setLowBits(Log2(Alignment));
6127 break;
6128 }
6129 case AMDGPUISD::SMIN3:
6130 case AMDGPUISD::SMAX3:
6131 case AMDGPUISD::SMED3:
6132 case AMDGPUISD::UMIN3:
6133 case AMDGPUISD::UMAX3:
6134 case AMDGPUISD::UMED3: {
6135 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
6136 if (Known2.isUnknown())
6137 break;
6138
6139 KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
6140 if (Known1.isUnknown())
6141 break;
6142
6143 KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
6144 if (Known0.isUnknown())
6145 break;
6146
6147 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
6148 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
6149 Known.One = Known0.One & Known1.One & Known2.One;
6150 break;
6151 }
6153 unsigned IID = Op.getConstantOperandVal(0);
6154 switch (IID) {
6155 case Intrinsic::amdgcn_workitem_id_x:
6156 case Intrinsic::amdgcn_workitem_id_y:
6157 case Intrinsic::amdgcn_workitem_id_z: {
6158 unsigned MaxValue = Subtarget->getMaxWorkitemID(
6160 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
6161 break;
6162 }
6163 default:
6164 break;
6165 }
6166 }
6167 }
6168}
6169
6171 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
6172 unsigned Depth) const {
6173 switch (Op.getOpcode()) {
6174 case AMDGPUISD::BFE_I32: {
6175 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6176 if (!Width)
6177 return 1;
6178
6179 unsigned SignBits = 32 - (Width->getZExtValue() & 0x1f) + 1;
6180 if (!isNullConstant(Op.getOperand(1)))
6181 return SignBits;
6182
6183 // TODO: Could probably figure something out with non-0 offsets.
6184 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
6185 return std::max(SignBits, Op0SignBits);
6186 }
6187
6188 case AMDGPUISD::BFE_U32: {
6189 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6190 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
6191 }
6192
6193 case AMDGPUISD::CARRY:
6194 case AMDGPUISD::BORROW:
6195 return 31;
6196 case AMDGPUISD::BUFFER_LOAD_BYTE:
6197 return 25;
6198 case AMDGPUISD::BUFFER_LOAD_SHORT:
6199 return 17;
6200 case AMDGPUISD::BUFFER_LOAD_UBYTE:
6201 return 24;
6202 case AMDGPUISD::BUFFER_LOAD_USHORT:
6203 return 16;
6204 case AMDGPUISD::FP_TO_FP16:
6205 return 16;
6206 case AMDGPUISD::SMIN3:
6207 case AMDGPUISD::SMAX3:
6208 case AMDGPUISD::SMED3:
6209 case AMDGPUISD::UMIN3:
6210 case AMDGPUISD::UMAX3:
6211 case AMDGPUISD::UMED3: {
6212 unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
6213 if (Tmp2 == 1)
6214 return 1; // Early out.
6215
6216 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
6217 if (Tmp1 == 1)
6218 return 1; // Early out.
6219
6220 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
6221 if (Tmp0 == 1)
6222 return 1; // Early out.
6223
6224 return std::min({Tmp0, Tmp1, Tmp2});
6225 }
6226 default:
6227 return 1;
6228 }
6229}
6230
6232 GISelValueTracking &Analysis, Register R, const APInt &DemandedElts,
6233 const MachineRegisterInfo &MRI, unsigned Depth) const {
6234 const MachineInstr *MI = MRI.getVRegDef(R);
6235 if (!MI)
6236 return 1;
6237
6238 // TODO: Check range metadata on MMO.
6239 switch (MI->getOpcode()) {
6240 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
6241 return 25;
6242 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
6243 return 17;
6244 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
6245 return 24;
6246 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
6247 return 16;
6248 case AMDGPU::G_AMDGPU_SMED3:
6249 case AMDGPU::G_AMDGPU_UMED3: {
6250 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
6251 unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
6252 if (Tmp2 == 1)
6253 return 1;
6254 unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
6255 if (Tmp1 == 1)
6256 return 1;
6257 unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
6258 if (Tmp0 == 1)
6259 return 1;
6260 return std::min({Tmp0, Tmp1, Tmp2});
6261 }
6262 default:
6263 return 1;
6264 }
6265}
6266
6268 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
6269 UndefPoisonKind Kind, bool ConsiderFlags, unsigned Depth) const {
6270 unsigned Opcode = Op.getOpcode();
6271 switch (Opcode) {
6272 case AMDGPUISD::BFE_I32:
6273 case AMDGPUISD::BFE_U32:
6274 return false;
6275 }
6277 Op, DemandedElts, DAG, Kind, ConsiderFlags, Depth);
6278}
6279
6281 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN,
6282 unsigned Depth) const {
6283 unsigned Opcode = Op.getOpcode();
6284 switch (Opcode) {
6285 case AMDGPUISD::FMIN_LEGACY:
6286 case AMDGPUISD::FMAX_LEGACY: {
6287 if (SNaN)
6288 return true;
6289
6290 // TODO: Can check no nans on one of the operands for each one, but which
6291 // one?
6292 return false;
6293 }
6294 case AMDGPUISD::FMUL_LEGACY:
6295 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
6296 if (SNaN)
6297 return true;
6298 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
6299 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6300 }
6301 case AMDGPUISD::FMED3:
6302 case AMDGPUISD::FMIN3:
6303 case AMDGPUISD::FMAX3:
6304 case AMDGPUISD::FMINIMUM3:
6305 case AMDGPUISD::FMAXIMUM3:
6306 case AMDGPUISD::FMAD_FTZ: {
6307 if (SNaN)
6308 return true;
6309 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
6310 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6311 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6312 }
6313 case AMDGPUISD::CVT_F32_UBYTE0:
6314 case AMDGPUISD::CVT_F32_UBYTE1:
6315 case AMDGPUISD::CVT_F32_UBYTE2:
6316 case AMDGPUISD::CVT_F32_UBYTE3:
6317 return true;
6318
6319 case AMDGPUISD::RCP:
6320 case AMDGPUISD::RSQ:
6321 case AMDGPUISD::RCP_LEGACY:
6322 case AMDGPUISD::RSQ_CLAMP: {
6323 if (SNaN)
6324 return true;
6325
6326 // TODO: Need is known positive check.
6327 return false;
6328 }
6329 case ISD::FLDEXP:
6330 case AMDGPUISD::FRACT: {
6331 if (SNaN)
6332 return true;
6333 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
6334 }
6335 case AMDGPUISD::DIV_SCALE:
6336 case AMDGPUISD::DIV_FMAS:
6337 case AMDGPUISD::DIV_FIXUP:
6338 // TODO: Refine on operands.
6339 return SNaN;
6340 case AMDGPUISD::SIN_HW:
6341 case AMDGPUISD::COS_HW: {
6342 // TODO: Need check for infinity
6343 return SNaN;
6344 }
6346 unsigned IntrinsicID = Op.getConstantOperandVal(0);
6347 // TODO: Handle more intrinsics
6348 switch (IntrinsicID) {
6349 case Intrinsic::amdgcn_cubeid:
6350 case Intrinsic::amdgcn_cvt_off_f32_i4:
6351 return true;
6352
6353 case Intrinsic::amdgcn_frexp_mant: {
6354 if (SNaN)
6355 return true;
6356 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6357 }
6358 case Intrinsic::amdgcn_cvt_pkrtz: {
6359 if (SNaN)
6360 return true;
6361 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6362 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6363 }
6364 case Intrinsic::amdgcn_rcp:
6365 case Intrinsic::amdgcn_rsq:
6366 case Intrinsic::amdgcn_rcp_legacy:
6367 case Intrinsic::amdgcn_rsq_legacy:
6368 case Intrinsic::amdgcn_rsq_clamp:
6369 case Intrinsic::amdgcn_tanh: {
6370 if (SNaN)
6371 return true;
6372
6373 // TODO: Need is known positive check.
6374 return false;
6375 }
6376 case Intrinsic::amdgcn_trig_preop:
6377 case Intrinsic::amdgcn_fdot2:
6378 // TODO: Refine on operand
6379 return SNaN;
6380 case Intrinsic::amdgcn_fma_legacy:
6381 if (SNaN)
6382 return true;
6383 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6384 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
6385 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
6386 default:
6387 return false;
6388 }
6389 }
6390 default:
6391 return false;
6392 }
6393}
6394
6396 Register N0, Register N1) const {
6397 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
6398}
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static LLVM_READONLY bool hasSourceMods(const MachineInstr &MI)
static bool isInv2Pi(const APFloat &APF)
static LLVM_READONLY bool opMustUseVOP3Encoding(const MachineInstr &MI, const MachineRegisterInfo &MRI)
returns true if the operation will definitely need to use a 64-bit encoding, and thus will use a VOP3...
static unsigned inverseMinMax(unsigned Opc)
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static unsigned workitemIntrinsicDim(unsigned ID)
static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, int64_t Offset)
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, uint32_t Width, const SDLoc &DL)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static SDValue getAddOneOp(const SDNode *V)
If V is an add of a constant 1, returns the other operand.
static LLVM_READONLY bool selectSupportsSourceMods(const SDNode *N)
Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the type for ISD::SELECT.
static cl::opt< bool > AMDGPUBypassSlowDiv("amdgpu-bypass-slow-div", cl::desc("Skip 64-bit divide for dynamic 32-bit values"), cl::init(true))
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, SDValue N0, SDValue N1, unsigned Size, bool Signed)
static bool fnegFoldsIntoOp(const SDNode *N)
static bool isI24(SDValue Op, SelectionDAG &DAG)
static bool isCttzOpc(unsigned Opc)
static bool isU24(SDValue Op, SelectionDAG &DAG)
static SDValue peekFPSignOps(SDValue Val)
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, unsigned Op, const SDLoc &SL, SDValue Cond, SDValue N1, SDValue N2)
static SDValue peekFNeg(SDValue Val)
static SDValue simplifyMul24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI)
static bool isCtlzOpc(unsigned Opc)
static LLVM_READNONE bool fnegFoldsIntoOpcode(unsigned Opc)
static bool hasVolatileUser(SDNode *Val)
Interface definition of the TargetLowering class that is common to all AMD GPUs.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
#define X(NUM, ENUM, NAME)
Definition ELF.h:851
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_READNONE
Definition Compiler.h:315
#define LLVM_READONLY
Definition Compiler.h:322
Provides analysis for querying information about KnownBits during GISel passes.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
#define T
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition SHA256.cpp:34
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
Value * RHS
Value * LHS
BinaryOperator * Mul
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
void recordNumNamedBarriers(uint32_t GVAddr, unsigned BarCnt)
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
Generate Min/Max node.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override
Return the type that should be used to zero or sign extend a zeroext/signext integer return value.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem, unsigned AS) const override
Return true if it is expected to be cheaper to do a store of vector constant with the given size and ...
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool shouldCombineMemoryType(EVT VT) const
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isTruncateFree(EVT Src, EVT Dest) const override
bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override
SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const
TargetLowering::NegatibleCost getConstantNegateCost(const ConstantFPSDNode *C) const
SDValue LowerFLOGUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, bool IsLog10, SDNodeFlags Flags) const
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXPUnsafeImpl(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags, bool IsExp10) const
bool isSDNodeAlwaysUniform(const SDNode *N) const override
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
SDValue LowerFLOG10(SDValue Op, SelectionDAG &DAG) const
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const
unsigned computeNumSignBitsForTargetInstr(GISelValueTracking &Analysis, Register R, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isConstantCheaperToNegate(SDValue N) const
bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, Register N1) const override
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue lowerFEXPF64(SDValue Op, SelectionDAG &DAG) const
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
bool isConstantCostlierToNegate(SDValue N) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const
SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
Emit approx-funcs appropriate lowering for exp10.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtType, EVT ExtVT, std::optional< unsigned > ByteOffset) const override
Return true if it is profitable to reduce a load to a smaller type.
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, UndefPoisonKind Kind, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSelectSupported(SelectSupportKind) const override
bool isZExtFree(Type *Src, Type *Dest) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const
SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool mayIgnoreSignedZero(SDValue Op) const
SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const final
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
std::pair< SDValue, SDValue > splitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HighVT, SelectionDAG &DAG) const
Split a vector value into two parts of types LoVT and HiVT.
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
SDValue LowerFLOGCommon(SDValue Op, SelectionDAG &DAG) const
SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, SDValue N) const
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const
bool isFAbsFree(EVT VT) const override
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
bool isInt64ImmLegal(SDNode *Val, SelectionDAG &DAG) const
Check whether value Val can be supported by v_mov_b64, for the current target.
SDValue loadStackInputValue(SelectionDAG &DAG, EVT VT, const SDLoc &SL, int64_t Offset) const
Similar to CreateLiveInRegister, except value may be loaded from a stack slot rather than passed in a ...
SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue LowerCTLS(SDValue Op, SelectionDAG &DAG) const
Split a vector store into multiple scalar stores.
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
static SDValue stripBitcast(SDValue Val)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override
Return a reciprocal estimate value for the input operand.
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
std::pair< SDValue, SDValue > getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, SDValue Op, SDNodeFlags Flags) const
If denormal handling is required return the scaled input to FLOG2, and the check for denormal range.
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &OriginalDemandedBits, const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue lowerCTLZResults(SDValue Op, SelectionDAG &DAG) const
SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const
SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const
SDValue performMulLoHiCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results) const
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
std::pair< EVT, EVT > getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const
Split a vector type into two parts.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue combineFMinMaxLegacyImpl(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
unsigned getVectorIdxWidth(const DataLayout &) const override
Returns the type to be used for the index operand of vector operations.
static const fltSemantics & IEEEsingle()
Definition APFloat.h:296
static const fltSemantics & IEEEdouble()
Definition APFloat.h:297
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
bool bitwiseIsEqual(const APFloat &RHS) const
Definition APFloat.h:1503
opStatus add(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1240
const fltSemantics & getSemantics() const
Definition APFloat.h:1546
opStatus multiply(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1258
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1217
APInt bitcastToAPInt() const
Definition APFloat.h:1430
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1157
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1414
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition APInt.h:207
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:210
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:220
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition APInt.h:1157
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition APInt.h:1411
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
CCState - This class holds information needed while lowering arguments and return values.
static CCValAssign getCustomMem(unsigned ValNo, MVT ValVT, int64_t Offset, MVT LocVT, LocInfo HTP)
const APFloat & getValueAPF() const
bool isNegative() const
Return true if the value is negative.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
Diagnostic information for unsupported feature in backend.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:362
iterator_range< arg_iterator > args()
Definition Function.h:892
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
static auto integer_valuetypes()
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
int getObjectIndexBegin() const
Return the minimum frame object index.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOInvariant
The memory access always returns the same value (or traps).
Flags getFlags() const
Return the raw flags of the source value,.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLVM_ABI bool isLiveIn(Register Reg) const
LLVM_ABI Register getLiveInVirtReg(MCRegister PReg) const
getLiveInVirtReg - If PReg is a live-in physical register, return the corresponding live-in virtual r...
void addLiveIn(MCRegister Reg, Register vreg=Register())
addLiveIn - Add the specified register as a live-in.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
ArrayRef< SDUse > ops() const
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
iterator_range< user_iterator > users()
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
const TargetSubtargetInfo & getSubtarget() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false, SDNodeFlags Flags={})
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
bool isConstantValueOfAnyType(SDValue N) const
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isGuaranteedNotToBeUndefOrPoison(SDValue Op, UndefPoisonKind Kind=UndefPoisonKind::UndefOrPoison, unsigned Depth=0) const
Return true if this function can prove that Op is never poison and, Kind can be used to track poison ...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset=std::nullopt) const
Return true if it is profitable to reduce a load to a smaller type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp to/from int conversion the backend supports.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
SelectSupportKind
Enum that describes what type of support for selects the target has.
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL) const
Returns the type for the shift amount of a shift opcode.
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setSupportsUnalignedAtomics(bool UnalignedSupported)
Sets whether unaligned atomic operations are supported.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
unsigned GatherAllAliasesMaxDepth
Depth that GatherAllAliases should continue looking for chain dependencies when trying to find a more...
NegatibleCost
Enum that specifies when a float negation is beneficial.
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, UndefPoisonKind Kind, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
TargetSubtargetInfo - Generic base class for all target subtargets.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
LLVM Value Representation.
Definition Value.h:75
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
bool isIntrinsicAlwaysUniform(unsigned IntrID)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isUniformMMO(const MachineMemOperand *MMO)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition CallingConv.h:47
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:819
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:600
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:779
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:294
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:522
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:853
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:880
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:584
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:993
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ CTLZ_ZERO_POISON
Definition ISDOpcodes.h:788
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BRIND
BRIND - Indirect branch.
@ BR_JT
BR_JT - Jumptable branch.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:541
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:548
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:796
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:247
@ CTLS
Count leading redundant sign bits.
Definition ISDOpcodes.h:792
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:704
@ STRICT_FP16_TO_FP
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:614
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition ISDOpcodes.h:48
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:811
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:888
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:805
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:926
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:304
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:565
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ CTTZ_ZERO_POISON
Bit counting operators with a poisoned result for zero inputs.
Definition ISDOpcodes.h:787
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:959
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition ISDOpcodes.h:997
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:945
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:534
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
initializer< Ty > init(const Ty &Val)
constexpr double ln2
constexpr double ln10
constexpr float log2ef
Definition MathExtras.h:51
constexpr double log2e
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:557
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
MaybeAlign getAlign(const CallInst &I, unsigned Index)
InstructionCost Cost
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:204
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:263
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
CombineLevel
Definition DAGCombine.h:15
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ BeforeLegalizeTypes
Definition DAGCombine.h:16
@ AfterLegalizeTypes
Definition DAGCombine.h:17
To bit_cast(const From &from) noexcept
Definition bit.h:90
@ Mul
Product of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
UndefPoisonKind
Enumeration to track whether we are interested in Undef, Poison, or both.
Definition UndefPoison.h:20
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
APFloat neg(APFloat X)
Returns the negated value of the argument.
Definition APFloat.h:1666
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
static cl::opt< unsigned > CostThreshold("dfa-cost-threshold", cl::desc("Maximum cost accepted for the transformation"), cl::Hidden, cl::init(50))
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:177
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:876
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
MCRegister getRegister() const
unsigned getStackOffset() const
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:403
EVT getPow2VectorType(LLVMContext &Context) const
Widens the length of the given vector EVT up to the nearest power of 2 and returns that type.
Definition ValueTypes.h:493
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:70
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:129
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:292
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:155
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:479
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:381
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:251
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:393
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition ValueTypes.h:438
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:486
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:420
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:324
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:61
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:389
EVT getRoundIntegerType(LLVMContext &Context) const
Rounds the bit-width of the given integer EVT up to the nearest power of two (and at least to eight),...
Definition ValueTypes.h:427
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:176
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:331
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition ValueTypes.h:300
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:336
bool isExtended() const
Test if the given EVT is extended (as opposed to being simple).
Definition ValueTypes.h:150
EVT changeElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:121
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:344
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition ValueTypes.h:316
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:160
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:64
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:165
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:176
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:72
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:310
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:184
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:262
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:146
APInt getMinValue() const
Return the minimal unsigned value possible given these KnownBits.
Definition KnownBits.h:130
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:103
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
These are IR-level optimization flags that may be propagated to SDNodes.
void setAllowContract(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
DenormalMode FP32Denormals
If this is set, neither input or output denormals are flushed for most f32 instructions.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...