LLVM 23.0.0git
AMDGPUISelLowering.cpp
Go to the documentation of this file.
1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
30
31using namespace llvm;
32
33#include "AMDGPUGenCallingConv.inc"
34
36 "amdgpu-bypass-slow-div",
37 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
38 cl::init(true));
39
40// Find a larger type to do a load / store of a vector with.
42 unsigned StoreSize = VT.getStoreSizeInBits();
43 if (StoreSize <= 32)
44 return EVT::getIntegerVT(Ctx, StoreSize);
45
46 if (StoreSize % 32 == 0)
47 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
48
49 return VT;
50}
51
55
57 // In order for this to be a signed 24-bit value, bit 23 must
58 // be a sign bit.
59 return DAG.ComputeMaxSignificantBits(Op);
60}
61
63 const TargetSubtargetInfo &STI,
64 const AMDGPUSubtarget &AMDGPUSTI)
65 : TargetLowering(TM, STI), Subtarget(&AMDGPUSTI) {
66 // Always lower memset, memcpy, and memmove intrinsics to load/store
67 // instructions, rather than generating calls to memset, memcpy or memmove.
71
72 // Enable ganging up loads and stores in the memcpy DAG lowering.
74
75 // Lower floating point store/load to integer store/load to reduce the number
76 // of patterns in tablegen.
78 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
79
81 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
82
84 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
85
87 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
88
90 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
91
93 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
94
96 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
97
99 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
100
102 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
103
104 setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
105 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
106
107 setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
108 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
109
110 setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
111 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
112
113 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
114 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
115
116 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
117 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
118
120 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
121
123 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
124
126 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
127
129 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
130
132 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
133
135 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
136
138 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
139
141 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
142
144 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
145
147 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
148
149 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
150 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
151
152 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
153 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
154
156 AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
157
158 // TODO: Would be better to consume as directly legal
160 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
161
163 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
164
166 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
167
169 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);
170
172 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::v2f32, MVT::i64);
173
175 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);
176
178 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);
179
181 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);
182
184 AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);
185
187 AddPromotedToType(ISD::ATOMIC_STORE, MVT::v2f32, MVT::i64);
188
189 // There are no 64-bit extloads. These should be done as a 32-bit extload and
190 // an extension to 64-bit.
191 for (MVT VT : MVT::integer_valuetypes())
193 Expand);
194
195 for (MVT VT : MVT::integer_valuetypes()) {
196 if (VT == MVT::i64)
197 continue;
198
199 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
200 setLoadExtAction(Op, VT, MVT::i1, Promote);
201 setLoadExtAction(Op, VT, MVT::i8, Legal);
202 setLoadExtAction(Op, VT, MVT::i16, Legal);
203 setLoadExtAction(Op, VT, MVT::i32, Expand);
204 }
205 }
206
208 for (auto MemVT :
209 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
211 Expand);
212
213 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
214 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
215 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
216 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
217 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
218 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
219 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
220 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
221 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
222 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
223 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
224 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
225 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
226 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);
227
228 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
229 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
230 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
231 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
232 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
233 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
234
235 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
236 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
237 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
238 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
239 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
240 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
241 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
242 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
243 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
244 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
245 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
246 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);
247
249 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
250
252 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
253
255 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
256
258 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
259
261 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
262
264 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
265
267 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
268
270 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
271
273 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
274
276 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
277
279 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
280
282 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
283
285 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
286
288 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
289
291 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
292
294 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
295
297 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
298
300 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
301
303 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
304
306 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
307
309 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
310
312 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
313
315 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
316
318 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
319
321 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
322
324 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
325
327 AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);
328
329 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
330 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
331 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
332 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
333
334 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
335 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
336 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
337 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
338
339 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
340 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
341 setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
342 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
343 setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
344 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
345 setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
346 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
347 setTruncStoreAction(MVT::v6f32, MVT::v6f16, Expand);
348 setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
349 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
350 setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
351 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
352 setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
353 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
354
355 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
356 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
357 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
358
359 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
360 setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
361 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
362
363 setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);
364
365 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
366 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
367 setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
368 setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
369 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
370 setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
371 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
372
373 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
374 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
375 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
376 setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
377 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
378
379 setTruncStoreAction(MVT::v5i32, MVT::v5i1, Expand);
380 setTruncStoreAction(MVT::v5i32, MVT::v5i8, Expand);
381 setTruncStoreAction(MVT::v5i32, MVT::v5i16, Expand);
382
383 setTruncStoreAction(MVT::v6i32, MVT::v6i1, Expand);
384 setTruncStoreAction(MVT::v6i32, MVT::v6i8, Expand);
385 setTruncStoreAction(MVT::v6i32, MVT::v6i16, Expand);
386
387 setTruncStoreAction(MVT::v7i32, MVT::v7i1, Expand);
388 setTruncStoreAction(MVT::v7i32, MVT::v7i8, Expand);
389 setTruncStoreAction(MVT::v7i32, MVT::v7i16, Expand);
390
391 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
392 setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
393 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
394
395 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
396 setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
397 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
398 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
399 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
400 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
401 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
402
403 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
404 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
405
407
408 // For R600, this is totally unsupported, just custom lower to produce an
409 // error.
411
412 // Library functions. These default to Expand, but we have instructions
413 // for them.
416 {MVT::f16, MVT::f32}, Legal);
418
420 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
422 {MVT::f16, MVT::f32, MVT::f64}, Expand);
423
426 Custom);
428
429 setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
430
431 setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
432
433 setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
434 Expand);
435
436 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Expand);
437 setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
439
441 Custom);
442
443 setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
444
445 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
446 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
447 // default unless marked custom/legal.
449 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
450 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
451 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
452 MVT::v16f64},
453 Custom);
454
455 // Expand to fneg + fadd.
457
459 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
460 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
461 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
462 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
463 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
464 Custom);
465
468 {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
469 MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
470 MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
471 MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
472 MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
473 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
474 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
475 Custom);
476
478 Expand);
479 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
480
481 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
482 for (MVT VT : ScalarIntVTs) {
483 // These should use [SU]DIVREM, so set them to expand
485 Expand);
486
487 // GPU does not have divrem function for signed or unsigned.
489
490 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
492
494
495 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
497 }
498
499 // The hardware supports 32-bit FSHR, but not FSHL.
501
502 setOperationAction({ISD::ROTL, ISD::ROTR}, {MVT::i32, MVT::i64}, Expand);
503
505
510 MVT::i64, Custom);
512
514 Legal);
515
518 MVT::i64, Custom);
519
520 for (auto VT : {MVT::i8, MVT::i16})
522
523 static const MVT::SimpleValueType VectorIntTypes[] = {
524 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
525 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
526
527 for (MVT VT : VectorIntTypes) {
528 // Expand the following operations for the current type by default.
529 // clang-format off
549 VT, Expand);
550 // clang-format on
551 }
552
553 static const MVT::SimpleValueType FloatVectorTypes[] = {
554 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
555 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
556
557 for (MVT VT : FloatVectorTypes) {
570 VT, Expand);
571 }
572
573 // This causes using an unrolled select operation rather than expansion with
574 // bit operations. This is in general better, but the alternative using BFI
575 // instructions may be better if the select sources are SGPRs.
577 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
578
580 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
581
583 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
584
586 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
587
589 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
590
592 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
593
595 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
596
598 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
599
601 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
602
604 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
605
607 setJumpIsExpensive(true);
608
611
613
614 // We want to find all load dependencies for long chains of stores to enable
615 // merging into very wide vectors. The problem is with vectors with > 4
616 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
617 // vectors are a legal type, even though we have to split the loads
618 // usually. When we can more precisely specify load legality per address
619 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
620 // smarter so that they can figure out what to do in 2 iterations without all
621 // N > 4 stores on the same chain.
623
624 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
625 // about these during lowering.
626 MaxStoresPerMemcpy = 0xffffffff;
627 MaxStoresPerMemmove = 0xffffffff;
628 MaxStoresPerMemset = 0xffffffff;
629
630 // The expansion for 64-bit division is enormous.
632 addBypassSlowDiv(64, 32);
633
644
648}
649
651 const auto Flags = Op.getNode()->getFlags();
652 if (Flags.hasNoSignedZeros())
653 return true;
654
655 return false;
656}
657
658//===----------------------------------------------------------------------===//
659// Target Information
660//===----------------------------------------------------------------------===//
661
/// Opcode-only classifier used by fnegFoldsIntoOp(): returns true for the
/// generic floating-point opcodes, ISD::SELECT and the AMDGPUISD opcodes
/// enumerated in the switch below, and false for every other opcode.
///
/// ISD::BITCAST must never be passed here. fnegFoldsIntoOp() handles it before
/// delegating, and reaching that case hits llvm_unreachable.
///
/// NOTE(review): the listing numbers embedded in these lines skip 672-673 and
/// 684, so this copy may be missing case labels -- compare against upstream.
663static bool fnegFoldsIntoOpcode(unsigned Opc) {
664 switch (Opc) {
// Generic ISD opcodes that are accepted.
665 case ISD::FADD:
666 case ISD::FSUB:
667 case ISD::FMUL:
668 case ISD::FMA:
669 case ISD::FMAD:
670 case ISD::FMINNUM:
671 case ISD::FMAXNUM:
674 case ISD::FMINIMUM:
675 case ISD::FMAXIMUM:
676 case ISD::FMINIMUMNUM:
677 case ISD::FMAXIMUMNUM:
678 case ISD::SELECT:
679 case ISD::FSIN:
680 case ISD::FTRUNC:
681 case ISD::FRINT:
682 case ISD::FNEARBYINT:
683 case ISD::FROUNDEVEN:
// Target-specific AMDGPUISD opcodes that are accepted.
685 case AMDGPUISD::RCP:
686 case AMDGPUISD::RCP_LEGACY:
687 case AMDGPUISD::RCP_IFLAG:
688 case AMDGPUISD::SIN_HW:
689 case AMDGPUISD::FMUL_LEGACY:
690 case AMDGPUISD::FMIN_LEGACY:
691 case AMDGPUISD::FMAX_LEGACY:
692 case AMDGPUISD::FMED3:
693 // TODO: handle llvm.amdgcn.fma.legacy
694 return true;
// Bitcasts need to inspect the cast source, which an opcode alone cannot do.
695 case ISD::BITCAST:
696 llvm_unreachable("bitcast is special cased");
697 default:
698 return false;
699 }
700}
701
702static bool fnegFoldsIntoOp(const SDNode *N) {
703 unsigned Opc = N->getOpcode();
704 if (Opc == ISD::BITCAST) {
705 // TODO: Is there a benefit to checking the conditions performFNegCombine
706 // does? We don't for the other cases.
707 SDValue BCSrc = N->getOperand(0);
708 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
709 return BCSrc.getNumOperands() == 2 &&
710 BCSrc.getOperand(1).getValueSizeInBits() == 32;
711 }
712
713 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
714 }
715
716 return fnegFoldsIntoOpcode(Opc);
717}
718
719/// \p returns true if the operation will definitely need to use a 64-bit
720/// encoding, and thus will use a VOP3 encoding regardless of the source
721/// modifiers.
723static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
724 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
725 VT == MVT::f64;
726}
727
728/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the
729/// type for ISD::SELECT.
731static bool selectSupportsSourceMods(const SDNode *N) {
732 // TODO: Only applies if select will be vector
733 return N->getValueType(0) == MVT::f32;
734}
735
736// Most FP instructions support source modifiers, but this could be refined
737// slightly.
739static bool hasSourceMods(const SDNode *N) {
740 if (isa<MemSDNode>(N))
741 return false;
742
743 switch (N->getOpcode()) {
744 case ISD::CopyToReg:
745 case ISD::FDIV:
746 case ISD::FREM:
747 case ISD::INLINEASM:
749 case AMDGPUISD::DIV_SCALE:
751
752 // TODO: Should really be looking at the users of the bitcast. These are
753 // problematic because bitcasts are used to legalize all stores to integer
754 // types.
755 case ISD::BITCAST:
756 return false;
758 switch (N->getConstantOperandVal(0)) {
759 case Intrinsic::amdgcn_interp_p1:
760 case Intrinsic::amdgcn_interp_p2:
761 case Intrinsic::amdgcn_interp_mov:
762 case Intrinsic::amdgcn_interp_p1_f16:
763 case Intrinsic::amdgcn_interp_p2_f16:
764 return false;
765 default:
766 return true;
767 }
768 }
769 case ISD::SELECT:
771 default:
772 return true;
773 }
774}
775
777 unsigned CostThreshold) {
778 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
779 // it is truly free to use a source modifier in all cases. If there are
780 // multiple users but for each one will necessitate using VOP3, there will be
781 // a code size increase. Try to avoid increasing code size unless we know it
782 // will save on the instruction count.
783 unsigned NumMayIncreaseSize = 0;
784 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
785
786 assert(!N->use_empty());
787
788 // XXX - Should this limit number of uses to check?
789 for (const SDNode *U : N->users()) {
790 if (!hasSourceMods(U))
791 return false;
792
793 if (!opMustUseVOP3Encoding(U, VT)) {
794 if (++NumMayIncreaseSize > CostThreshold)
795 return false;
796 }
797 }
798
799 return true;
800}
801
803 ISD::NodeType ExtendKind) const {
804 assert(!VT.isVector() && "only scalar expected");
805
806 // Round to the next multiple of 32-bits.
807 unsigned Size = VT.getSizeInBits();
808 if (Size <= 32)
809 return MVT::i32;
810 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
811}
812
814 return 32;
815}
816
818 return true;
819}
820
821// The backend supports 32 and 64 bit floating point immediates.
822// FIXME: Why are we reporting vectors of FP immediates as legal?
824 bool ForCodeSize) const {
825 return isTypeLegal(VT.getScalarType());
826}
827
828// We don't want to shrink f64 / f32 constants.
830 EVT ScalarVT = VT.getScalarType();
831 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
832}
833
835 SDNode *N, ISD::LoadExtType ExtTy, EVT NewVT,
836 std::optional<unsigned> ByteOffset) const {
837 // TODO: This may be worth removing. Check regression tests for diffs.
838 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT, ByteOffset))
839 return false;
840
841 unsigned NewSize = NewVT.getStoreSizeInBits();
842
843 // If we are reducing to a 32-bit load or a smaller multi-dword load,
844 // this is always better.
845 if (NewSize >= 32)
846 return true;
847
848 EVT OldVT = N->getValueType(0);
849 unsigned OldSize = OldVT.getStoreSizeInBits();
850
852 unsigned AS = MN->getAddressSpace();
853 // Do not shrink an aligned scalar load to sub-dword.
854 // Scalar engine cannot do sub-dword loads.
855 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
856 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
860 MN->isInvariant())) &&
862 return false;
863
864 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
865 // extloads, so doing one requires using a buffer_load. In cases where we
866 // still couldn't use a scalar load, using the wider load shouldn't really
867 // hurt anything.
868
869 // If the old size already had to be an extload, there's no harm in continuing
870 // to reduce the width.
871 return (OldSize < 32);
872}
873
875 const SelectionDAG &DAG,
876 const MachineMemOperand &MMO) const {
877
878 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
879
880 if (LoadTy.getScalarType() == MVT::i32)
881 return false;
882
883 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
884 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
885
886 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
887 return false;
888
889 unsigned Fast = 0;
891 CastTy, MMO, &Fast) &&
892 Fast;
893}
894
895// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
896// profitable with the expansion for 64-bit since it's generally good to
897// speculate things.
899 return true;
900}
901
903 return true;
904}
905
907 switch (N->getOpcode()) {
908 case ISD::EntryToken:
909 case ISD::TokenFactor:
910 return true;
912 unsigned IntrID = N->getConstantOperandVal(0);
914 }
916 unsigned IntrID = N->getConstantOperandVal(1);
918 }
919 case ISD::LOAD:
920 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
922 return true;
923 return false;
924 case AMDGPUISD::SETCC: // ballot-style instruction
925 return true;
926 }
927 return false;
928}
929
931 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
932 NegatibleCost &Cost, unsigned Depth) const {
933
934 switch (Op.getOpcode()) {
935 case ISD::FMA:
936 case ISD::FMAD: {
937 // Negating a fma is not free if it has users without source mods.
938 if (!allUsesHaveSourceMods(Op.getNode()))
939 return SDValue();
940 break;
941 }
942 case AMDGPUISD::RCP: {
943 SDValue Src = Op.getOperand(0);
944 EVT VT = Op.getValueType();
945 SDLoc SL(Op);
946
947 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
948 ForCodeSize, Cost, Depth + 1);
949 if (NegSrc)
950 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
951 return SDValue();
952 }
953 default:
954 break;
955 }
956
957 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
958 ForCodeSize, Cost, Depth);
959}
960
961//===---------------------------------------------------------------------===//
962// Target Properties
963//===---------------------------------------------------------------------===//
964
967
968 // Packed operations do not have a fabs modifier.
969 // Report this based on the end legalized type.
970 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
971}
972
975 // Report this based on the end legalized type.
976 VT = VT.getScalarType();
977 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
978}
979
981 unsigned NumElem,
982 unsigned AS) const {
983 return true;
984}
985
987 // There are few operations which truly have vector input operands. Any vector
988 // operation is going to involve operations on each component, and a
989 // build_vector will be a copy per element, so it always makes sense to use a
990 // build_vector input in place of the extracted element to avoid a copy into a
991 // super register.
992 //
993 // We should probably only do this if all users are extracts only, but this
994 // should be the common case.
995 return true;
996}
997
999 // Truncate is just accessing a subregister.
1000
1001 unsigned SrcSize = Source.getSizeInBits();
1002 unsigned DestSize = Dest.getSizeInBits();
1003
1004 return DestSize < SrcSize && DestSize % 32 == 0 ;
1005}
1006
1008 // Truncate is just accessing a subregister.
1009
1010 unsigned SrcSize = Source->getScalarSizeInBits();
1011 unsigned DestSize = Dest->getScalarSizeInBits();
1012
1013 if (DestSize== 16 && Subtarget->has16BitInsts())
1014 return SrcSize >= 32;
1015
1016 return DestSize < SrcSize && DestSize % 32 == 0;
1017}
1018
1020 unsigned SrcSize = Src->getScalarSizeInBits();
1021 unsigned DestSize = Dest->getScalarSizeInBits();
1022
1023 if (SrcSize == 16 && Subtarget->has16BitInsts())
1024 return DestSize >= 32;
1025
1026 return SrcSize == 32 && DestSize == 64;
1027}
1028
1030 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
1031 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
1032 // this will enable reducing 64-bit operations to 32-bit, which is always
1033 // good.
1034
1035 if (Src == MVT::i16)
1036 return Dest == MVT::i32 ||Dest == MVT::i64 ;
1037
1038 return Src == MVT::i32 && Dest == MVT::i64;
1039}
1040
1042 EVT DestVT) const {
1043 switch (N->getOpcode()) {
1044 case ISD::ADD:
1045 case ISD::SUB:
1046 case ISD::SHL:
1047 case ISD::SRL:
1048 case ISD::SRA:
1049 case ISD::AND:
1050 case ISD::OR:
1051 case ISD::XOR:
1052 case ISD::MUL:
1053 case ISD::SETCC:
1054 case ISD::SELECT:
1055 case ISD::SMIN:
1056 case ISD::SMAX:
1057 case ISD::UMIN:
1058 case ISD::UMAX:
1059 if (isTypeLegal(MVT::i16) &&
1060 (!DestVT.isVector() ||
1061 !isOperationLegal(ISD::ADD, MVT::v2i16))) { // Check if VOP3P
1062 // Don't narrow back down to i16 if promoted to i32 already.
1063 if (!N->isDivergent() && DestVT.isInteger() &&
1064 DestVT.getScalarSizeInBits() > 1 &&
1065 DestVT.getScalarSizeInBits() <= 16 &&
1066 SrcVT.getScalarSizeInBits() > 16) {
1067 return false;
1068 }
1069 }
1070 return true;
1071 default:
1072 break;
1073 }
1074
1075 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
1076 // limited number of native 64-bit operations. Shrinking an operation to fit
1077 // in a single 32-bit register should always be helpful. As currently used,
1078 // this is much less general than the name suggests, and is only used in
1079 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
1080 // not profitable, and may actually be harmful.
1081 if (isa<LoadSDNode>(N))
1082 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
1083
1084 return true;
1085}
1086
1088 const SDNode* N, CombineLevel Level) const {
1089 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
1090 N->getOpcode() == ISD::SRL) &&
1091 "Expected shift op");
1092
1093 SDValue ShiftLHS = N->getOperand(0);
1094 if (!ShiftLHS->hasOneUse())
1095 return false;
1096
1097 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
1098 !ShiftLHS.getOperand(0)->hasOneUse())
1099 return false;
1100
1101 // Always commute pre-type legalization and right shifts.
1102 // We're looking for shl(or(x,y),z) patterns.
1104 N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
1105 return true;
1106
1107 // If the only user is an i32 right-shift, then don't destroy a BFE pattern.
1108 if (N->getValueType(0) == MVT::i32 && N->hasOneUse() &&
1109 (N->user_begin()->getOpcode() == ISD::SRA ||
1110 N->user_begin()->getOpcode() == ISD::SRL))
1111 return false;
1112
1113 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
1114 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
1115 if (LHS.getOpcode() != ISD::SHL)
1116 return false;
1117 auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
1118 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
1119 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
1120 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
1121 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
1122 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
1123 };
1124 SDValue LHS = N->getOperand(0).getOperand(0);
1125 SDValue RHS = N->getOperand(0).getOperand(1);
1126 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
1127}
1128
1129//===---------------------------------------------------------------------===//
1130// TargetLowering Callbacks
1131//===---------------------------------------------------------------------===//
1132
1134 bool IsVarArg) {
// Picks the argument-assignment function for calling convention CC. A
// convention that reaches the default case is rejected with a fatal usage
// error.
// NOTE(review): the signature line and several case labels are missing from
// this listing; only C, Fast and Cold are visibly mapped (to CC_AMDGPU_Func).
1135 switch (CC) {
1143 return CC_AMDGPU;
1146 return CC_AMDGPU_CS_CHAIN;
1147 case CallingConv::C:
1148 case CallingConv::Fast:
1149 case CallingConv::Cold:
1150 return CC_AMDGPU_Func;
1153 return CC_SI_Gfx;
1156 default:
1157 reportFatalUsageError("unsupported calling convention for call");
1158 }
1159}
1160
1162 bool IsVarArg) {
// Picks the return-value assignment function for calling convention CC. A
// convention that reaches the default case is rejected with a fatal usage
// error.
// NOTE(review): the signature line and several case labels are missing from
// this listing; only C, Fast and Cold are visibly mapped (to
// RetCC_AMDGPU_Func).
1163 switch (CC) {
1166 llvm_unreachable("kernels should not be handled here");
1176 return RetCC_SI_Shader;
1179 return RetCC_SI_Gfx;
1180 case CallingConv::C:
1181 case CallingConv::Fast:
1182 case CallingConv::Cold:
1183 return RetCC_AMDGPU_Func;
1184 default:
1185 reportFatalUsageError("unsupported calling convention");
1186 }
1187}
1188
1189 /// The SelectionDAGBuilder will automatically promote function arguments
1190 /// with illegal types. However, this does not work for the AMDGPU targets
1191 /// since the function arguments are stored in memory as these illegal types.
1192 /// In order to handle this properly we need to get the original type sizes
1193 /// from the LLVM IR Function and fix up the ISD::InputArg values before
1194 /// passing them to AnalyzeFormalArguments().
1195
1196 /// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1197 /// input values across multiple registers. Each item in the Ins array
1198 /// represents a single value that will be stored in registers. Ins[x].VT is
1199 /// the value type of the value that will be stored in the register, so
1200 /// whatever SDNode we lower the argument to needs to be this type.
1201 ///
1202 /// In order to correctly lower the arguments we need to know the size of each
1203 /// argument. Since Ins[x].VT gives us the size of the register that will
1204 /// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1205 /// for the original function argument so that we can deduce the correct memory
1206 /// type to use for Ins[x]. In most cases the correct memory type will be
1207 /// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1208 /// we have a kernel argument of type v8i8, this argument will be split into
1209 /// 8 parts and each part will be represented by its own item in the Ins array.
1210 /// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1211 /// the argument before it was split. From this, we deduce that the memory type
1212 /// for each individual part is i8. We pass the memory type as LocVT to the
1213 /// calling convention analysis function and the register type (Ins[x].VT) as
1214 /// the ValVT.
1216 CCState &State,
1217 const SmallVectorImpl<ISD::InputArg> &Ins) const {
// NOTE(review): the signature line, the definition of CC and the declaration
// of Offsets are on lines missing from this listing.
1218 const MachineFunction &MF = State.getMachineFunction();
1219 const Function &Fn = MF.getFunction();
1220 LLVMContext &Ctx = Fn.getContext();
1221 const unsigned ExplicitOffset = Subtarget->getExplicitKernelArgOffset();
1223
1224 Align MaxAlign = Align(1);
1225 uint64_t ExplicitArgOffset = 0;
1226 const DataLayout &DL = Fn.getDataLayout();
1227
// Running index into the register parts; incremented once per location added.
1228 unsigned InIndex = 0;
1229
1230 for (const Argument &Arg : Fn.args()) {
// For byref arguments the in-memory type and alignment come from the byref
// attribute rather than from the IR type of the argument.
1231 const bool IsByRef = Arg.hasByRefAttr();
1232 Type *BaseArgTy = Arg.getType();
1233 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1234 Align Alignment = DL.getValueOrABITypeAlignment(
1235 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1236 MaxAlign = std::max(Alignment, MaxAlign);
1237 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1238
// ArgOffset is this argument's aligned offset plus the ExplicitOffset base;
// ExplicitArgOffset advances to the end of this argument, without that base.
1239 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1240 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1241
1242 // We're basically throwing away everything passed into us and starting over
1243 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1244 // to us as computed in Ins.
1245 //
1246 // We also need to figure out what type legalization is trying to do to get
1247 // the correct memory offsets.
1248
1249 SmallVector<EVT, 16> ValueVTs;
1251 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, /*MemVTs=*/nullptr,
1252 &Offsets, ArgOffset);
1253
1254 for (unsigned Value = 0, NumValues = ValueVTs.size();
1255 Value != NumValues; ++Value) {
1256 uint64_t BasePartOffset = Offsets[Value];
1257
1258 EVT ArgVT = ValueVTs[Value];
1259 EVT MemVT = ArgVT;
1260 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1261 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1262
// Deduce the per-part memory type (MemVT) from how ArgVT was split into
// NumRegs parts of type RegisterVT.
1263 if (NumRegs == 1) {
1264 // This argument is not split, so the IR type is the memory type.
1265 if (ArgVT.isExtended()) {
1266 // We have an extended type, like i24, so we should just use the
1267 // register type.
1268 MemVT = RegisterVT;
1269 } else {
1270 MemVT = ArgVT;
1271 }
1272 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1273 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1274 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1275 // We have a vector value which has been split into a vector with
1276 // the same scalar type, but fewer elements. This should handle
1277 // all the floating-point vector types.
1278 MemVT = RegisterVT;
1279 } else if (ArgVT.isVector() &&
1280 ArgVT.getVectorNumElements() == NumRegs) {
1281 // This arg has been split so that each element is stored in a separate
1282 // register.
1283 MemVT = ArgVT.getScalarType();
1284 } else if (ArgVT.isExtended()) {
1285 // We have an extended type, like i65.
1286 MemVT = RegisterVT;
1287 } else {
// Remaining case: divide the store size evenly across the parts.
1288 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1289 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1290 if (RegisterVT.isInteger()) {
1291 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1292 } else if (RegisterVT.isVector()) {
1293 assert(!RegisterVT.getScalarType().isFloatingPoint());
1294 unsigned NumElements = RegisterVT.getVectorNumElements();
1295 assert(MemoryBits % NumElements == 0);
1296 // This vector type has been split into another vector type with
1297 // a different elements size.
1298 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1299 MemoryBits / NumElements);
1300 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1301 } else {
1302 llvm_unreachable("cannot deduce memory type.");
1303 }
1304 }
1305
1306 // Convert one element vectors to scalar.
1307 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1308 MemVT = MemVT.getScalarType();
1309
1310 // Round up vec3/vec5 argument.
1311 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1312 MemVT = MemVT.getPow2VectorType(State.getContext());
1313 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1314 MemVT = MemVT.getRoundIntegerType(State.getContext());
1315 }
1316
// Record one custom memory location per register part. Parts are laid out
// back to back, MemVT.getStoreSize() bytes apart, from BasePartOffset.
1317 unsigned PartOffset = 0;
1318 for (unsigned i = 0; i != NumRegs; ++i) {
1319 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1320 BasePartOffset + PartOffset,
1321 MemVT.getSimpleVT(),
1323 PartOffset += MemVT.getStoreSize();
1324 }
1325 }
1326 }
1327}
1328
1330 SDValue Chain, CallingConv::ID CallConv,
1331 bool isVarArg,
1333 const SmallVectorImpl<SDValue> &OutVals,
1334 const SDLoc &DL, SelectionDAG &DAG) const {
// Emits an AMDGPUISD::ENDPGM node chained to Chain and returns it. CallConv,
// isVarArg and OutVals are not consulted. The disabled assert below records
// the intended precondition that there are no return values.
1335 // FIXME: Fails for r600 tests
1336 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1337 // "wave terminate should not have return values");
1338 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1339}
1340
1341//===---------------------------------------------------------------------===//
1342// Target specific lowering
1343//===---------------------------------------------------------------------===//
1344
1345/// Selects the correct CCAssignFn for a given CallingConvention value.
1350
1355
1357 SelectionDAG &DAG,
1358 MachineFrameInfo &MFI,
1359 int ClobberedFI) const {
// Returns a TokenFactor combining Chain with the output chain of every load
// that hangs off the entry node, reads a negative-index frame object, and
// overlaps the byte range of frame object ClobberedFI.
1360 SmallVector<SDValue, 8> ArgChains;
// Inclusive byte range [FirstByte, LastByte] of the clobbered object.
1361 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1362 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1363
1364 // Include the original chain at the beginning of the list. When this is
1365 // used by target LowerCall hooks, this helps the legalizer find the
1366 // CALLSEQ_BEGIN node.
1367 ArgChains.push_back(Chain);
1368
1369 // Add a chain value for each stack argument load that overlaps the
// clobbered object.
1370 for (SDNode *U : DAG.getEntryNode().getNode()->users()) {
1371 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1372 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
// Only negative frame indices are considered.
1373 if (FI->getIndex() < 0) {
1374 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1375 int64_t InLastByte = InFirstByte;
1376 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1377
// The ranges overlap if either one starts inside the other.
1378 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1379 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1380 ArgChains.push_back(SDValue(L, 1));
1381 }
1382 }
1383 }
1384 }
1385
1386 // Build a tokenfactor for all the chains.
1387 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1388}
1389
1392 StringRef Reason) const {
// Reports a call that cannot be lowered: emits a DiagnosticInfoUnsupported
// whose message is Reason followed by the callee name, then returns a chain
// so that lowering can continue.
// NOTE(review): the signature line and the first `if` of the callee-name
// lookup are missing from this listing.
1393 SDValue Callee = CLI.Callee;
1394 SelectionDAG &DAG = CLI.DAG;
1395
1396 const Function &Fn = DAG.getMachineFunction().getFunction();
1397
// Fallback name used when the callee is neither of the node kinds below.
1398 StringRef FuncName("<unknown>");
1399
1401 FuncName = G->getSymbol();
1402 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1403 FuncName = G->getGlobal()->getName();
1404
1405 DAG.getContext()->diagnose(
1406 DiagnosticInfoUnsupported(Fn, Reason + FuncName, CLI.DL.getDebugLoc()));
1407
// For non-tail calls, supply a poison value of the right type for every
// expected result.
1408 if (!CLI.IsTailCall) {
1409 for (ISD::InputArg &Arg : CLI.Ins)
1410 InVals.push_back(DAG.getPOISON(Arg.VT));
1411 }
1412
1413 // FIXME: Hack because R600 doesn't handle callseq pseudos yet.
1414 if (getTargetMachine().getTargetTriple().getArch() == Triple::r600)
1415 return CLI.Chain;
1416
// Otherwise return an empty (zero-sized) CALLSEQ_START/CALLSEQ_END pair.
1417 SDValue Chain = DAG.getCALLSEQ_START(CLI.Chain, 0, 0, CLI.DL);
1418 return DAG.getCALLSEQ_END(Chain, 0, 0, /*InGlue=*/SDValue(), CLI.DL);
1419}
1420
1422 SmallVectorImpl<SDValue> &InVals) const {
// Every call reaching this hook is reported as unsupported via
// lowerUnhandledCall.
1423 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1424}
1425
1427 SelectionDAG &DAG) const {
// Handles an unsupported dynamic alloca. The arguments below carry the message
// "unsupported dynamic alloca"; the call that consumes them is on a line
// missing from this listing (presumably a diagnostic emission; confirm).
// The node is replaced by a zero constant of the result type merged with
// operand 0 (presumably the incoming chain; confirm).
1428 const Function &Fn = DAG.getMachineFunction().getFunction();
1429
1431 Fn, "unsupported dynamic alloca", SDLoc(Op).getDebugLoc()));
1432 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1433 return DAG.getMergeValues(Ops, SDLoc());
1434}
1435
1437 SelectionDAG &DAG) const {
// Dispatches custom lowering on the opcode of Op to the matching Lower*/lower*
// helper. An opcode with no case prints the node to errs() and hits
// llvm_unreachable.
// NOTE(review): the signature line and several case labels are missing from
// this listing.
1438 switch (Op.getOpcode()) {
1439 default:
1440 Op->print(errs(), &DAG);
1441 llvm_unreachable("Custom lowering code for this "
1442 "instruction is not implemented yet!");
1443 break;
1445 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1447 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1448 case ISD::SDIVREM:
1449 return LowerSDIVREM(Op, DAG);
1450 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1451 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1452 case ISD::FRINT: return LowerFRINT(Op, DAG);
1453 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1454 case ISD::FROUNDEVEN:
1455 return LowerFROUNDEVEN(Op, DAG);
1456 case ISD::FROUND: return LowerFROUND(Op, DAG);
1457 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1458 case ISD::FLOG2:
1459 return LowerFLOG2(Op, DAG);
// FLOG and FLOG10 share one helper, as do FEXP and FEXP10 below.
1460 case ISD::FLOG:
1461 case ISD::FLOG10:
1462 return LowerFLOGCommon(Op, DAG);
1463 case ISD::FEXP:
1464 case ISD::FEXP10:
1465 return lowerFEXP(Op, DAG);
1466 case ISD::FEXP2:
1467 return lowerFEXP2(Op, DAG);
1468 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1469 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1470 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1471 case ISD::FP_TO_SINT:
1472 case ISD::FP_TO_UINT:
1473 return LowerFP_TO_INT(Op, DAG);
1476 return LowerFP_TO_INT_SAT(Op, DAG);
1477 case ISD::CTTZ:
1479 case ISD::CTLZ:
1481 return LowerCTLZ_CTTZ(Op, DAG);
1482 case ISD::CTLS:
1483 return LowerCTLS(Op, DAG);
1485 }
1486 return Op;
1487}
1488
1491 SelectionDAG &DAG) const {
// For the opcodes handled below, runs the matching lowering helper on result 0
// of N and appends the lowered value to Results if the helper returned one.
// Any other opcode leaves Results untouched.
// NOTE(review): the signature line and some case labels are missing from this
// listing.
1492 switch (N->getOpcode()) {
1494 // Different parts of legalization seem to interpret which type of
1495 // sign_extend_inreg is the one to check for custom lowering. The extended
1496 // from type is what really matters, but some places check for custom
1497 // lowering of the result type. This results in trying to use
1498 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1499 // nothing here and let the illegal result integer be handled normally.
1500 return;
1501 case ISD::FLOG2:
1502 if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
1503 Results.push_back(Lowered);
1504 return;
1505 case ISD::FLOG:
1506 case ISD::FLOG10:
1507 if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
1508 Results.push_back(Lowered);
1509 return;
1510 case ISD::FEXP2:
1511 if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
1512 Results.push_back(Lowered);
1513 return;
1514 case ISD::FEXP:
1515 case ISD::FEXP10:
1516 if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
1517 Results.push_back(Lowered);
1518 return;
1519 case ISD::CTLZ:
1521 if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
1522 Results.push_back(Lowered);
1523 return;
1524 default:
1525 return;
1526 }
1527}
1528
1530 SelectionDAG &DAG) const {
// Rebuilds the node as a target block address of the same value type, carrying
// over BA's block address, offset and target flags.
// NOTE(review): the signature line and the definition of BA are missing from
// this listing. SL is not used in the visible lines.
1532 SDLoc SL(Op);
1533 EVT VT = Op.getValueType();
1534 return DAG.getTargetBlockAddress(BA->getBlockAddress(), VT, BA->getOffset(),
1535 BA->getTargetFlags());
1536}
1537
1539 SDValue Op,
1540 SelectionDAG &DAG) const {
// Lowers a global address to a constant where one can be determined, and
// returns an empty SDValue otherwise.
// NOTE(review): the signature line, the definitions of G and MFI, and parts of
// several conditions are on lines missing from this listing.
1541
1542 const DataLayout &DL = DAG.getDataLayout();
1544 const GlobalValue *GV = G->getGlobal();
1545
// Outside module entry functions, use the address obtained below when one is
// available (the lookup itself is on a missing line).
1546 if (!MFI->isModuleEntryFunction()) {
1547 auto IsNamedBarrier = AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV));
1548 if (std::optional<uint32_t> Address =
1550 if (IsNamedBarrier) {
// The barrier count is the global's size divided by 16.
1551 unsigned BarCnt = cast<GlobalVariable>(GV)->getGlobalSize(DL) / 16;
1552 MFI->recordNumNamedBarriers(Address.value(), BarCnt);
1553 }
1554 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1555 } else if (IsNamedBarrier) {
1556 llvm_unreachable("named barrier should have an assigned address");
1557 }
1558 }
1559
1560 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1561 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1562 if (!MFI->isModuleEntryFunction() &&
1563 GV->getName() != "llvm.amdgcn.module.lds" &&
1565 SDLoc DL(Op);
1566 const Function &Fn = DAG.getMachineFunction().getFunction();
1568 Fn, "local memory global used by non-kernel function",
1569 DL.getDebugLoc(), DS_Warning));
1570
1571 // We currently don't have a way to correctly allocate LDS objects that
1572 // aren't directly associated with a kernel. We do force inlining of
1573 // functions that use local objects. However, if these dead functions are
1574 // not eliminated, we don't want a compile time error. Just emit a warning
1575 // and a trap, since there should be no callable path here.
1576 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1577 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1578 Trap, DAG.getRoot());
1579 DAG.setRoot(OutputChain);
1580 return DAG.getPOISON(Op.getValueType());
1581 }
1582
1583 // XXX: What does the value of G->getOffset() mean?
1584 assert(G->getOffset() == 0 &&
1585 "Do not know what to do with an non-zero offset");
1586
1587 // TODO: We could emit code to handle the initialization somewhere.
1588 // We ignore the initializer for now and legalize it to allow selection.
1589 // The initializer will anyway get errored out during assembly emission.
// The lowered value is the offset returned by allocateLDSGlobal.
1590 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1591 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1592 }
// Not a local/region global: nothing is produced here.
1593 return SDValue();
1594}
1595
1597 SelectionDAG &DAG) const {
// Lowers a vector concatenation into a single build_vector.
// NOTE(review): the signature line, the declaration of Args and part of the
// NewEltVT expression are on lines missing from this listing.
1599 SDLoc SL(Op);
1600
1601 EVT VT = Op.getValueType();
// Sub-32-bit elements: when each operand is a whole number of 32-bit words,
// reinterpret the operands as i32 (or vectors of i32), concatenate those, and
// bitcast the result back to VT.
1602 if (VT.getVectorElementType().getSizeInBits() < 32) {
1603 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1604 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1605 unsigned NewNumElt = OpBitSize / 32;
1606 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1608 MVT::i32, NewNumElt);
1609 for (const SDUse &U : Op->ops()) {
1610 SDValue In = U.get();
1611 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
1612 if (NewNumElt > 1)
1613 DAG.ExtractVectorElements(NewIn, Args);
1614 else
1615 Args.push_back(NewIn);
1616 }
1617
1618 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1619 NewNumElt * Op.getNumOperands());
1620 SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1621 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1622 }
1623 }
1624
// General case: collect the elements of every operand in order.
1625 for (const SDUse &U : Op->ops())
1626 DAG.ExtractVectorElements(U.get(), Args);
1627
1628 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1629}
1630
1632 SelectionDAG &DAG) const {
// Lowers a subvector extraction starting at constant index Start (operand 1)
// into element extracts followed by a build_vector.
// NOTE(review): the signature line, the declaration of Args and the final
// argument of the last ExtractVectorElements call are on lines missing from
// this listing.
1633 SDLoc SL(Op);
1635 unsigned Start = Op.getConstantOperandVal(1);
1636 EVT VT = Op.getValueType();
1637 EVT SrcVT = Op.getOperand(0).getValueType();
1638
// 16-bit elements at an even start index: treat pairs of elements as single
// i32 values so the extraction works on 32-bit units.
1639 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1640 unsigned NumElt = VT.getVectorNumElements();
1641 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1642 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1643
1644 // Extract 32-bit registers at a time.
1645 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1646 EVT NewVT = NumElt == 2
1647 ? MVT::i32
1648 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1649 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1650
1651 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
// A two-element result is a single i32, so no build_vector is needed.
1652 if (NumElt == 2)
1653 Tmp = Args[0];
1654 else
1655 Tmp = DAG.getBuildVector(NewVT, SL, Args);
1656
1657 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1658 }
1659
1660 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1662
1663 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1664}
1665
1666 // TODO: Handle fabs too
// Returns the operand of Val when Val is an FNEG node; otherwise returns Val
// unchanged.
1668 if (Val.getOpcode() == ISD::FNEG)
1669 return Val.getOperand(0);
1670
1671 return Val;
1672}
1673
1675 if (Val.getOpcode() == ISD::FNEG)
// Looks through at most one FNEG, then one FABS, then one FCOPYSIGN, in that
// order, following operand 0 each time, and returns the value reached.
1676 Val = Val.getOperand(0);
1677 if (Val.getOpcode() == ISD::FABS)
1678 Val = Val.getOperand(0);
1679 if (Val.getOpcode() == ISD::FCOPYSIGN)
1680 Val = Val.getOperand(0);
1681 return Val;
1682}
1683
1685 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1686 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
// Turns select(setcc(LHS, RHS, CC), True, False) into FMIN_LEGACY or
// FMAX_LEGACY. The operand order of the new node depends on the condition
// code and on whether LHS is the True value. Returns an empty SDValue when no
// node is formed.
// NOTE(review): the signature line and the first half of the two
// "after legalization" conditions are on lines missing from this listing.
1687 SelectionDAG &DAG = DCI.DAG;
1688 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1689 switch (CCOpcode) {
// Equality, constant and ordered/unordered-only predicates are not min/max.
1690 case ISD::SETOEQ:
1691 case ISD::SETONE:
1692 case ISD::SETUNE:
1693 case ISD::SETNE:
1694 case ISD::SETUEQ:
1695 case ISD::SETEQ:
1696 case ISD::SETFALSE:
1697 case ISD::SETFALSE2:
1698 case ISD::SETTRUE:
1699 case ISD::SETTRUE2:
1700 case ISD::SETUO:
1701 case ISD::SETO:
1702 break;
1703 case ISD::SETULE:
1704 case ISD::SETULT: {
1705 if (LHS == True)
1706 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1707 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1708 }
1709 case ISD::SETOLE:
1710 case ISD::SETOLT:
1711 case ISD::SETLE:
1712 case ISD::SETLT: {
1713 // Ordered. Assume ordered for undefined.
1714
1715 // Only do this after legalization to avoid interfering with other combines
1716 // which might occur.
1718 !DCI.isCalledByLegalizer())
1719 return SDValue();
1720
1721 // We need to permute the operands to get the correct NaN behavior. The
1722 // selected operand is the second one based on the failing compare with NaN,
1723 // so permute it based on the compare type the hardware uses.
1724 if (LHS == True)
1725 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1726 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1727 }
1728 case ISD::SETUGE:
1729 case ISD::SETUGT: {
1730 if (LHS == True)
1731 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1732 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1733 }
1734 case ISD::SETGT:
1735 case ISD::SETGE:
1736 case ISD::SETOGE:
1737 case ISD::SETOGT: {
1739 !DCI.isCalledByLegalizer())
1740 return SDValue();
1741
1742 if (LHS == True)
1743 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1744 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1745 }
1746 case ISD::SETCC_INVALID:
1747 llvm_unreachable("Invalid setcc condcode!");
1748 }
1749 return SDValue();
1750}
1751
1752 /// Generate Min/Max node
1754 SDValue LHS, SDValue RHS,
1755 SDValue True, SDValue False,
1756 SDValue CC,
1757 DAGCombinerInfo &DCI) const {
// Direct match: the select arms are exactly the compare operands, in either
// order.
// NOTE(review): the signature line and the definitions of CRHS and CFalse are
// on lines missing from this listing.
1758 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1759 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1760
1761 SelectionDAG &DAG = DCI.DAG;
1762
1763 // If we can't directly match this, try to see if we can fold an fneg to
1764 // match.
1765
1768 SDValue NegTrue = peekFNeg(True);
1769
1770 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1771 // fmin/fmax.
1772 //
1773 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1774 // -> fneg (fmin_legacy lhs, K)
1775 //
1776 // TODO: Use getNegatedExpression
1777 if (LHS == NegTrue && CFalse && CRHS) {
// Only applies when the False constant is exactly the negation of the RHS
// constant.
1778 APFloat NegRHS = neg(CRHS->getValueAPF());
1779 if (NegRHS == CFalse->getValueAPF()) {
1780 SDValue Combined =
1781 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1782 if (Combined)
1783 return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1784 return SDValue();
1785 }
1786 }
1787
1788 return SDValue();
1789}
1790
1791 std::pair<SDValue, SDValue>
1793 SDLoc SL(Op);
// Bitcasts Op to v2i32 and returns (element 0, element 1) as the (Lo, Hi)
// pair of i32 values.
1794
1795 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1796
1797 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1798 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1799
1800 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1801 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1802
1803 return std::pair(Lo, Hi);
1804}
1805
1807 SDLoc SL(Op);
// Bitcasts Op to v2i32 and returns element 0 as an i32.
1808
1809 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1810 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1811 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1812}
1813
1815 SDLoc SL(Op);
// Bitcasts Op to v2i32 and returns element 1 as an i32.
1816
1817 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1818 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1819 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1820}
1821
1822 // Split a vector type into two parts. The first part is a power of two vector.
1823 // The second part is whatever is left over, and is a scalar if it would
1824 // otherwise be a 1-vector.
1825 std::pair<EVT, EVT>
1827 EVT LoVT, HiVT;
1828 EVT EltVT = VT.getVectorElementType();
1829 unsigned NumElts = VT.getVectorNumElements();
// The low part gets half the elements, rounded up to a power of two; for
// example 3 elements split as 2 + 1 and 5 elements as 4 + 1.
1830 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1831 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1832 HiVT = NumElts - LoNumElts == 1
1833 ? EltVT
1834 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1835 return std::pair(LoVT, HiVT);
1836}
1837
1838 // Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1839 // scalar.
// NOTE(review): the signature line, the start of the assert, and the
// definitions of Lo, Hi and Elts are on lines missing from this listing.
1840 std::pair<SDValue, SDValue>
1842 const EVT &LoVT, const EVT &HiVT,
1843 SelectionDAG &DAG) const {
1844 EVT VT = N.getValueType();
1846 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1847 VT.getVectorNumElements() &&
1848 "More vector elements requested than available!");
1850 DAG.getVectorIdxConstant(0, DL));
1851
// The high part starts right after the low part's elements.
1852 unsigned LoNumElts = LoVT.getVectorNumElements();
1853
1854 if (HiVT.isVector()) {
1855 unsigned HiNumElts = HiVT.getVectorNumElements();
1856 if ((VT.getVectorNumElements() % HiNumElts) == 0) {
1857 // Avoid creating an extract_subvector with an index that isn't a multiple
1858 // of the result type.
1860 DAG.getConstant(LoNumElts, DL, MVT::i32));
1861 return {Lo, Hi};
1862 }
1863
// Otherwise assemble the high part from individually extracted elements.
1865 DAG.ExtractVectorElements(N, Elts, /*Start=*/LoNumElts,
1866 /*Count=*/HiNumElts);
1867 SDValue Hi = DAG.getBuildVector(HiVT, DL, Elts);
1868 return {Lo, Hi};
1869 }
1870
// Scalar high part.
1872 DAG.getVectorIdxConstant(LoNumElts, DL));
1873 return {Lo, Hi};
1874}
1875
1877 SelectionDAG &DAG) const {
// Splits a vector load into a low and a high load, then merges the rejoined
// vector with a TokenFactor of both load chains.
// NOTE(review): the signature line, the definition of Load and parts of the
// second INSERT node's arguments are on lines missing from this listing.
1879 EVT VT = Op.getValueType();
1880 SDLoc SL(Op);
1881
1882
1883 // If this is a 2 element vector, we really want to scalarize and not create
1884 // weird 1 element vectors.
1885 if (VT.getVectorNumElements() == 2) {
1886 SDValue Ops[2];
1887 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1888 return DAG.getMergeValues(Ops, SL);
1889 }
1890
1891 SDValue BasePtr = Load->getBasePtr();
1892 EVT MemVT = Load->getMemoryVT();
1893
1894 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1895
1896 EVT LoVT, HiVT;
1897 EVT LoMemVT, HiMemVT;
1898 SDValue Lo, Hi;
1899
// The result type and the memory type are split the same way.
1900 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1901 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1902 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1903
// The high half starts Size bytes past the base, so its alignment is the
// common alignment of the base alignment and that offset.
1904 unsigned Size = LoMemVT.getStoreSize();
1905 Align BaseAlign = Load->getAlign();
1906 Align HiAlign = commonAlignment(BaseAlign, Size);
1907
1908 SDValue LoLoad = DAG.getExtLoad(
1909 Load->getExtensionType(), SL, LoVT, Load->getChain(), BasePtr, SrcValue,
1910 LoMemVT, BaseAlign, Load->getMemOperand()->getFlags(), Load->getAAInfo());
1911 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
1912 SDValue HiLoad = DAG.getExtLoad(
1913 Load->getExtensionType(), SL, HiVT, Load->getChain(), HiPtr,
1914 SrcValue.getWithOffset(LoMemVT.getStoreSize()), HiMemVT, HiAlign,
1915 Load->getMemOperand()->getFlags(), Load->getAAInfo());
1916
1917 SDValue Join;
1918 if (LoVT == HiVT) {
1919 // This is the case that the vector is power of two so was evenly split.
1920 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1921 } else {
// Uneven split: insert the low part into a poison vector at index 0, then
// insert the high part.
1922 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getPOISON(VT), LoLoad,
1923 DAG.getVectorIdxConstant(0, SL));
1924 Join = DAG.getNode(
1926 VT, Join, HiLoad,
1928 }
1929
1930 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1931 LoLoad.getValue(1), HiLoad.getValue(1))};
1932
1933 return DAG.getMergeValues(Ops, SL);
1934}
1935
1937 SelectionDAG &DAG) const {
// Loads a 3-element vector as a wider vector when that is safe, and otherwise
// falls back to SplitVectorLoad.
// NOTE(review): the signature line, the definition of Load and the
// initializers of WideVT and WideMemVT are on lines missing from this listing.
1939 EVT VT = Op.getValueType();
1940 SDValue BasePtr = Load->getBasePtr();
1941 EVT MemVT = Load->getMemoryVT();
1942 SDLoc SL(Op);
1943 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1944 Align BaseAlign = Load->getAlign();
1945 unsigned NumElements = MemVT.getVectorNumElements();
1946
1947 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1948 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1949 if (NumElements != 3 ||
1950 (BaseAlign < Align(8) &&
1951 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1952 return SplitVectorLoad(Op, DAG);
1953
1954 assert(NumElements == 3);
1955
1956 EVT WideVT =
1958 EVT WideMemVT =
1960 SDValue WideLoad = DAG.getExtLoad(
1961 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1962 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
// Return the leading VT-sized subvector of the wide load, plus its chain.
1963 return DAG.getMergeValues(
1964 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1965 DAG.getVectorIdxConstant(0, SL)),
1966 WideLoad.getValue(1)},
1967 SL);
1968}
1969
1971 SelectionDAG &DAG) const {
// Splits a vector store into a low and a high truncating store and returns a
// TokenFactor of the two.
// NOTE(review): the signature line and the definition of Store are on lines
// missing from this listing.
1973 SDValue Val = Store->getValue();
1974 EVT VT = Val.getValueType();
1975
1976 // If this is a 2 element vector, we really want to scalarize and not create
1977 // weird 1 element vectors.
1978 if (VT.getVectorNumElements() == 2)
1979 return scalarizeVectorStore(Store, DAG);
1980
1981 EVT MemVT = Store->getMemoryVT();
1982 SDValue Chain = Store->getChain();
1983 SDValue BasePtr = Store->getBasePtr();
1984 SDLoc SL(Op);
1985
1986 EVT LoVT, HiVT;
1987 EVT LoMemVT, HiMemVT;
1988 SDValue Lo, Hi;
1989
// The value type and the memory type are split the same way.
1990 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1991 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1992 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1993
// The high half is stored immediately after the low half in memory.
1994 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1995
1996 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1997 Align BaseAlign = Store->getAlign();
1998 unsigned Size = LoMemVT.getStoreSize();
1999 Align HiAlign = commonAlignment(BaseAlign, Size);
2000
// Both stores use the original chain; neither depends on the other.
2001 SDValue LoStore =
2002 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
2003 Store->getMemOperand()->getFlags(), Store->getAAInfo());
2004 SDValue HiStore = DAG.getTruncStore(
2005 Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size), HiMemVT, HiAlign,
2006 Store->getMemOperand()->getFlags(), Store->getAAInfo());
2007
2008 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
2009}
2010
2011 // This is a shortcut for integer division because we have fast i32<->f32
2012 // conversions, and fast f32 reciprocal instructions. The fractional part of a
2013 // float is enough to accurately represent up to a 24-bit integer.
// Returns the merged (Div, Rem) pair, or an empty SDValue when the operands
// cannot be shown to fit in 24 bits.
// NOTE(review): the signature line, the definitions of ToFp and ToInt, and
// parts of the FMAD/FMA opcode selection are on lines missing from this
// listing.
2015 bool Sign) const {
2016 SDLoc DL(Op);
2017 EVT VT = Op.getValueType();
2018 assert(VT == MVT::i32 && "LowerDIVREM24 expects an i32");
2019
2020 SDValue LHS = Op.getOperand(0);
2021 SDValue RHS = Op.getOperand(1);
2022 MVT IntVT = MVT::i32;
2023 MVT FltVT = MVT::f32;
2024
2025 unsigned LHSSignBits;
2026 unsigned RHSSignBits;
2027 if (Sign) {
// Signed: at least 9 sign bits out of 32 means the value fits in 24 bits.
2028 LHSSignBits = DAG.ComputeNumSignBits(LHS);
2029 RHSSignBits = DAG.ComputeNumSignBits(RHS);
2030 if (LHSSignBits < 9 || RHSSignBits < 9)
2031 return SDValue();
2032 } else {
// Unsigned: the largest possible value must not exceed 2^24 - 1.
2033 KnownBits LHSKnown = DAG.computeKnownBits(LHS);
2034 KnownBits RHSKnown = DAG.computeKnownBits(RHS);
2035 APInt U24Max = APInt::getLowBitsSet(32, 24);
2036 if (LHSKnown.getMaxValue().ugt(U24Max) ||
2037 RHSKnown.getMaxValue().ugt(U24Max))
2038 return SDValue();
2039 LHSSignBits = LHSKnown.countMinLeadingZeros();
2040 RHSSignBits = RHSKnown.countMinLeadingZeros();
2041 }
2042
// DivBits is the width the results are truncated to at the end; the signed
// case keeps one extra bit for the sign.
2043 unsigned BitSize = VT.getSizeInBits();
2044 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
2045 unsigned DivBits = BitSize - SignBits;
2046 if (Sign)
2047 ++DivBits;
2048
2051
// jq is the correction added to the truncated quotient: 1 when unsigned,
// and +1 or -1 (matching the quotient's sign) when signed.
2052 SDValue jq = DAG.getConstant(1, DL, IntVT);
2053
2054 if (Sign) {
2055 // char|short jq = ia ^ ib;
2056 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
2057
2058 // jq = jq >> (bitsize - 2)
2059 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
2060 DAG.getConstant(BitSize - 2, DL, VT));
2061
2062 // jq = jq | 0x1
2063 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
2064 }
2065
2066 // int ia = (int)LHS;
2067 SDValue ia = LHS;
2068
2069 // int ib = (int)RHS;
2070 SDValue ib = RHS;
2071
2072 // float fa = (float)ia;
2073 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
2074
2075 // float fb = (float)ib;
2076 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
2077
// fq = fa * rcp(fb): the approximate floating-point quotient.
2078 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
2079 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
2080
2081 // fq = trunc(fq);
2082 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
2083
2084 // float fqneg = -fq;
2085 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
2086
2088
2089 bool UseFmadFtz = false;
2090 if (Subtarget->isGCN()) {
2092 UseFmadFtz =
2094 }
2095
2096 // float fr = mad(fqneg, fb, fa);
2097 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2098 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
2100 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
2101
2102 // int iq = (int)fq;
2103 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
2104
2105 // fr = fabs(fr);
2106 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
2107
2108 // fb = fabs(fb);
2109 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
2110
2111 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2112
2113 // int cv = fr >= fb;
2114 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
2115
2116 // jq = (cv ? jq : 0);
2117 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
2118
2119 // dst = iq + jq;
2120 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
2121
2122 // Rem needs compensation, it's easier to recompute it
2123 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
2124 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
2125
2126 // Truncate to number of bits this divide really is.
2127 if (Sign) {
2128 SDValue InRegSize
2129 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
2130 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
2131 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
2132 } else {
2133 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
2134 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
2135 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
2136 }
2137
2138 return DAG.getMergeValues({ Div, Rem }, DL);
2139}
2140
// Expands a 64-bit unsigned divide/remainder of Op's two operands and appends
// the quotient, then the remainder, to Results. Three strategies are used:
//   1. If the high 32 bits of both operands are known zero, do a 32-bit
//      UDIVREM and pair each result with a zero high half.
//   2. If i64 is a legal type, start from a float reciprocal estimate, refine
//      it with two Newton-Raphson rounds, and apply up to two corrections.
//   3. Otherwise emit an unrolled bit-by-bit long division over the low half.
// NOTE(review): the signature lines carrying the function name and the
// Results parameter are missing from this listing.
2142 SelectionDAG &DAG,
2144 SDLoc DL(Op);
2145 EVT VT = Op.getValueType();
2146
2147 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2148
2149 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2150
2151 SDValue One = DAG.getConstant(1, DL, HalfVT);
2152 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
2153
2154 // Hi/Lo split
2155 SDValue LHS_Lo, LHS_Hi;
2156 SDValue LHS = Op.getOperand(0);
2157 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
2158
2159 SDValue RHS_Lo, RHS_Hi;
2160 SDValue RHS = Op.getOperand(1);
2161 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2162
// Fast path: both operands fit in 32 bits, so a half-width UDIVREM suffices.
2163 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2164 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
2165
2166 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2167 LHS_Lo, RHS_Lo);
2168
// Widen each 32-bit result to 64 bits by pairing it with a zero high half.
2169 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2170 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2171
2172 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2173 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2174 return;
2175 }
2176
2177 if (isTypeLegal(MVT::i64)) {
2178 // The algorithm here is based on ideas from "Software Integer Division",
2179 // Tom Rodeheffer, August 2008.
2180
2183
2184 // Compute denominator reciprocal.
2185 unsigned FMAD =
2186 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2189 : (unsigned)AMDGPUISD::FMAD_FTZ;
2190
// Float constants below are given as IEEE single bit patterns:
//   0x4f800000 = 2^32, 0x2f800000 = 2^-32, 0xcf800000 = -2^32,
//   0x5f7ffffc = a value just below 2^64.
// Mad1 approximates the 64-bit denominator as Cvt_Hi * 2^32 + Cvt_Lo.
2191 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2192 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2193 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2194 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2195 Cvt_Lo);
2196 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2197 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2198 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2199 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2200 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2201 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
// Trunc becomes the high 32 bits of the estimate; Mad2 = Mul1 - Trunc * 2^32
// becomes the low 32 bits.
2202 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2203 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2204 Mul1);
2205 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2206 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2207 SDValue Rcp64 = DAG.getBitcast(VT,
2208 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2209
2210 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2211 SDValue One64 = DAG.getConstant(1, DL, VT);
2212 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2213 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2214
2215 // First round of UNR (Unsigned integer Newton-Raphson).
2216 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2217 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2218 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2219 SDValue Mulhi1_Lo, Mulhi1_Hi;
2220 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2221 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
// 64-bit add Rcp64 + Mulhi1, done as two 32-bit adds chained through the carry.
2222 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2223 Mulhi1_Lo, Zero1);
2224 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2225 Mulhi1_Hi, Add1_Lo.getValue(1));
2226 SDValue Add1 = DAG.getBitcast(VT,
2227 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2228
2229 // Second round of UNR.
2230 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2231 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2232 SDValue Mulhi2_Lo, Mulhi2_Hi;
2233 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2234 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2235 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2236 Mulhi2_Lo, Zero1);
2237 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2238 Mulhi2_Hi, Add2_Lo.getValue(1));
2239 SDValue Add2 = DAG.getBitcast(VT,
2240 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2241
// Quotient estimate: high 64 bits of LHS * refined reciprocal.
2242 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2243
2244 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2245
// Remainder estimate Sub1 = LHS - RHS * Mulhi3, computed in 32-bit halves.
2246 SDValue Mul3_Lo, Mul3_Hi;
2247 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2248 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2249 Mul3_Lo, Zero1);
2250 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2251 Mul3_Hi, Sub1_Lo.getValue(1));
2252 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2253 SDValue Sub1 = DAG.getBitcast(VT,
2254 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2255
// C3 is all-ones when Sub1 >= RHS as a 64-bit unsigned compare: use the low
// half compare (C2) when the high halves are equal, else the high one (C1).
2256 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2257 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2258 ISD::SETUGE);
2259 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2260 ISD::SETUGE);
2261 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2262
2263 // TODO: Here and below portions of the code can be enclosed into if/endif.
2264 // Currently control flow is unconditional and we have 4 selects after
2265 // potential endif to substitute PHIs.
2266
2267 // if C3 != 0 ...
2268 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2269 RHS_Lo, Zero1);
2270 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2271 RHS_Hi, Sub1_Lo.getValue(1));
2272 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2273 Zero, Sub2_Lo.getValue(1));
2274 SDValue Sub2 = DAG.getBitcast(VT,
2275 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2276
2277 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2278
// C6: same 64-bit "remainder >= RHS" test, applied to the once-corrected
// remainder Sub2.
2279 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2280 ISD::SETUGE);
2281 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2282 ISD::SETUGE);
2283 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2284
2285 // if (C6 != 0)
2286 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2287
2288 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2289 RHS_Lo, Zero1);
2290 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2291 RHS_Hi, Sub2_Lo.getValue(1));
2292 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2293 Zero, Sub3_Lo.getValue(1));
2294 SDValue Sub3 = DAG.getBitcast(VT,
2295 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2296
2297 // endif C6
2298 // endif C3
2299
// Pick the quotient/remainder with zero, one or two corrections applied,
// depending on C3 and C6.
2300 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2301 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2302
2303 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2304 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2305
2306 Results.push_back(Div);
2307 Results.push_back(Rem);
2308
2309 return;
2310 }
2311
2312 // r600 expansion.
2313 // Get Speculative values
2314 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2315 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2316
// The speculative 32-bit results are only used when RHS_Hi == 0; otherwise
// the running remainder starts as LHS_Hi and the quotient's high half is 0.
2317 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2318 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2319 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2320
2321 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2322 SDValue DIV_Lo = Zero;
2323
2324 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2325
// Long division over the bits of LHS_Lo, most significant first. The loop
// runs while building the DAG, so one group of nodes is emitted per bit.
2326 for (unsigned i = 0; i < halfBitWidth; ++i) {
2327 const unsigned bitPos = halfBitWidth - i - 1;
2328 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2329 // Get value of high bit
2330 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2331 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2332 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2333
2334 // Shift
2335 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2336 // Add LHS high bit
2337 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2338
// Set quotient bit bitPos when the running remainder is at least RHS.
2339 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2340 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2341
2342 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2343
2344 // Update REM
2345 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2346 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2347 }
2348
2349 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2350 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2351 Results.push_back(DIV);
2352 Results.push_back(REM);
2353}
2354
// Lowers an unsigned divide/remainder node and returns the merged
// {quotient, remainder} pair. i64 is delegated to LowerUDIVREM64; i32 first
// tries the unsigned LowerDIVREM24 path; everything else falls through to the
// reciprocal-based expansion below.
// NOTE(review): the first signature line and the declaration of Results are
// missing from this listing.
2356 SelectionDAG &DAG) const {
2357 SDLoc DL(Op);
2358 EVT VT = Op.getValueType();
2359
2360 if (VT == MVT::i64) {
2362 LowerUDIVREM64(Op, DAG, Results);
2363 return DAG.getMergeValues(Results, DL);
2364 }
2365
2366 if (VT == MVT::i32) {
// A null result means the 24-bit path did not apply; continue below.
2367 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2368 return Res;
2369 }
2370
2371 SDValue X = Op.getOperand(0);
2372 SDValue Y = Op.getOperand(1);
2373
2374 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2375 // algorithm used here.
2376
2377 // Initial estimate of inv(y).
2378 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2379
2380 // One round of UNR.
2381 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2382 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2383 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2384 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2385
2386 // Quotient/remainder estimate.
2387 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2388 SDValue R =
2389 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2390
2391 // First quotient/remainder refinement.
// If R >= Y then bump Q by one and subtract Y from R.
2392 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2393 SDValue One = DAG.getConstant(1, DL, VT);
2394 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2395 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2396 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2397 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2398 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2399
2400 // Second quotient/remainder refinement.
2401 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2402 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2403 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2404 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2405 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2406
2407 return DAG.getMergeValues({Q, R}, DL);
2408}
2409
// Lowers a signed divide/remainder node and returns the merged
// {quotient, remainder} pair. i32 first tries the signed LowerDIVREM24 path.
// An i64 whose operands have enough known sign bits is done as a 32-bit
// SDIVREM and sign-extended. Otherwise the operands are made non-negative,
// an unsigned UDIVREM is emitted, and the signs are restored afterwards.
// NOTE(review): the first signature line is missing from this listing.
2411 SelectionDAG &DAG) const {
2412 SDLoc DL(Op);
2413 EVT VT = Op.getValueType();
2414
2415 SDValue LHS = Op.getOperand(0);
2416 SDValue RHS = Op.getOperand(1);
2417
2418 SDValue Zero = DAG.getConstant(0, DL, VT);
2419 SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
2420
2421 if (VT == MVT::i32) {
2422 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2423 return Res;
2424 }
2425
2426 // LHS must have > 33 sign-bits to ensure that LHS != -2147483648
2427 // Otherwise 32-bit division cannot be used safely.
2428 // -2147483648/1 and -2147483648/-1 are not equal,
2429 // but they produce the same lower 32-bit result.
2430 if (VT == MVT::i64 && DAG.ComputeNumSignBits(LHS) > 33 &&
2431 DAG.ComputeNumSignBits(RHS) > 32) {
2432 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2433
2434 // Hi/Lo split
2435 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2436 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2437 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2438 LHS_Lo, RHS_Lo);
2439 SDValue Res[2] = {
2440 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2441 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2442 };
2443 return DAG.getMergeValues(Res, DL);
2444 }
2445
// Sign masks: all-ones when the operand is negative, zero otherwise.
2446 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2447 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2448 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2449 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2450
// Absolute value via (x + sign) ^ sign.
2451 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2452 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2453
2454 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2455 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2456
2457 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2458 SDValue Rem = Div.getValue(1);
2459
// Conditionally negate each result via (v ^ sign) - sign.
2460 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2461 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2462
2463 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2464 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2465
2466 SDValue Res[2] = {
2467 Div,
2468 Rem
2469 };
2470 return DAG.getMergeValues(Res, DL);
2471}
2472
// Builds an f64 ceiling of Op's operand from FTRUNC plus a conditional +1.0.
// NOTE(review): the signature line is missing from this listing.
2474 SDLoc SL(Op);
2475 SDValue Src = Op.getOperand(0);
2476
2477 // result = trunc(src)
2478 // if (src > 0.0 && src != result)
2479 // result += 1.0
2480
2481 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2482
2483 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2484 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2485
2486 EVT SetCCVT =
2487 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2488
// Despite its name, Lt0 holds the ordered compare src > 0.0 (SETOGT).
2489 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2490 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2491 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2492
// Select the adjustment (1.0 or 0.0) and add it unconditionally.
2493 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2494 // TODO: Should this propagate fast-math-flags?
2495 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2496}
2497
// Returns the unbiased exponent of an f64, given Hi, the upper 32 bits of its
// bit pattern: the 11-bit field starting at bit 20 (FractBits - 32), minus
// the bias 1023.
// NOTE(review): the first signature line is missing from this listing.
2499 SelectionDAG &DAG) {
2500 const unsigned FractBits = 52;
2501 const unsigned ExpBits = 11;
2502
2503 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2504 Hi,
2505 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2506 DAG.getConstant(ExpBits, SL, MVT::i32));
2507 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2508 DAG.getConstant(1023, SL, MVT::i32));
2509
2510 return Exp;
2511}
2512
// Builds an f64 truncation with integer operations: fraction bits below the
// binary point, as given by the exponent, are masked off the source bits.
// NOTE(review): the signature line is missing from this listing.
2514 SDLoc SL(Op);
2515 SDValue Src = Op.getOperand(0);
2516
2517 assert(Op.getValueType() == MVT::f64);
2518
2519 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2520
2521 // Extract the upper half, since this is where we will find the sign and
2522 // exponent.
2523 SDValue Hi = getHiHalf64(Src, DAG);
2524
2525 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2526
2527 const unsigned FractBits = 52;
2528
2529 // Extract the sign bit.
2530 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2531 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2532
2533 // Extend back to 64-bits.
2534 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2535 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2536
2537 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2538 const SDValue FractMask
2539 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2540
// Shifting the fraction mask right by Exp leaves set bits only on the
// fractional positions; Tmp0 clears exactly those bits from the source.
2541 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2542 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2543 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2544
2545 EVT SetCCVT =
2546 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2547
2548 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2549
2550 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2551 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2552
// Exp < 0: the result is just the sign bit, i.e. a signed zero.
// Exp > 51: there are no fraction bits to clear, so pass the source through.
2553 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2554 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2555
2556 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2557}
2558
// Rounds an f64 to an integral value by adding and then subtracting 2^52
// carrying the sign of the source. Inputs whose magnitude exceeds
// 0x1.fffffffffffffp+51 are returned unchanged.
// NOTE(review): the rounding presumably follows the current FP rounding
// mode (round-to-nearest-even by default) - confirm.
// NOTE(review): the first signature line is missing from this listing.
2560 SelectionDAG &DAG) const {
2561 SDLoc SL(Op);
2562 SDValue Src = Op.getOperand(0);
2563
2564 assert(Op.getValueType() == MVT::f64);
2565
2566 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2567 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2568 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2569
2570 // TODO: Should this propagate fast-math-flags?
2571
// (Src + copysign(2^52, Src)) - copysign(2^52, Src)
2572 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2573 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2574
2575 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2576
2577 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2578 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2579
2580 EVT SetCCVT =
2581 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2582 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2583
// |Src| > C2 selects Src itself instead of the add/sub result.
2584 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2585}
2586
// Re-emits Op as an ISD::FROUNDEVEN node with the same type and operand.
// NOTE(review): the first signature line is missing from this listing.
2588 SelectionDAG &DAG) const {
2589 // FNEARBYINT and FRINT are the same, except in their handling of FP
2590 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2591 // rint, so just treat them as equivalent.
2592 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2593 Op.getOperand(0));
2594}
2595
// Re-emits Op as an ISD::FROUNDEVEN node with the same type and operand.
// NOTE(review): the signature line is missing from this listing.
2597 auto VT = Op.getValueType();
2598 auto Arg = Op.getOperand(0u);
2599 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2600}
2601
2602// XXX - May require not supporting f32 denormals?
2603
2604// Don't handle v2f16. The extra instructions to scalarize and repack around the
2605// compare and vselect end up producing worse code than scalarizing the whole
2606// operation.
// Builds round-half-away-from-zero: truncate X, then add copysign(1.0, X)
// when the discarded fraction has magnitude >= 0.5.
// NOTE(review): the signature line is missing from this listing.
2608 SDLoc SL(Op);
2609 SDValue X = Op.getOperand(0);
2610 EVT VT = Op.getValueType();
2611
2612 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2613
2614 // TODO: Should this propagate fast-math-flags?
2615
2616 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2617
2618 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2619
2620 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2621 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2622
2623 EVT SetCCVT =
2624 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2625
2626 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2627 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2628 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2629
// Give the 1.0/0.0 offset the sign of X so the adjustment moves away from zero.
2630 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2631 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2632}
2633
// Builds an f64 floor of Op's operand from FTRUNC plus a conditional -1.0.
// NOTE(review): the signature line is missing from this listing.
2635 SDLoc SL(Op);
2636 SDValue Src = Op.getOperand(0);
2637
2638 // result = trunc(src);
2639 // if (src < 0.0 && src != result)
2640 // result += -1.0.
2641
2642 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2643
2644 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2645 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2646
2647 EVT SetCCVT =
2648 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2649
2650 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2651 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2652 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2653
// Select the adjustment (-1.0 or 0.0) and add it unconditionally.
2654 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2655 // TODO: Should this propagate fast-math-flags?
2656 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2657}
2658
2659 /// Return true if it's known that \p Src can never be an f32 denormal value.
/// The decision is made purely from the opcode of \p Src (and, for the
/// intrinsic case, its intrinsic ID); any opcode not listed yields false.
// NOTE(review): the signature line and the case label that opens the
// intrinsic-ID block are missing from this listing.
2661 switch (Src.getOpcode()) {
2662 case ISD::FP_EXTEND:
// Only an extension from f16 qualifies.
2663 return Src.getOperand(0).getValueType() == MVT::f16;
2664 case ISD::FP16_TO_FP:
2665 case ISD::FFREXP:
2666 case ISD::FSQRT:
2667 case AMDGPUISD::LOG:
2668 case AMDGPUISD::EXP:
2669 return true;
2671 unsigned IntrinsicID = Src.getConstantOperandVal(0);
2672 switch (IntrinsicID) {
2673 case Intrinsic::amdgcn_frexp_mant:
2674 case Intrinsic::amdgcn_log:
2675 case Intrinsic::amdgcn_log_clamp:
2676 case Intrinsic::amdgcn_exp2:
2677 case Intrinsic::amdgcn_sqrt:
2678 return true;
2679 default:
2680 return false;
2681 }
2682 }
2683 default:
2684 return false;
2685 }
2686
// Every switch path above returns, so control never reaches this point.
2687 llvm_unreachable("covered opcode switch");
2688}
2689
// Returns whether the approximate-functions fast-math flag is set in Flags.
// NOTE(review): the first signature line is missing from this listing, so
// any other parameters are not visible here.
2691 SDNodeFlags Flags) {
2692 return Flags.hasApproximateFuncs();
2693}
2694
2703
// Returns a setcc that is true when Src is ordered-less-than the smallest
// normalized value of Src's floating-point type. Flags is not used in the
// visible body.
// NOTE(review): the first signature line is missing from this listing.
2705 SDValue Src,
2706 SDNodeFlags Flags) const {
2707 SDLoc SL(Src);
2708 EVT VT = Src.getValueType();
2709 const fltSemantics &Semantics = VT.getFltSemantics();
2710 SDValue SmallestNormal =
2711 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2712
2713 // Want to scale denormals up, but negatives and 0 work just as well on the
2714 // scaled path.
2715 SDValue IsLtSmallestNormal = DAG.getSetCC(
2716 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2717 SmallestNormal, ISD::SETOLT);
2718
2719 return IsLtSmallestNormal;
2720}
2721
// Returns a setcc that is true when |Src| is ordered-less-than +infinity.
// Because the compare is ordered, it is false for both infinities and NaN.
// NOTE(review): the first signature line is missing from this listing.
2723 SDNodeFlags Flags) const {
2724 SDLoc SL(Src);
2725 EVT VT = Src.getValueType();
2726 const fltSemantics &Semantics = VT.getFltSemantics();
2727 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2728
2729 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2730 SDValue IsFinite = DAG.getSetCC(
2731 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2732 Inf, ISD::SETOLT);
2733 return IsFinite;
2734}
2735
2736 /// If denormal handling is required return the scaled input to FLOG2, and the
2737 /// check for denormal range. Otherwise, return null values.
/// The scaled input is Src * 2^32 when Src is below the smallest normalized
/// f32, and Src * 1.0 otherwise; the second element is that comparison.
// NOTE(review): the signature line carrying the function name is missing
// from this listing.
2738std::pair<SDValue, SDValue>
2740 SDValue Src, SDNodeFlags Flags) const {
2741 if (!needsDenormHandlingF32(DAG, Src, Flags))
2742 return {};
2743
2744 MVT VT = MVT::f32;
2745 const fltSemantics &Semantics = APFloat::IEEEsingle();
2746 SDValue SmallestNormal =
2747 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2748
2749 SDValue IsLtSmallestNormal = DAG.getSetCC(
2750 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2751 SmallestNormal, ISD::SETOLT);
2752
2753 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2754 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2755 SDValue ScaleFactor =
2756 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2757
2758 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2759 return {ScaledInput, IsLtSmallestNormal};
2760}
2761
// Lowers a base-2 logarithm to AMDGPUISD::LOG. f16 is computed in f32 and
// rounded back; otherwise the input is optionally pre-scaled for denormals
// and the result corrected by subtracting 32.
// NOTE(review): the signature line is missing from this listing.
2763 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2764 // If we have to handle denormals, scale up the input and adjust the result.
2765
2766 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2767 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
2768
2769 SDLoc SL(Op);
2770 EVT VT = Op.getValueType();
2771 SDValue Src = Op.getOperand(0);
2772 SDNodeFlags Flags = Op->getFlags();
2773
2774 if (VT == MVT::f16) {
2775 // Nothing in half is a denormal when promoted to f32.
2776 assert(!isTypeLegal(VT));
2777 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2778 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2779 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2780 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2781 }
2782
// A null ScaledInput means no denormal handling is needed.
2783 auto [ScaledInput, IsLtSmallestNormal] =
2784 getScaledLogInput(DAG, SL, Src, Flags);
2785 if (!ScaledInput)
2786 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2787
2788 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2789
// Undo the 2^32 input scaling: log2(x * 2^32) = log2(x) + 32.
2790 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2791 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2792 SDValue ResultOffset =
2793 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2794 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2795}
2796
// Emits X * Y + C as a separate FMUL followed by an FADD (not a fused node),
// applying Flags to both nodes.
2797 static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2798 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2799 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2800 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2801}
2802
// Lowers ISD::FLOG or ISD::FLOG10. f16 inputs and nodes carrying the
// approximate-functions flag go through LowerFLOGUnsafe. Everything else
// computes Y = log2(x) with AMDGPUISD::LOG and multiplies it by ln(2) or
// ln(2)/ln(10), using a constant split in two parts plus correction terms.
// NOTE(review): the first signature line is missing from this listing.
2804 SelectionDAG &DAG) const {
2805 SDValue X = Op.getOperand(0);
2806 EVT VT = Op.getValueType();
2807 SDNodeFlags Flags = Op->getFlags();
2808 SDLoc DL(Op);
2809 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2810 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2811
2812 if (VT == MVT::f16 || Flags.hasApproximateFuncs()) {
2813 // TODO: The direct f16 path is 1.79 ulp for f16. This should be used
2814 // depending on !fpmath metadata.
2815
2816 bool PromoteToF32 = VT == MVT::f16 && (!Flags.hasApproximateFuncs() ||
2817 !isTypeLegal(MVT::f16));
2818
2819 if (PromoteToF32) {
2820 // Log and multiply in f32 is always good enough for f16.
2821 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2822 }
2823
2824 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2825 if (PromoteToF32) {
2826 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2827 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2828 }
2829
2830 return Lowered;
2831 }
2832
// NOTE(review): every VT == f16 case already returned in the block above,
// so the f16 branch below appears unreachable - confirm.
2833 SDValue ScaledInput, IsScaled;
2834 if (VT == MVT::f16)
2835 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2836 else {
2837 std::tie(ScaledInput, IsScaled) = getScaledLogInput(DAG, DL, X, Flags);
2838 if (ScaledInput)
2839 X = ScaledInput;
2840 }
2841
2842 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2843
2844 SDValue R;
2845 if (Subtarget->hasFastFMAF32()) {
2846 // c+cc are ln(2)/ln(10) to more than 49 bits
2847 const float c_log10 = 0x1.344134p-2f;
2848 const float cc_log10 = 0x1.09f79ep-26f;
2849
2850 // c + cc is ln(2) to more than 49 bits
2851 const float c_log = 0x1.62e42ep-1f;
2852 const float cc_log = 0x1.efa39ep-25f;
2853
2854 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2855 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2856 // This adds correction terms for which contraction may lead to an increase
2857 // in the error of the approximation, so disable it.
2858 Flags.setAllowContract(false);
// R = Y*C rounded; FMA0 = fma(Y, C, -R), the residual of that product;
// FMA1 adds the Y*CC term; the final FADD folds the correction into R.
2859 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2860 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2861 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2862 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2863 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2864 } else {
2865 // ch+ct is ln(2)/ln(10) to more than 36 bits
2866 const float ch_log10 = 0x1.344000p-2f;
2867 const float ct_log10 = 0x1.3509f6p-18f;
2868
2869 // ch + ct is ln(2) to more than 36 bits
2870 const float ch_log = 0x1.62e000p-1f;
2871 const float ct_log = 0x1.0bfbe8p-15f;
2872
2873 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2874 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2875
// Split Y into YH (low 12 bits of the bit pattern cleared) and the
// remainder YT = Y - YH.
2876 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2877 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2878 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2879 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2880 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2881 // This adds correction terms for which contraction may lead to an increase
2882 // in the error of the approximation, so disable it.
2883 Flags.setAllowContract(false);
2884 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2885 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2886 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
// NOTE(review): unlike the two calls above, this getMad call does not pass
// Flags, so its nodes get default flags - confirm this is intentional.
2887 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2888 }
2889
2890 const bool IsFiniteOnly = Flags.hasNoNaNs() && Flags.hasNoInfs();
2891
2892 // TODO: Check if known finite from source value.
2893 if (!IsFiniteOnly) {
// If Y is not finite, return Y itself instead of the scaled result.
2894 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2895 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2896 }
2897
2898 if (IsScaled) {
// Undo the input scaling: subtract approximately 32*log10(2) or 32*ln(2)
// when the input was scaled, and 0.0 otherwise.
2899 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2900 SDValue ShiftK =
2901 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2902 SDValue Shift =
2903 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2904 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2905 }
2906
2907 return R;
2908}
2909
2913
2914 // Do f32 fast math expansion for flog or flog10. This is accurate enough for a
2915 // promote f16 operation.
// The result is log2(Src) multiplied by Log2BaseInverted. For f32, when
// getScaledLogInput returns a scaled input, the scaled log is used and
// -32 * Log2BaseInverted is added back for inputs that were scaled.
// NOTE(review): the first signature line and the initializer of
// Log2BaseInverted are missing from this listing; the constant is presumably
// chosen by IsLog10 - confirm.
2917 SelectionDAG &DAG, bool IsLog10,
2918 SDNodeFlags Flags) const {
2919 EVT VT = Src.getValueType();
2920 unsigned LogOp =
2921 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2922
2923 double Log2BaseInverted =
2925
2926 if (VT == MVT::f32) {
2927 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2928 if (ScaledInput) {
2929 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2930 SDValue ScaledResultOffset =
2931 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2932
2933 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2934
2935 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2936 ScaledResultOffset, Zero, Flags);
2937
2938 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2939
2940 if (Subtarget->hasFastFMAF32())
2941 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2942 Flags);
2943 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
// NOTE(review): this FADD is created without Flags, unlike the FMUL above.
2944 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2945 }
2946 }
2947
// No scaling needed (or not f32): a plain log2 followed by one multiply.
2948 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2949 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2950
2951 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2952 Flags);
2953}
2954
2955// This expansion gives a result slightly better than 1ulp.
2957 SelectionDAG &DAG) const {
2958 SDLoc DL(Op);
2959 SDValue X = Op.getOperand(0);
2960
2961 // TODO: Check if reassoc is safe. There is an output change in exp2 and
2962 // exp10, which slightly increases ulp.
2963 SDNodeFlags Flags = Op->getFlags() & ~SDNodeFlags::AllowReassociation;
2964
2965 SDValue DN, F, T;
2966
2967 if (Op.getOpcode() == ISD::FEXP2) {
2968 // dn = rint(x)
2969 DN = DAG.getNode(ISD::FRINT, DL, MVT::f64, X, Flags);
2970 // f = x - dn
2971 F = DAG.getNode(ISD::FSUB, DL, MVT::f64, X, DN, Flags);
2972 // t = f*C1 + f*C2
2973 SDValue C1 = DAG.getConstantFP(0x1.62e42fefa39efp-1, DL, MVT::f64);
2974 SDValue C2 = DAG.getConstantFP(0x1.abc9e3b39803fp-56, DL, MVT::f64);
2975 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f64, F, C2, Flags);
2976 T = DAG.getNode(ISD::FMA, DL, MVT::f64, F, C1, Mul2, Flags);
2977 } else if (Op.getOpcode() == ISD::FEXP10) {
2978 // dn = rint(x * C1)
2979 SDValue C1 = DAG.getConstantFP(0x1.a934f0979a371p+1, DL, MVT::f64);
2980 SDValue Mul = DAG.getNode(ISD::FMUL, DL, MVT::f64, X, C1, Flags);
2981 DN = DAG.getNode(ISD::FRINT, DL, MVT::f64, Mul, Flags);
2982
2983 // f = FMA(-dn, C2, FMA(-dn, C3, x))
2984 SDValue NegDN = DAG.getNode(ISD::FNEG, DL, MVT::f64, DN, Flags);
2985 SDValue C2 = DAG.getConstantFP(-0x1.9dc1da994fd21p-59, DL, MVT::f64);
2986 SDValue C3 = DAG.getConstantFP(0x1.34413509f79ffp-2, DL, MVT::f64);
2987 SDValue Inner = DAG.getNode(ISD::FMA, DL, MVT::f64, NegDN, C3, X, Flags);
2988 F = DAG.getNode(ISD::FMA, DL, MVT::f64, NegDN, C2, Inner, Flags);
2989
2990 // t = FMA(f, C4, f*C5)
2991 SDValue C4 = DAG.getConstantFP(0x1.26bb1bbb55516p+1, DL, MVT::f64);
2992 SDValue C5 = DAG.getConstantFP(-0x1.f48ad494ea3e9p-53, DL, MVT::f64);
2993 SDValue MulF = DAG.getNode(ISD::FMUL, DL, MVT::f64, F, C5, Flags);
2994 T = DAG.getNode(ISD::FMA, DL, MVT::f64, F, C4, MulF, Flags);
2995 } else { // ISD::FEXP
2996 // dn = rint(x * C1)
2997 SDValue C1 = DAG.getConstantFP(0x1.71547652b82fep+0, DL, MVT::f64);
2998 SDValue Mul = DAG.getNode(ISD::FMUL, DL, MVT::f64, X, C1, Flags);
2999 DN = DAG.getNode(ISD::FRINT, DL, MVT::f64, Mul, Flags);
3000
3001 // t = FMA(-dn, C2, FMA(-dn, C3, x))
3002 SDValue NegDN = DAG.getNode(ISD::FNEG, DL, MVT::f64, DN, Flags);
3003 SDValue C2 = DAG.getConstantFP(0x1.abc9e3b39803fp-56, DL, MVT::f64);
3004 SDValue C3 = DAG.getConstantFP(0x1.62e42fefa39efp-1, DL, MVT::f64);
3005 SDValue Inner = DAG.getNode(ISD::FMA, DL, MVT::f64, NegDN, C3, X, Flags);
3006 T = DAG.getNode(ISD::FMA, DL, MVT::f64, NegDN, C2, Inner, Flags);
3007 }
3008
3009 // Polynomial expansion for p
3010 SDValue P = DAG.getConstantFP(0x1.ade156a5dcb37p-26, DL, MVT::f64);
3011 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3012 DAG.getConstantFP(0x1.28af3fca7ab0cp-22, DL, MVT::f64),
3013 Flags);
3014 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3015 DAG.getConstantFP(0x1.71dee623fde64p-19, DL, MVT::f64),
3016 Flags);
3017 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3018 DAG.getConstantFP(0x1.a01997c89e6b0p-16, DL, MVT::f64),
3019 Flags);
3020 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3021 DAG.getConstantFP(0x1.a01a014761f6ep-13, DL, MVT::f64),
3022 Flags);
3023 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3024 DAG.getConstantFP(0x1.6c16c1852b7b0p-10, DL, MVT::f64),
3025 Flags);
3026 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3027 DAG.getConstantFP(0x1.1111111122322p-7, DL, MVT::f64), Flags);
3028 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3029 DAG.getConstantFP(0x1.55555555502a1p-5, DL, MVT::f64), Flags);
3030 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3031 DAG.getConstantFP(0x1.5555555555511p-3, DL, MVT::f64), Flags);
3032 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3033 DAG.getConstantFP(0x1.000000000000bp-1, DL, MVT::f64), Flags);
3034
3035 SDValue One = DAG.getConstantFP(1.0, DL, MVT::f64);
3036
3037 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P, One, Flags);
3038 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P, One, Flags);
3039
3040 // z = ldexp(p, (int)dn)
3041 SDValue DNInt = DAG.getNode(ISD::FP_TO_SINT, DL, MVT::i32, DN);
3042 SDValue Z = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, P, DNInt, Flags);
3043
3044 // Overflow/underflow guards
3045 SDValue CondHi = DAG.getSetCC(
3046 DL, MVT::i1, X, DAG.getConstantFP(1024.0, DL, MVT::f64), ISD::SETULE);
3047
3048 if (!Flags.hasNoInfs()) {
3049 SDValue PInf = DAG.getConstantFP(std::numeric_limits<double>::infinity(),
3050 DL, MVT::f64);
3051 Z = DAG.getSelect(DL, MVT::f64, CondHi, Z, PInf, Flags);
3052 }
3053
3054 SDValue CondLo = DAG.getSetCC(
3055 DL, MVT::i1, X, DAG.getConstantFP(-1075.0, DL, MVT::f64), ISD::SETUGE);
3056 SDValue Zero = DAG.getConstantFP(0.0, DL, MVT::f64);
3057 Z = DAG.getSelect(DL, MVT::f64, CondLo, Z, Zero, Flags);
3058
3059 return Z;
3060}
3061
3063 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3064 // If we have to handle denormals, scale up the input and adjust the result.
3065
3066 EVT VT = Op.getValueType();
3067 if (VT == MVT::f64)
3068 return lowerFEXPF64(Op, DAG);
3069
3070 SDLoc SL(Op);
3071 SDValue Src = Op.getOperand(0);
3072 SDNodeFlags Flags = Op->getFlags();
3073
3074 if (VT == MVT::f16) {
3075 // Nothing in half is a denormal when promoted to f32.
3076 assert(!isTypeLegal(MVT::f16));
3077 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
3078 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
3079 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
3080 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
3081 }
3082
3083 assert(VT == MVT::f32);
3084
3085 if (!needsDenormHandlingF32(DAG, Src, Flags))
3086 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
3087
3088 // bool needs_scaling = x < -0x1.f80000p+6f;
3089 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3090
3091 // -nextafter(128.0, -1)
3092 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
3093
3094 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3095
3096 SDValue NeedsScaling =
3097 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
3098
3099 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
3100 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3101
3102 SDValue AddOffset =
3103 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
3104
3105 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
3106 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
3107
3108 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
3109 SDValue One = DAG.getConstantFP(1.0, SL, VT);
3110 SDValue ResultScale =
3111 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
3112
3113 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
3114}
3115
3117 SelectionDAG &DAG,
3118 SDNodeFlags Flags,
3119 bool IsExp10) const {
3120 // exp(x) -> exp2(M_LOG2E_F * x);
3121 // exp10(x) -> exp2(log2(10) * x);
3122 EVT VT = X.getValueType();
3123 SDValue Const =
3124 DAG.getConstantFP(IsExp10 ? 0x1.a934f0p+1f : numbers::log2e, SL, VT);
3125
3126 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Const, Flags);
3127 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
3128 : (unsigned)ISD::FEXP2,
3129 SL, VT, Mul, Flags);
3130}
3131
3133 SelectionDAG &DAG,
3134 SDNodeFlags Flags) const {
3135 EVT VT = X.getValueType();
3136 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags))
3137 return lowerFEXPUnsafeImpl(X, SL, DAG, Flags, /*IsExp10=*/false);
3138
3139 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3140
3141 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
3142 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
3143
3144 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
3145
3146 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
3147
3148 SDValue AdjustedX =
3149 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
3150
3151 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
3152 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
3153
3154 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
3155
3156 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
3157 SDValue AdjustedResult =
3158 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
3159
3160 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
3161 Flags);
3162}
3163
3164/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
3165/// handled correctly.
3167 SelectionDAG &DAG,
3168 SDNodeFlags Flags) const {
3169 const EVT VT = X.getValueType();
3170
3171 const unsigned Exp2Op = VT == MVT::f32 ? static_cast<unsigned>(AMDGPUISD::EXP)
3172 : static_cast<unsigned>(ISD::FEXP2);
3173
3174 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
3175 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3176 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3177 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3178
3179 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
3180 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3181 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
3182 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3183 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
3184 }
3185
3186 // bool s = x < -0x1.2f7030p+5f;
3187 // x += s ? 0x1.0p+5f : 0.0f;
3188 // exp10 = exp2(x * 0x1.a92000p+1f) *
3189 // exp2(x * 0x1.4f0978p-11f) *
3190 // (s ? 0x1.9f623ep-107f : 1.0f);
3191
3192 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3193
3194 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
3195 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
3196
3197 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
3198 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
3199 SDValue AdjustedX =
3200 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
3201
3202 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3203 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3204
3205 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
3206 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3207 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
3208 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3209
3210 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
3211
3212 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
3213 SDValue AdjustedResult =
3214 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
3215
3216 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
3217 Flags);
3218}
3219
3221 EVT VT = Op.getValueType();
3222
3223 if (VT == MVT::f64)
3224 return lowerFEXPF64(Op, DAG);
3225
3226 SDLoc SL(Op);
3227 SDValue X = Op.getOperand(0);
3228 SDNodeFlags Flags = Op->getFlags();
3229 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
3230
3231 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3232 // library behavior. Also, is known-not-daz source sufficient?
3233 if (allowApproxFunc(DAG, Flags)) { // TODO: Does this really require fast?
3234 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
3235 : lowerFEXPUnsafe(X, SL, DAG, Flags);
3236 }
3237
3238 if (VT.getScalarType() == MVT::f16) {
3239 if (VT.isVector())
3240 return SDValue();
3241
3242 // Nothing in half is a denormal when promoted to f32.
3243 //
3244 // exp(f16 x) ->
3245 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3246 //
3247 // exp10(f16 x) ->
3248 // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
3249 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
3250 SDValue Lowered = lowerFEXPUnsafeImpl(Ext, SL, DAG, Flags, IsExp10);
3251 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
3252 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
3253 }
3254
3255 assert(VT == MVT::f32);
3256
3257 // Algorithm:
3258 //
3259 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3260 //
3261 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3262 // n = 64*m + j, 0 <= j < 64
3263 //
3264 // e^x = 2^((64*m + j + f)/64)
3265 // = (2^m) * (2^(j/64)) * 2^(f/64)
3266 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3267 //
3268 // f = x*(64/ln(2)) - n
3269 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3270 //
3271 // e^x = (2^m) * (2^(j/64)) * e^r
3272 //
3273 // (2^(j/64)) is precomputed
3274 //
3275 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3276 // e^r = 1 + q
3277 //
3278 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3279 //
3280 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3281 SDNodeFlags FlagsNoContract = Flags;
3282 FlagsNoContract.setAllowContract(false);
3283
3284 SDValue PH, PL;
3285 if (Subtarget->hasFastFMAF32()) {
3286 const float c_exp = numbers::log2ef;
3287 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3288 const float c_exp10 = 0x1.a934f0p+1f;
3289 const float cc_exp10 = 0x1.2f346ep-24f;
3290
3291 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3292 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3293
3294 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3295 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
3296 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3297 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3298 } else {
3299 const float ch_exp = 0x1.714000p+0f;
3300 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3301
3302 const float ch_exp10 = 0x1.a92000p+1f;
3303 const float cl_exp10 = 0x1.4f0978p-11f;
3304
3305 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3306 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3307
3308 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3309 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3310 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3311 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3312 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3313
3314 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3315
3316 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3317 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3318 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3319 }
3320
3321 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3322
3323 // It is unsafe to contract this fsub into the PH multiply.
3324 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3325
3326 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3327 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3328 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3329
3330 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3331
3332 SDValue UnderflowCheckConst =
3333 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
3334
3335 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3336 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3337 SDValue Underflow =
3338 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3339
3340 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3341
3342 if (!Flags.hasNoInfs()) {
3343 SDValue OverflowCheckConst =
3344 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
3345 SDValue Overflow =
3346 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
3347 SDValue Inf =
3349 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3350 }
3351
3352 return R;
3353}
3354
3355static bool isCtlzOpc(unsigned Opc) {
3356 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_POISON;
3357}
3358
3359static bool isCttzOpc(unsigned Opc) {
3360 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_POISON;
3361}
3362
3364 SelectionDAG &DAG) const {
3365 auto SL = SDLoc(Op);
3366 auto Opc = Op.getOpcode();
3367 auto Arg = Op.getOperand(0u);
3368 auto ResultVT = Op.getValueType();
3369
3370 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3371 return {};
3372
3374 assert(ResultVT == Arg.getValueType());
3375
3376 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3377 SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
3378 SDValue NewOp;
3379
3380 if (Opc == ISD::CTLZ_ZERO_POISON) {
3381 NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
3382 NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
3383 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3384 } else {
3385 NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3386 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3387 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
3388 }
3389
3390 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3391}
3392
3394 SDLoc SL(Op);
3395 SDValue Src = Op.getOperand(0);
3396
3397 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3398 bool Ctlz = isCtlzOpc(Op.getOpcode());
3399 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3400
3401 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_POISON ||
3402 Op.getOpcode() == ISD::CTTZ_ZERO_POISON;
3403 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3404
3405 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3406 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3407 // (cttz hi:lo) -> (umin (ffbl src), 32)
3408 // (ctlz_zero_poison src) -> (ffbh src)
3409 // (cttz_zero_poison src) -> (ffbl src)
3410
3411 // 64-bit scalar version produce 32-bit result
3412 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3413 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3414 // (ctlz_zero_poison src) -> (S_FLBIT_I32_B64 src)
3415 // (cttz_zero_poison src) -> (S_FF1_I32_B64 src)
3416 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3417 if (!ZeroUndef) {
3418 const SDValue ConstVal = DAG.getConstant(
3419 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3420 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3421 }
3422 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3423 }
3424
3425 SDValue Lo, Hi;
3426 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3427
3428 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3429 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3430
3431 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3432 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3433 // (ctlz_zero_poison hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3434 // (cttz_zero_poison hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
3435
3436 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3437 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3438 if (Ctlz)
3439 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3440 else
3441 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3442
3443 SDValue NewOpr;
3444 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3445 if (!ZeroUndef) {
3446 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3447 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3448 }
3449
3450 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3451}
3452
3454 SDLoc SL(Op);
3455 SDValue Src = Op.getOperand(0);
3456 assert(Src.getValueType() == MVT::i32 && "LowerCTLS only supports i32");
3457 SDValue Ffbh = DAG.getNode(
3458 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
3459 DAG.getTargetConstant(Intrinsic::amdgcn_sffbh, SL, MVT::i32), Src);
3460 SDValue Clamped = DAG.getNode(ISD::UMIN, SL, MVT::i32, Ffbh,
3461 DAG.getConstant(32, SL, MVT::i32));
3462 return DAG.getNode(ISD::ADD, SL, MVT::i32, Clamped,
3463 DAG.getAllOnesConstant(SL, MVT::i32));
3464}
3465
3467 bool Signed) const {
3468 // The regular method converting a 64-bit integer to float roughly consists of
3469 // 2 steps: normalization and rounding. In fact, after normalization, the
3470 // conversion from a 64-bit integer to a float is essentially the same as the
3471 // one from a 32-bit integer. The only difference is that it has more
3472 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3473 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3474 // converted into the correct float number. The basic steps for the unsigned
3475 // conversion are illustrated in the following pseudo code:
3476 //
3477 // f32 uitofp(i64 u) {
3478 // i32 hi, lo = split(u);
3479 // // Only count the leading zeros in hi as we have native support of the
3480 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3481 // // reduced to a 32-bit one automatically.
3482 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3483 // u <<= shamt;
3484 // hi, lo = split(u);
3485 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3486 // // convert it as a 32-bit integer and scale the result back.
3487 // return uitofp(hi) * 2^(32 - shamt);
3488 // }
3489 //
3490 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3491 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
3492 // converted instead followed by negation based its sign bit.
3493
3494 SDLoc SL(Op);
3495 SDValue Src = Op.getOperand(0);
3496
3497 SDValue Lo, Hi;
3498 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3499 SDValue Sign;
3500 SDValue ShAmt;
3501 if (Signed && Subtarget->isGCN()) {
3502 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3503 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3504 // account. That is, the maximal shift is
3505 // - 32 if Lo and Hi have opposite signs;
3506 // - 33 if Lo and Hi have the same sign.
3507 //
3508 // Or, MaxShAmt = 33 + OppositeSign, where
3509 //
3510 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3511 // - -1 if Lo and Hi have opposite signs; and
3512 // - 0 otherwise.
3513 //
3514 // All in all, ShAmt is calculated as
3515 //
3516 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3517 //
3518 // or
3519 //
3520 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3521 //
3522 // to reduce the critical path.
3523 SDValue OppositeSign = DAG.getNode(
3524 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3525 DAG.getConstant(31, SL, MVT::i32));
3526 SDValue MaxShAmt =
3527 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3528 OppositeSign);
3529 // Count the leading sign bits.
3530 ShAmt = DAG.getNode(
3531 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
3532 DAG.getTargetConstant(Intrinsic::amdgcn_sffbh, SL, MVT::i32), Hi);
3533 // Different from unsigned conversion, the shift should be one bit less to
3534 // preserve the sign bit.
3535 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3536 DAG.getConstant(1, SL, MVT::i32));
3537 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3538 } else {
3539 if (Signed) {
3540 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3541 // absolute value first.
3542 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3543 DAG.getConstant(63, SL, MVT::i64));
3544 SDValue Abs =
3545 DAG.getNode(ISD::XOR, SL, MVT::i64,
3546 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3547 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3548 }
3549 // Count the leading zeros.
3550 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3551 // The shift amount for signed integers is [0, 32].
3552 }
3553 // Normalize the given 64-bit integer.
3554 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3555 // Split it again.
3556 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3557 // Calculate the adjust bit for rounding.
3558 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3559 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3560 DAG.getConstant(1, SL, MVT::i32), Lo);
3561 // Get the 32-bit normalized integer.
3562 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3563 // Convert the normalized 32-bit integer into f32.
3564
3565 bool UseLDEXP = isOperationLegal(ISD::FLDEXP, MVT::f32);
3566 unsigned Opc = Signed && UseLDEXP ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3567 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3568
3569 // Finally, need to scale back the converted floating number as the original
3570 // 64-bit integer is converted as a 32-bit one.
3571 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3572 ShAmt);
3573 // On GCN, use LDEXP directly.
3574 if (UseLDEXP)
3575 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3576
3577 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3578 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3579 // exponent is enough to avoid overflowing into the sign bit.
3580 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3581 DAG.getConstant(23, SL, MVT::i32));
3582 SDValue IVal =
3583 DAG.getNode(ISD::ADD, SL, MVT::i32,
3584 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3585 if (Signed) {
3586 // Set the sign bit.
3587 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3588 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3589 DAG.getConstant(31, SL, MVT::i32));
3590 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3591 }
3592 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3593}
3594
3596 bool Signed) const {
3597 SDLoc SL(Op);
3598 SDValue Src = Op.getOperand(0);
3599
3600 SDValue Lo, Hi;
3601 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3602
3604 SL, MVT::f64, Hi);
3605
3606 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3607
3608 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3609 DAG.getConstant(32, SL, MVT::i32));
3610 // TODO: Should this propagate fast-math-flags?
3611 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3612}
3613
3615 SelectionDAG &DAG) const {
3616 // TODO: Factor out code common with LowerSINT_TO_FP.
3617 EVT DestVT = Op.getValueType();
3618 SDValue Src = Op.getOperand(0);
3619 EVT SrcVT = Src.getValueType();
3620
3621 if (SrcVT == MVT::i16) {
3622 if (DestVT == MVT::f16)
3623 return Op;
3624 SDLoc DL(Op);
3625
3626 // Promote src to i32
3627 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3628 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3629 }
3630
3631 if (DestVT == MVT::bf16) {
3632 SDLoc SL(Op);
3633 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3634 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3635 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3636 }
3637
3638 if (SrcVT != MVT::i64)
3639 return Op;
3640
3641 if (DestVT == MVT::f16 && isTypeLegal(MVT::f16)) {
3642 SDLoc DL(Op);
3643
3644 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3645 SDValue FPRoundFlag =
3646 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3647 SDValue FPRound =
3648 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3649
3650 return FPRound;
3651 }
3652
3653 if (DestVT == MVT::f32)
3654 return LowerINT_TO_FP32(Op, DAG, false);
3655
3656 assert(DestVT == MVT::f64);
3657 return LowerINT_TO_FP64(Op, DAG, false);
3658}
3659
3661 SelectionDAG &DAG) const {
3662 EVT DestVT = Op.getValueType();
3663
3664 SDValue Src = Op.getOperand(0);
3665 EVT SrcVT = Src.getValueType();
3666
3667 if (SrcVT == MVT::i16) {
3668 if (DestVT == MVT::f16)
3669 return Op;
3670
3671 SDLoc DL(Op);
3672 // Promote src to i32
3673 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3674 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3675 }
3676
3677 if (DestVT == MVT::bf16) {
3678 SDLoc SL(Op);
3679 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3680 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3681 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3682 }
3683
3684 if (SrcVT != MVT::i64)
3685 return Op;
3686
3687 // TODO: Factor out code common with LowerUINT_TO_FP.
3688
3689 if (DestVT == MVT::f16 && isTypeLegal(MVT::f16)) {
3690 SDLoc DL(Op);
3691 SDValue Src = Op.getOperand(0);
3692
3693 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3694 SDValue FPRoundFlag =
3695 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3696 SDValue FPRound =
3697 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3698
3699 return FPRound;
3700 }
3701
3702 if (DestVT == MVT::f32)
3703 return LowerINT_TO_FP32(Op, DAG, true);
3704
3705 assert(DestVT == MVT::f64);
3706 return LowerINT_TO_FP64(Op, DAG, true);
3707}
3708
3710 bool Signed) const {
3711 SDLoc SL(Op);
3712
3713 SDValue Src = Op.getOperand(0);
3714 EVT SrcVT = Src.getValueType();
3715
3716 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3717
3718 // The basic idea of converting a floating point number into a pair of 32-bit
3719 // integers is illustrated as follows:
3720 //
3721 // tf := trunc(val);
3722 // hif := floor(tf * 2^-32);
3723 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3724 // hi := fptoi(hif);
3725 // lo := fptoi(lof);
3726 //
3727 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3728 SDValue Sign;
3729 if (Signed && SrcVT == MVT::f32) {
3730 // However, a 32-bit floating point number has only 23 bits mantissa and
3731 // it's not enough to hold all the significant bits of `lof` if val is
3732 // negative. To avoid the loss of precision, We need to take the absolute
3733 // value after truncating and flip the result back based on the original
3734 // signedness.
3735 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3736 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3737 DAG.getConstant(31, SL, MVT::i32));
3738 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3739 }
3740
3741 SDValue K0, K1;
3742 if (SrcVT == MVT::f64) {
3743 K0 = DAG.getConstantFP(
3744 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3745 SrcVT);
3746 K1 = DAG.getConstantFP(
3747 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3748 SrcVT);
3749 } else {
3750 K0 = DAG.getConstantFP(
3751 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3752 K1 = DAG.getConstantFP(
3753 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3754 }
3755 // TODO: Should this propagate fast-math-flags?
3756 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3757
3758 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3759
3760 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3761
3762 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3764 SL, MVT::i32, FloorMul);
3765 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3766
3767 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3768 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3769
3770 if (Signed && SrcVT == MVT::f32) {
3771 assert(Sign);
3772 // Flip the result based on the signedness, which is either all 0s or 1s.
3773 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3774 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3775 // r := xor(r, sign) - sign;
3776 Result =
3777 DAG.getNode(ISD::SUB, SL, MVT::i64,
3778 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3779 }
3780
3781 return Result;
3782}
3783
3785 SDLoc DL(Op);
3786 SDValue N0 = Op.getOperand(0);
3787
3788 // Convert to target node to get known bits
3789 if (N0.getValueType() == MVT::f32)
3790 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3791
3792 if (Op->getFlags().hasApproximateFuncs()) {
3793 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3794 return SDValue();
3795 }
3796
3797 return LowerF64ToF16Safe(N0, DL, DAG);
3798}
3799
3800// return node in i32
3802 SelectionDAG &DAG) const {
3803 assert(Src.getSimpleValueType() == MVT::f64);
3804
3805 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3806 // TODO: We can generate better code for True16.
3807 const unsigned ExpMask = 0x7ff;
3808 const unsigned ExpBiasf64 = 1023;
3809 const unsigned ExpBiasf16 = 15;
3810 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3811 SDValue One = DAG.getConstant(1, DL, MVT::i32);
3812 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Src);
3813 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3814 DAG.getConstant(32, DL, MVT::i64));
3815 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3816 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3817 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3818 DAG.getConstant(20, DL, MVT::i64));
3819 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3820 DAG.getConstant(ExpMask, DL, MVT::i32));
3821 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3822 // add the f16 bias (15) to get the biased exponent for the f16 format.
3823 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3824 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
3825
3826 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3827 DAG.getConstant(8, DL, MVT::i32));
3828 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3829 DAG.getConstant(0xffe, DL, MVT::i32));
3830
3831 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3832 DAG.getConstant(0x1ff, DL, MVT::i32));
3833 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3834
3835 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3836 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3837
3838 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3839 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3840 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3841 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
3842
3843 // N = M | (E << 12);
3844 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3845 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3846 DAG.getConstant(12, DL, MVT::i32)));
3847
3848 // B = clamp(1-E, 0, 13);
3849 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3850 One, E);
3851 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3852 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3853 DAG.getConstant(13, DL, MVT::i32));
3854
3855 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3856 DAG.getConstant(0x1000, DL, MVT::i32));
3857
3858 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3859 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3860 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3861 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3862
3863 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3864 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3865 DAG.getConstant(0x7, DL, MVT::i32));
3866 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3867 DAG.getConstant(2, DL, MVT::i32));
3868 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3869 One, Zero, ISD::SETEQ);
3870 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3871 One, Zero, ISD::SETGT);
3872 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3873 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3874
3875 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3876 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3877 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3878 I, V, ISD::SETEQ);
3879
3880 // Extract the sign bit.
3881 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3882 DAG.getConstant(16, DL, MVT::i32));
3883 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3884 DAG.getConstant(0x8000, DL, MVT::i32));
3885
3886 return DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3887}
3888
3890 SelectionDAG &DAG) const {
3891 SDValue Src = Op.getOperand(0);
3892 unsigned OpOpcode = Op.getOpcode();
3893 EVT SrcVT = Src.getValueType();
3894 EVT DestVT = Op.getValueType();
3895
3896 // Will be selected natively
3897 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3898 return Op;
3899
3900 if (SrcVT == MVT::bf16) {
3901 SDLoc DL(Op);
3902 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3903 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
3904 }
3905
3906 // Promote i16 to i32
3907 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3908 SDLoc DL(Op);
3909
3910 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3911 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3912 }
3913
3914 if (DestVT != MVT::i64)
3915 return Op;
3916
3917 if (SrcVT == MVT::f16 ||
3918 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3919 SDLoc DL(Op);
3920
3921 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3922 unsigned Ext =
3924 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3925 }
3926
3927 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3928 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3929
3930 return SDValue();
3931}
3932
3934 SelectionDAG &DAG) const {
3935 SDValue Src = Op.getOperand(0);
3936 unsigned OpOpcode = Op.getOpcode();
3937 EVT SrcVT = Src.getValueType();
3938 EVT DstVT = Op.getValueType();
3939 SDValue SatVTOp = Op.getNode()->getOperand(1);
3940 EVT SatVT = cast<VTSDNode>(SatVTOp)->getVT();
3941 SDLoc DL(Op);
3942
3943 uint64_t DstWidth = DstVT.getScalarSizeInBits();
3944 uint64_t SatWidth = SatVT.getScalarSizeInBits();
3945 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
3946
3947 // Will be selected natively
3948 if (DstVT == MVT::i32 && SatWidth == DstWidth &&
3949 (SrcVT == MVT::f32 || SrcVT == MVT::f64))
3950 return Op;
3951
3952 if (DstVT == MVT::i16 && SatWidth == DstWidth && SrcVT == MVT::f16)
3953 return Op;
3954
3955 // Perform all saturation at selected width (i16 or i32) and truncate
3956 if (SatWidth < DstWidth && SatWidth <= 32) {
3957 // For f16 conversion with sub-i16 saturation perform saturation
3958 // at i16, if available in the target. This removes the need for extra f16
3959 // to f32 conversion. For all the others use i32.
3960 MVT ResultVT =
3961 Subtarget->has16BitInsts() && SrcVT == MVT::f16 && SatWidth < 16
3962 ? MVT::i16
3963 : MVT::i32;
3964
3965 const SDValue ResultVTOp = DAG.getValueType(ResultVT);
3966 const uint64_t ResultWidth = ResultVT.getScalarSizeInBits();
3967
3968 // First, convert input float into selected integer (i16 or i32)
3969 SDValue FpToInt = DAG.getNode(OpOpcode, DL, ResultVT, Src, ResultVTOp);
3970 SDValue IntSatVal;
3971
3972 // Then, clamp at the saturation width using either i16 or i32 instructions
3973 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
3974 SDValue MinConst = DAG.getConstant(
3975 APInt::getSignedMaxValue(SatWidth).sext(ResultWidth), DL, ResultVT);
3976 SDValue MaxConst = DAG.getConstant(
3977 APInt::getSignedMinValue(SatWidth).sext(ResultWidth), DL, ResultVT);
3978 SDValue MinVal = DAG.getNode(ISD::SMIN, DL, ResultVT, FpToInt, MinConst);
3979 IntSatVal = DAG.getNode(ISD::SMAX, DL, ResultVT, MinVal, MaxConst);
3980 } else {
3981 SDValue MinConst = DAG.getConstant(
3982 APInt::getMaxValue(SatWidth).zext(ResultWidth), DL, ResultVT);
3983 IntSatVal = DAG.getNode(ISD::UMIN, DL, ResultVT, FpToInt, MinConst);
3984 }
3985
3986 // Finally, after saturating at i16 or i32 fit into the destination type
3987 return DAG.getExtOrTrunc(OpOpcode == ISD::FP_TO_SINT_SAT, IntSatVal, DL,
3988 DstVT);
3989 }
3990
3991 // SatWidth == DstWidth
3992
3993 // Saturate at i32 for i64 dst and f16/bf16 src (will invoke f16 promotion
3994 // below)
3995 if (DstVT == MVT::i64 &&
3996 (SrcVT == MVT::f16 || SrcVT == MVT::bf16 ||
3997 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP))) {
3998 const SDValue Int32VTOp = DAG.getValueType(MVT::i32);
3999 return DAG.getNode(OpOpcode, DL, DstVT, Src, Int32VTOp);
4000 }
4001
4002 // Promote f16/bf16 src to f32 for i32 conversion
4003 if (DstVT == MVT::i32 && (SrcVT == MVT::f16 || SrcVT == MVT::bf16)) {
4004 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
4005 return DAG.getNode(Op.getOpcode(), DL, DstVT, PromotedSrc, SatVTOp);
4006 }
4007
4008 // For DstWidth < 16, promote i1 and i8 dst to i16 (if legal) with sub-i16
4009 // saturation. For DstWidth == 16, promote i16 dst to i32 with sub-i32
4010 // saturation; this covers i16.f32 and i16.f64
4011 if (DstWidth < 32) {
4012 // Note: this triggers SatWidth < DstWidth above to generate saturated
4013 // truncate by requesting MVT::i16/i32 destination with SatWidth < 16/32.
4014 MVT PromoteVT =
4015 (DstWidth < 16 && Subtarget->has16BitInsts()) ? MVT::i16 : MVT::i32;
4016 SDValue FpToInt = DAG.getNode(OpOpcode, DL, PromoteVT, Src, SatVTOp);
4017 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, FpToInt);
4018 }
4019
4020 // TODO: can we implement i64 dst for f32/f64?
4021
4022 return SDValue();
4023}
4024
4026 SelectionDAG &DAG) const {
4027 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4028 MVT VT = Op.getSimpleValueType();
4029 MVT ScalarVT = VT.getScalarType();
4030
4031 assert(VT.isVector());
4032
4033 SDValue Src = Op.getOperand(0);
4034 SDLoc DL(Op);
4035
4036 // TODO: Don't scalarize on Evergreen?
4037 unsigned NElts = VT.getVectorNumElements();
4039 DAG.ExtractVectorElements(Src, Args, 0, NElts);
4040
4041 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
4042 for (unsigned I = 0; I < NElts; ++I)
4043 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
4044
4045 return DAG.getBuildVector(VT, DL, Args);
4046}
4047
4048//===----------------------------------------------------------------------===//
4049// Custom DAG optimizations
4050//===----------------------------------------------------------------------===//
4051
4052static bool isU24(SDValue Op, SelectionDAG &DAG) {
4053 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
4054}
4055
4056static bool isI24(SDValue Op, SelectionDAG &DAG) {
4057 EVT VT = Op.getValueType();
4058 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
4059 // as unsigned 24-bit values.
4061}
4062
4065 SelectionDAG &DAG = DCI.DAG;
4066 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4067 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
4068
4069 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
4070 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
4071 unsigned NewOpcode = Node24->getOpcode();
4072 if (IsIntrin) {
4073 unsigned IID = Node24->getConstantOperandVal(0);
4074 switch (IID) {
4075 case Intrinsic::amdgcn_mul_i24:
4076 NewOpcode = AMDGPUISD::MUL_I24;
4077 break;
4078 case Intrinsic::amdgcn_mul_u24:
4079 NewOpcode = AMDGPUISD::MUL_U24;
4080 break;
4081 case Intrinsic::amdgcn_mulhi_i24:
4082 NewOpcode = AMDGPUISD::MULHI_I24;
4083 break;
4084 case Intrinsic::amdgcn_mulhi_u24:
4085 NewOpcode = AMDGPUISD::MULHI_U24;
4086 break;
4087 default:
4088 llvm_unreachable("Expected 24-bit mul intrinsic");
4089 }
4090 }
4091
4092 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
4093
4094 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
4095 // the operands to have other uses, but will only perform simplifications that
4096 // involve bypassing some nodes for this user.
4097 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
4098 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
4099 if (DemandedLHS || DemandedRHS)
4100 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
4101 DemandedLHS ? DemandedLHS : LHS,
4102 DemandedRHS ? DemandedRHS : RHS);
4103
4104 // Now try SimplifyDemandedBits which can simplify the nodes used by our
4105 // operands if this node is the only user.
4106 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
4107 return SDValue(Node24, 0);
4108 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
4109 return SDValue(Node24, 0);
4110
4111 return SDValue();
4112}
4113
4114template <typename IntTy>
4116 uint32_t Width, const SDLoc &DL) {
4117 if (Width + Offset < 32) {
4118 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
4119 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
4120 if constexpr (std::is_signed_v<IntTy>) {
4121 return DAG.getSignedConstant(Result, DL, MVT::i32);
4122 } else {
4123 return DAG.getConstant(Result, DL, MVT::i32);
4124 }
4125 }
4126
4127 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
4128}
4129
4130static bool hasVolatileUser(SDNode *Val) {
4131 for (SDNode *U : Val->users()) {
4132 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
4133 if (M->isVolatile())
4134 return true;
4135 }
4136 }
4137
4138 return false;
4139}
4140
4142 // i32 vectors are the canonical memory type.
4143 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
4144 return false;
4145
4146 if (!VT.isByteSized())
4147 return false;
4148
4149 unsigned Size = VT.getStoreSize();
4150
4151 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
4152 return false;
4153
4154 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
4155 return false;
4156
4157 return true;
4158}
4159
4160// Replace load of an illegal type with a bitcast from a load of a friendlier
4161// type.
4163 DAGCombinerInfo &DCI) const {
4164 if (!DCI.isBeforeLegalize())
4165 return SDValue();
4166
4168 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
4169 return SDValue();
4170
4171 SDLoc SL(N);
4172 SelectionDAG &DAG = DCI.DAG;
4173 EVT VT = LN->getMemoryVT();
4174
4175 unsigned Size = VT.getStoreSize();
4176 Align Alignment = LN->getAlign();
4177 if (Alignment < Size && isTypeLegal(VT)) {
4178 unsigned IsFast;
4179 unsigned AS = LN->getAddressSpace();
4180
4181 // Expand unaligned loads earlier than legalization. Due to visitation order
4182 // problems during legalization, the emitted instructions to pack and unpack
4183 // the bytes again are not eliminated in the case of an unaligned copy.
4185 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
4186 if (VT.isVector())
4187 return SplitVectorLoad(SDValue(LN, 0), DAG);
4188
4189 SDValue Ops[2];
4190 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
4191
4192 return DAG.getMergeValues(Ops, SDLoc(N));
4193 }
4194
4195 if (!IsFast)
4196 return SDValue();
4197 }
4198
4199 if (!shouldCombineMemoryType(VT))
4200 return SDValue();
4201
4202 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
4203
4204 SDValue NewLoad
4205 = DAG.getLoad(NewVT, SL, LN->getChain(),
4206 LN->getBasePtr(), LN->getMemOperand());
4207
4208 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
4209 DCI.CombineTo(N, BC, NewLoad.getValue(1));
4210 return SDValue(N, 0);
4211}
4212
4213// Replace store of an illegal type with a store of a bitcast to a friendlier
4214// type.
4216 DAGCombinerInfo &DCI) const {
4217 if (!DCI.isBeforeLegalize())
4218 return SDValue();
4219
4221 if (!SN->isSimple() || !ISD::isNormalStore(SN))
4222 return SDValue();
4223
4224 EVT VT = SN->getMemoryVT();
4225 unsigned Size = VT.getStoreSize();
4226
4227 SDLoc SL(N);
4228 SelectionDAG &DAG = DCI.DAG;
4229 Align Alignment = SN->getAlign();
4230 if (Alignment < Size && isTypeLegal(VT)) {
4231 unsigned IsFast;
4232 unsigned AS = SN->getAddressSpace();
4233
4234 // Expand unaligned stores earlier than legalization. Due to visitation
4235 // order problems during legalization, the emitted instructions to pack and
4236 // unpack the bytes again are not eliminated in the case of an unaligned
4237 // copy.
4239 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
4240 if (VT.isVector())
4241 return SplitVectorStore(SDValue(SN, 0), DAG);
4242
4243 return expandUnalignedStore(SN, DAG);
4244 }
4245
4246 if (!IsFast)
4247 return SDValue();
4248 }
4249
4250 if (!shouldCombineMemoryType(VT))
4251 return SDValue();
4252
4253 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
4254 SDValue Val = SN->getValue();
4255
4256 // DCI.AddToWorklist(Val.getNode());
4257
4258 bool OtherUses = !Val.hasOneUse();
4259 SDValue CastVal = DAG.getBitcast(NewVT, Val);
4260 if (OtherUses) {
4261 SDValue CastBack = DAG.getBitcast(VT, CastVal);
4262 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
4263 }
4264
4265 return DAG.getStore(SN->getChain(), SL, CastVal,
4266 SN->getBasePtr(), SN->getMemOperand());
4267}
4268
4269// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
4270// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
4271// issues.
4273 DAGCombinerInfo &DCI) const {
4274 SelectionDAG &DAG = DCI.DAG;
4275 SDValue N0 = N->getOperand(0);
4276
4277 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
4278 // (vt2 (truncate (assertzext vt0:x, vt1)))
4279 if (N0.getOpcode() == ISD::TRUNCATE) {
4280 SDValue N1 = N->getOperand(1);
4281 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
4282 SDLoc SL(N);
4283
4284 SDValue Src = N0.getOperand(0);
4285 EVT SrcVT = Src.getValueType();
4286 if (SrcVT.bitsGE(ExtVT)) {
4287 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
4288 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
4289 }
4290 }
4291
4292 return SDValue();
4293}
4294
4296 SDNode *N, DAGCombinerInfo &DCI) const {
4297 unsigned IID = N->getConstantOperandVal(0);
4298 switch (IID) {
4299 case Intrinsic::amdgcn_mul_i24:
4300 case Intrinsic::amdgcn_mul_u24:
4301 case Intrinsic::amdgcn_mulhi_i24:
4302 case Intrinsic::amdgcn_mulhi_u24:
4303 return simplifyMul24(N, DCI);
4304 case Intrinsic::amdgcn_fract:
4305 case Intrinsic::amdgcn_rsq:
4306 case Intrinsic::amdgcn_rcp_legacy:
4307 case Intrinsic::amdgcn_rsq_legacy:
4308 case Intrinsic::amdgcn_rsq_clamp:
4309 case Intrinsic::amdgcn_tanh:
4310 case Intrinsic::amdgcn_prng_b32: {
4311 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
4312 SDValue Src = N->getOperand(1);
4313 return Src.isUndef() ? Src : SDValue();
4314 }
4315 case Intrinsic::amdgcn_frexp_exp: {
4316 // frexp_exp (fneg x) -> frexp_exp x
4317 // frexp_exp (fabs x) -> frexp_exp x
4318 // frexp_exp (fneg (fabs x)) -> frexp_exp x
4319 SDValue Src = N->getOperand(1);
4320 SDValue PeekSign = peekFPSignOps(Src);
4321 if (PeekSign == Src)
4322 return SDValue();
4323 return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
4324 0);
4325 }
4326 default:
4327 return SDValue();
4328 }
4329}
4330
4331/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
4332/// binary operation \p Opc to it with the corresponding constant operands.
4334 DAGCombinerInfo &DCI, const SDLoc &SL,
4335 unsigned Opc, SDValue LHS,
4336 uint32_t ValLo, uint32_t ValHi) const {
4337 SelectionDAG &DAG = DCI.DAG;
4338 SDValue Lo, Hi;
4339 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
4340
4341 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
4342 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
4343
4344 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
4345 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
4346
4347 // Re-visit the ands. It's possible we eliminated one of them and it could
4348 // simplify the vector.
4349 DCI.AddToWorklist(Lo.getNode());
4350 DCI.AddToWorklist(Hi.getNode());
4351
4352 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
4353 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4354}
4355
4357 DAGCombinerInfo &DCI) const {
4358 EVT VT = N->getValueType(0);
4359 SDValue LHS = N->getOperand(0);
4360 SDValue RHS = N->getOperand(1);
4362 SDLoc SL(N);
4363 SelectionDAG &DAG = DCI.DAG;
4364
4365 unsigned RHSVal;
4366 if (CRHS) {
4367 RHSVal = CRHS->getZExtValue();
4368 if (!RHSVal)
4369 return LHS;
4370
4371 switch (LHS->getOpcode()) {
4372 default:
4373 break;
4374 case ISD::ZERO_EXTEND:
4375 case ISD::SIGN_EXTEND:
4376 case ISD::ANY_EXTEND: {
4377 SDValue X = LHS->getOperand(0);
4378
4379 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
4380 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
4381 // Prefer build_vector as the canonical form if packed types are legal.
4382 // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
4383 SDValue Vec = DAG.getBuildVector(
4384 MVT::v2i16, SL,
4385 {DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0)});
4386 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
4387 }
4388
4389 // shl (ext x) => zext (shl x), if shift does not overflow int
4390 if (VT != MVT::i64)
4391 break;
4392 KnownBits Known = DAG.computeKnownBits(X);
4393 unsigned LZ = Known.countMinLeadingZeros();
4394 if (LZ < RHSVal)
4395 break;
4396 EVT XVT = X.getValueType();
4397 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(CRHS, 0));
4398 return DAG.getZExtOrTrunc(Shl, SL, VT);
4399 }
4400 }
4401 }
4402
4403 if (VT.getScalarType() != MVT::i64)
4404 return SDValue();
4405
4406 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4407 // common case, splitting this into a move and a 32-bit shift is faster and
4408 // the same code size.
4409 KnownBits Known = DAG.computeKnownBits(RHS);
4410
4411 EVT ElementType = VT.getScalarType();
4412 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4413 EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);
4414
4415 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4416 return SDValue();
4417 SDValue ShiftAmt;
4418
4419 if (CRHS) {
4420 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4421 TargetType);
4422 } else {
4423 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4424 const SDValue ShiftMask =
4425 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4426 // This AND instruction will clamp out of bounds shift values.
4427 // It will also be removed during later instruction selection.
4428 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4429 }
4430
4431 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, LHS);
4432 SDValue NewShift =
4433 DAG.getNode(ISD::SHL, SL, TargetType, Lo, ShiftAmt, N->getFlags());
4434
4435 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4436 SDValue Vec;
4437
4438 if (VT.isVector()) {
4439 EVT ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4440 unsigned NElts = TargetType.getVectorNumElements();
4442 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4443
4444 DAG.ExtractVectorElements(NewShift, HiOps, 0, NElts);
4445 for (unsigned I = 0; I != NElts; ++I)
4446 HiAndLoOps[2 * I + 1] = HiOps[I];
4447 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4448 } else {
4449 EVT ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4450 Vec = DAG.getBuildVector(ConcatType, SL, {Zero, NewShift});
4451 }
4452 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4453}
4454
4456 DAGCombinerInfo &DCI) const {
4457 SDValue RHS = N->getOperand(1);
4459 EVT VT = N->getValueType(0);
4460 SDValue LHS = N->getOperand(0);
4461 SelectionDAG &DAG = DCI.DAG;
4462 SDLoc SL(N);
4463
4464 if (VT.getScalarType() != MVT::i64)
4465 return SDValue();
4466
4467 // For C >= 32
4468 // i64 (sra x, C) -> (build_pair (sra hi_32(x), C - 32), (sra hi_32(x), 31))
4469
4470 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4471 // common case, splitting this into a move and a 32-bit shift is faster and
4472 // the same code size.
4473 KnownBits Known = DAG.computeKnownBits(RHS);
4474
4475 EVT ElementType = VT.getScalarType();
4476 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4477 EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);
4478
4479 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4480 return SDValue();
4481
4482 SDValue ShiftFullAmt =
4483 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4484 SDValue ShiftAmt;
4485 if (CRHS) {
4486 unsigned RHSVal = CRHS->getZExtValue();
4487 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4488 TargetType);
4489 } else if (Known.getMinValue().getZExtValue() ==
4490 (ElementType.getSizeInBits() - 1)) {
4491 ShiftAmt = ShiftFullAmt;
4492 } else {
4493 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4494 const SDValue ShiftMask =
4495 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4496 // This AND instruction will clamp out of bounds shift values.
4497 // It will also be removed during later instruction selection.
4498 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4499 }
4500
4501 EVT ConcatType;
4502 SDValue Hi;
4503 SDLoc LHSSL(LHS);
4504 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4505 if (VT.isVector()) {
4506 unsigned NElts = TargetType.getVectorNumElements();
4507 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4508 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4509 SmallVector<SDValue, 8> HiOps(NElts);
4510 SmallVector<SDValue, 16> HiAndLoOps;
4511
4512 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, 0, NElts * 2);
4513 for (unsigned I = 0; I != NElts; ++I) {
4514 HiOps[I] = HiAndLoOps[2 * I + 1];
4515 }
4516 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
4517 } else {
4518 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
4519 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4520 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4521 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
4522 }
4523
4524 KnownBits KnownLHS = DAG.computeKnownBits(LHS);
4525 SDValue NewShift, HiShift;
4526 if (KnownLHS.isNegative()) {
4527 HiShift = DAG.getAllOnesConstant(SL, TargetType);
4528 NewShift =
4529 DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4530 } else if (CRHS &&
4531 CRHS->getZExtValue() == (ElementType.getSizeInBits() - 1)) {
4532 NewShift = HiShift =
4533 DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4534 } else {
4535 Hi = DAG.getFreeze(Hi);
4536 HiShift = DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftFullAmt);
4537 NewShift =
4538 DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4539 }
4540
4541 SDValue Vec;
4542 if (VT.isVector()) {
4543 unsigned NElts = TargetType.getVectorNumElements();
4546 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2);
4547
4548 DAG.ExtractVectorElements(HiShift, HiOps, 0, NElts);
4549 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
4550 for (unsigned I = 0; I != NElts; ++I) {
4551 HiAndLoOps[2 * I + 1] = HiOps[I];
4552 HiAndLoOps[2 * I] = LoOps[I];
4553 }
4554 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4555 } else {
4556 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, HiShift});
4557 }
4558 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4559}
4560
4562 DAGCombinerInfo &DCI) const {
4563 SDValue RHS = N->getOperand(1);
4565 EVT VT = N->getValueType(0);
4566 SDValue LHS = N->getOperand(0);
4567 SelectionDAG &DAG = DCI.DAG;
4568 SDLoc SL(N);
4569 unsigned RHSVal;
4570
4571 if (CRHS) {
4572 RHSVal = CRHS->getZExtValue();
4573
4574 // fold (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1)
4575 // this improves the ability to match BFE patterns in isel.
4576 if (LHS.getOpcode() == ISD::AND) {
4577 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
4578 unsigned MaskIdx, MaskLen;
4579 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4580 MaskIdx == RHSVal) {
4581 return DAG.getNode(ISD::AND, SL, VT,
4582 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0),
4583 N->getOperand(1)),
4584 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1),
4585 N->getOperand(1)));
4586 }
4587 }
4588 }
4589 }
4590
4591 if (VT.getScalarType() != MVT::i64)
4592 return SDValue();
4593
4594 // for C >= 32
4595 // i64 (srl x, C) -> (build_pair (srl hi_32(x), C - 32), 0)
4596
4597 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4598 // common case, splitting this into a move and a 32-bit shift is faster and
4599 // the same code size.
4600 KnownBits Known = DAG.computeKnownBits(RHS);
4601
4602 EVT ElementType = VT.getScalarType();
4603 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4604 EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);
4605
4606 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4607 return SDValue();
4608
4609 SDValue ShiftAmt;
4610 if (CRHS) {
4611 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4612 TargetType);
4613 } else {
4614 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4615 const SDValue ShiftMask =
4616 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4617 // This AND instruction will clamp out of bounds shift values.
4618 // It will also be removed during later instruction selection.
4619 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4620 }
4621
4622 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4623 EVT ConcatType;
4624 SDValue Hi;
4625 SDLoc LHSSL(LHS);
4626 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4627 if (VT.isVector()) {
4628 unsigned NElts = TargetType.getVectorNumElements();
4629 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4630 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4631 SmallVector<SDValue, 8> HiOps(NElts);
4632 SmallVector<SDValue, 16> HiAndLoOps;
4633
4634 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, /*Start=*/0, NElts * 2);
4635 for (unsigned I = 0; I != NElts; ++I)
4636 HiOps[I] = HiAndLoOps[2 * I + 1];
4637 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
4638 } else {
4639 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
4640 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4641 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4642 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
4643 }
4644
4645 SDValue NewShift =
4646 DAG.getNode(ISD::SRL, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4647
4648 SDValue Vec;
4649 if (VT.isVector()) {
4650 unsigned NElts = TargetType.getVectorNumElements();
4652 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4653
4654 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
4655 for (unsigned I = 0; I != NElts; ++I)
4656 HiAndLoOps[2 * I] = LoOps[I];
4657 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4658 } else {
4659 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, Zero});
4660 }
4661 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4662}
4663
4665 SDNode *N, DAGCombinerInfo &DCI) const {
4666 SDLoc SL(N);
4667 SelectionDAG &DAG = DCI.DAG;
4668 EVT VT = N->getValueType(0);
4669 SDValue Src = N->getOperand(0);
4670
4671 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4672 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4673 SDValue Vec = Src.getOperand(0);
4674 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4675 SDValue Elt0 = Vec.getOperand(0);
4676 EVT EltVT = Elt0.getValueType();
4677 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4678 if (EltVT.isFloatingPoint()) {
4679 Elt0 = DAG.getNode(ISD::BITCAST, SL,
4680 EltVT.changeTypeToInteger(), Elt0);
4681 }
4682
4683 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
4684 }
4685 }
4686 }
4687
4688 // Equivalent of above for accessing the high element of a vector as an
4689 // integer operation.
4690 // trunc (srl (bitcast (build_vector x, y)), 16) -> trunc (bitcast y)
4691 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4692 if (auto *K = isConstOrConstSplat(Src.getOperand(1))) {
4693 SDValue BV = stripBitcast(Src.getOperand(0));
4694 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
4695 EVT SrcEltVT = BV.getOperand(0).getValueType();
4696 unsigned SrcEltSize = SrcEltVT.getSizeInBits();
4697 unsigned BitIndex = K->getZExtValue();
4698 unsigned PartIndex = BitIndex / SrcEltSize;
4699
4700 if (PartIndex * SrcEltSize == BitIndex &&
4701 PartIndex < BV.getNumOperands()) {
4702 if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
4703 SDValue SrcElt =
4704 DAG.getNode(ISD::BITCAST, SL, SrcEltVT.changeTypeToInteger(),
4705 BV.getOperand(PartIndex));
4706 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
4707 }
4708 }
4709 }
4710 }
4711 }
4712
4713 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4714 //
4715 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4716 // i16 (trunc (srl (i32 (trunc x), K)))
4717 if (VT.getScalarSizeInBits() < 32) {
4718 EVT SrcVT = Src.getValueType();
4719 if (SrcVT.getScalarSizeInBits() > 32 &&
4720 (Src.getOpcode() == ISD::SRL ||
4721 Src.getOpcode() == ISD::SRA ||
4722 Src.getOpcode() == ISD::SHL)) {
4723 SDValue Amt = Src.getOperand(1);
4724 KnownBits Known = DAG.computeKnownBits(Amt);
4725
4726 // - For left shifts, do the transform as long as the shift
4727 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4728 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4729 // losing information stored in the high bits when truncating.
4730 const unsigned MaxCstSize =
4731 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4732 if (Known.getMaxValue().ule(MaxCstSize)) {
4733 EVT MidVT = VT.isVector() ?
4734 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4735 VT.getVectorNumElements()) : MVT::i32;
4736
4737 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
4738 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
4739 Src.getOperand(0));
4740 DCI.AddToWorklist(Trunc.getNode());
4741
4742 if (Amt.getValueType() != NewShiftVT) {
4743 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
4744 DCI.AddToWorklist(Amt.getNode());
4745 }
4746
4747 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
4748 Trunc, Amt);
4749 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
4750 }
4751 }
4752 }
4753
4754 return SDValue();
4755}
4756
4757// We need to specifically handle i64 mul here to avoid unnecessary conversion
4758// instructions. If we only match on the legalized i64 mul expansion,
4759// SimplifyDemandedBits will be unable to remove them because there will be
4760// multiple uses due to the separate mul + mulh[su].
4761static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4762 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4763 if (Size <= 32) {
4764 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4765 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
4766 }
4767
4768 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4769 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4770
4771 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
4772 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
4773
4774 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
4775}
4776
4777/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4778/// return SDValue().
4779static SDValue getAddOneOp(const SDNode *V) {
4780 if (V->getOpcode() != ISD::ADD)
4781 return SDValue();
4782
4783 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
4784}
4785
4787 DAGCombinerInfo &DCI) const {
  // Target combine for a divergent, scalar ISD::MUL of at most 64 bits.
  // Either rewrites mul x, (add y, 1) as add (mul x, y), x, or, when both
  // operands are known to fit in 24 bits, replaces the multiply with
  // MUL_[IU]24 (plus MULHI_[IU]24 for results wider than 32 bits). Returns an
  // empty SDValue when nothing was changed.
4788 assert(N->getOpcode() == ISD::MUL);
4789 EVT VT = N->getValueType(0);
4790
4791 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4792 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4793 // unnecessarily). isDivergent() is used as an approximation of whether the
4794 // value is in an SGPR.
4795 if (!N->isDivergent())
4796 return SDValue();
4797
  // Vectors and anything wider than 64 bits are left alone.
4798 unsigned Size = VT.getSizeInBits();
4799 if (VT.isVector() || Size > 64)
4800 return SDValue();
4801
4802 SelectionDAG &DAG = DCI.DAG;
4803 SDLoc DL(N);
4804
4805 SDValue N0 = N->getOperand(0);
4806 SDValue N1 = N->getOperand(1);
4807
4808 // Undo the InstCombine canonicalization X * Y + X -> X * (Y + 1) to enable
4809 // mad matching.
4810
4811 // mul x, (add y, 1) -> add (mul x, y), x
4812 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4813 SDValue AddOp = getAddOneOp(V.getNode());
4814 if (!AddOp)
4815 return SDValue();
4816
  // Accept the add only when this is its sole use, or when every one of its
  // users is itself a mul.
4817 if (V.hasOneUse() || all_of(V->users(), [](const SDNode *U) -> bool {
4818 return U->getOpcode() == ISD::MUL;
4819 }))
4820 return AddOp;
4821
4822 return SDValue();
4823 };
4824
4825 // FIXME: The selection pattern is not properly checking for commuted
4826 // operands, so we have to place the mul in the LHS
4827 if (SDValue MulOper = IsFoldableAdd(N0)) {
4828 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
4829 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
4830 }
4831
4832 if (SDValue MulOper = IsFoldableAdd(N1)) {
4833 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
4834 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
4835 }
4836
4837 // There are i16 integer mul/mad, so when i16 is legal leave multiplies of
  // at most 16 bits alone instead of forming a 24-bit multiply.
4838 if (isTypeLegal(MVT::i16) && VT.getScalarType().bitsLE(MVT::i16))
4839 return SDValue();
4840
4841 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4842 // in the source into any_extends if the result of the mul is truncated. Since
4843 // we can assume the high bits are whatever we want, use the underlying value
4844 // to avoid the unknown high bits from interfering.
4845 if (N0.getOpcode() == ISD::ANY_EXTEND)
4846 N0 = N0.getOperand(0);
4847
4848 if (N1.getOpcode() == ISD::ANY_EXTEND)
4849 N1 = N1.getOperand(0);
4850
4851 SDValue Mul;
4852
  // The unsigned form is tried first; the signed form is the fallback. The
  // operands are brought to i32 with the matching extension before the
  // multiply is built. If neither form applies the node is left untouched.
4853 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4854 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4855 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4856 Mul = getMul24(DAG, DL, N0, N1, Size, false);
4857 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4858 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4859 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4860 Mul = getMul24(DAG, DL, N0, N1, Size, true);
4861 } else {
4862 return SDValue();
4863 }
4864
4865 // We need to use sext even for MUL_U24, because MUL_U24 is used
4866 // for signed multiply of 8 and 16-bit types.
4867 return DAG.getSExtOrTrunc(Mul, DL, VT);
4868}
4869
4870SDValue
4872 DAGCombinerInfo &DCI) const {
  // Target combine for the two-result i32 multiplies (dispatched for
  // ISD::SMUL_LOHI and ISD::UMUL_LOHI). When both operands are known to fit
  // in 24 bits, the node is replaced by a MUL_[IU]24 / MULHI_[IU]24 pair.
  // Any other result type is left alone.
4873 if (N->getValueType(0) != MVT::i32)
4874 return SDValue();
4875
4876 SelectionDAG &DAG = DCI.DAG;
4877 SDLoc DL(N);
4878
  // Anything that is not SMUL_LOHI is treated as the unsigned form.
4879 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
4880 SDValue N0 = N->getOperand(0);
4881 SDValue N1 = N->getOperand(1);
4882
4883 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4884 // in the source into any_extends if the result of the mul is truncated. Since
4885 // we can assume the high bits are whatever we want, use the underlying value
4886 // to avoid the unknown high bits from interfering.
4887 if (N0.getOpcode() == ISD::ANY_EXTEND)
4888 N0 = N0.getOperand(0);
4889 if (N1.getOpcode() == ISD::ANY_EXTEND)
4890 N1 = N1.getOperand(0);
4891
4892 // Try to use two fast 24-bit multiplies (one for each half of the result)
4893 // instead of one slow extending multiply.
  // LoOpcode stays 0 when the subtarget lacks the instruction or an operand
  // is not known to fit in 24 bits; that is the "no match" signal below.
4894 unsigned LoOpcode = 0;
4895 unsigned HiOpcode = 0;
4896 if (Signed) {
4897 if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4898 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4899 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4900 LoOpcode = AMDGPUISD::MUL_I24;
4901 HiOpcode = AMDGPUISD::MULHI_I24;
4902 }
4903 } else {
4904 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4905 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4906 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4907 LoOpcode = AMDGPUISD::MUL_U24;
4908 HiOpcode = AMDGPUISD::MULHI_U24;
4909 }
4910 }
4911 if (!LoOpcode)
4912 return SDValue();
4913
4914 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
4915 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
  // Both results of N are replaced directly (Lo for result 0, Hi for result
  // 1). N itself is then returned rather than a new value.
  // NOTE(review): presumably the DAGCombiner convention for "already
  // replaced in place" - confirm against the PerformDAGCombine contract.
4916 DCI.CombineTo(N, Lo, Hi);
4917 return SDValue(N, 0);
4918}
4919
4921 DAGCombinerInfo &DCI) const {
4922 EVT VT = N->getValueType(0);
4923
4924 if (!Subtarget->hasMulI24() || VT.isVector())
4925 return SDValue();
4926
4927 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4928 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4929 // unnecessarily). isDivergent() is used as an approximation of whether the
4930 // value is in an SGPR.
4931 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4932 // valu op anyway)
4933 if (Subtarget->hasSMulHi() && !N->isDivergent())
4934 return SDValue();
4935
4936 SelectionDAG &DAG = DCI.DAG;
4937 SDLoc DL(N);
4938
4939 SDValue N0 = N->getOperand(0);
4940 SDValue N1 = N->getOperand(1);
4941
4942 if (!isI24(N0, DAG) || !isI24(N1, DAG))
4943 return SDValue();
4944
4945 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4946 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4947
4948 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
4949 DCI.AddToWorklist(Mulhi.getNode());
4950 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
4951}
4952
4954 DAGCombinerInfo &DCI) const {
  // Target combine for the unsigned high-half multiply (dispatched for
  // ISD::MULHU). When both operands are known to fit in an unsigned 24-bit
  // value, the node is replaced by AMDGPUISD::MULHI_U24 on the operands
  // brought to i32, with the i32 result converted back to the node's type.
  // Returns an empty SDValue when the combine does not apply.
4955 EVT VT = N->getValueType(0);
4956
  // Only scalar types of at most 32 bits are handled, and only when the
  // subtarget has the unsigned 24-bit multiply.
4957 if (VT.isVector() || VT.getSizeInBits() > 32 || !Subtarget->hasMulU24())
4958 return SDValue();
4959
4960 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4961 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4962 // unnecessarily). isDivergent() is used as an approximation of whether the
4963 // value is in an SGPR.
4964 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4965 // valu op anyway)
4966 if (!N->isDivergent() && Subtarget->hasSMulHi())
4967 return SDValue();
4968
4969 SelectionDAG &DAG = DCI.DAG;
4970 SDLoc DL(N);
4971
4972 SDValue N0 = N->getOperand(0);
4973 SDValue N1 = N->getOperand(1);
4974
4975 if (!isU24(N0, DAG) || !isU24(N1, DAG))
4976 return SDValue();
4977
4978 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4979 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4980
4981 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
  // Queue the new node so it is revisited by the combiner.
4982 DCI.AddToWorklist(Mulhi.getNode());
4983 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
4984}
4985
4986SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4987 SDValue Op,
4988 const SDLoc &DL,
4989 unsigned Opc) const {
4990 EVT VT = Op.getValueType();
4991 if (VT.bitsGT(MVT::i32))
4992 return SDValue();
4993
4994 if (VT != MVT::i32)
4995 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
4996
4997 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
4998 if (VT != MVT::i32)
4999 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
5000
5001 return FFBX;
5002}
5003
5004// The native instructions return -1 on 0 input. Optimize out a select that
5005// produces -1 on 0.
5006//
5007// TODO: If zero is not undef, we could also do this if the output is compared
5008// against the bitwidth.
5009//
5010// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
5012 SDValue LHS, SDValue RHS,
5013 DAGCombinerInfo &DCI) const {
  // Cond is the setcc feeding the select; LHS and RHS are the select's true
  // and false values. Only comparisons against the constant zero are matched.
5014 if (!isNullConstant(Cond.getOperand(1)))
5015 return SDValue();
5016
5017 SelectionDAG &DAG = DCI.DAG;
5018 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
5019 SDValue CmpLHS = Cond.getOperand(0);
5020
  // In both forms below the count must be taken of the very value that is
  // compared against zero, and the other select arm must be all-ones (-1).
5021 // select (setcc x, 0, eq), -1, (ctlz_zero_poison x) -> ffbh_u32 x
5022 // select (setcc x, 0, eq), -1, (cttz_zero_poison x) -> ffbl_u32 x
5023 if (CCOpcode == ISD::SETEQ &&
5024 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
5025 RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
5026 unsigned Opc =
5027 isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
5028 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
5029 }
5030
5031 // select (setcc x, 0, ne), (ctlz_zero_poison x), -1 -> ffbh_u32 x
5032 // select (setcc x, 0, ne), (cttz_zero_poison x), -1 -> ffbl_u32 x
5033 if (CCOpcode == ISD::SETNE &&
5034 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
5035 LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
5036 unsigned Opc =
5037 isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
5038
5039 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
5040 }
5041
5042 return SDValue();
5043}
5044
5046 unsigned Op,
5047 const SDLoc &SL,
5048 SDValue Cond,
5049 SDValue N1,
5050 SDValue N2) {
  // Rewrites select Cond, (Op x), (Op y) as Op (select Cond, x, y): a new
  // select is built over operand 0 of N1 and of N2, and the unary opcode Op is
  // applied to its result. Both N1 and N2 are expected to be Op nodes of the
  // same value type; that is not checked here.
5051 SelectionDAG &DAG = DCI.DAG;
5052 EVT VT = N1.getValueType();
5053
5054 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
5055 N1.getOperand(0), N2.getOperand(0));
  // Queue the new select so it is revisited by the combiner.
5056 DCI.AddToWorklist(NewSelect.getNode());
5057 return DAG.getNode(Op, SL, VT, NewSelect);
5058}
5059
5060// Pull a free FP operation out of a select so it may fold into uses.
5061//
5062// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
5063// select c, (fneg x), k -> fneg (select c, x, (fneg k))
5064//
5065// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
5066// select c, (fabs x), +k -> fabs (select c, x, k)
5067SDValue
5069 SDValue N) const {
5070 SelectionDAG &DAG = DCI.DAG;
5071 SDValue Cond = N.getOperand(0);
5072 SDValue LHS = N.getOperand(1);
5073 SDValue RHS = N.getOperand(2);
5074
5075 EVT VT = N.getValueType();
5076 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
5077 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
5079 return SDValue();
5080
5081 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
5082 SDLoc(N), Cond, LHS, RHS);
5083 }
5084
5085 bool Inv = false;
5086 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
5087 std::swap(LHS, RHS);
5088 Inv = true;
5089 }
5090
5091 // TODO: Support vector constants.
5093 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
5094 !selectSupportsSourceMods(N.getNode())) {
5095 SDLoc SL(N);
5096 // If one side is an fneg/fabs and the other is a constant, we can push the
5097 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
5098 SDValue NewLHS = LHS.getOperand(0);
5099 SDValue NewRHS = RHS;
5100
5101 // Careful: if the neg can be folded up, don't try to pull it back down.
5102 bool ShouldFoldNeg = true;
5103
5104 if (NewLHS.hasOneUse()) {
5105 unsigned Opc = NewLHS.getOpcode();
5106 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
5107 ShouldFoldNeg = false;
5108 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
5109 ShouldFoldNeg = false;
5110 }
5111
5112 if (ShouldFoldNeg) {
5113 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
5114 return SDValue();
5115
5116 // We're going to be forced to use a source modifier anyway, there's no
5117 // point to pulling the negate out unless we can get a size reduction by
5118 // negating the constant.
5119 //
5120 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
5121 // about cheaper constants.
5122 if (NewLHS.getOpcode() == ISD::FABS &&
5124 return SDValue();
5125
5127 return SDValue();
5128
5129 if (LHS.getOpcode() == ISD::FNEG)
5130 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5131
5132 if (Inv)
5133 std::swap(NewLHS, NewRHS);
5134
5135 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
5136 Cond, NewLHS, NewRHS);
5137 DCI.AddToWorklist(NewSelect.getNode());
5138 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
5139 }
5140 }
5141
5142 return SDValue();
5143}
5144
5146 DAGCombinerInfo &DCI) const {
5147 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
5148 return Folded;
5149
5150 SDValue Cond = N->getOperand(0);
5151 if (Cond.getOpcode() != ISD::SETCC)
5152 return SDValue();
5153
5154 EVT VT = N->getValueType(0);
5155 SDValue LHS = Cond.getOperand(0);
5156 SDValue RHS = Cond.getOperand(1);
5157 SDValue CC = Cond.getOperand(2);
5158
5159 SDValue True = N->getOperand(1);
5160 SDValue False = N->getOperand(2);
5161
5162 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
5163 SelectionDAG &DAG = DCI.DAG;
5164 if (DAG.isConstantValueOfAnyType(True) &&
5165 !DAG.isConstantValueOfAnyType(False)) {
5166 // Swap cmp + select pair to move constant to false input.
5167 // This will allow using VOPC cndmasks more often.
5168 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
5169
5170 SDLoc SL(N);
5171 ISD::CondCode NewCC =
5172 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
5173
5174 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
5175 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
5176 }
5177
5178 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
5180 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
5181 // Revisit this node so we can catch min3/max3/med3 patterns.
5182 //DCI.AddToWorklist(MinMax.getNode());
5183 return MinMax;
5184 }
5185 }
5186
5187 // There's no reason to not do this if the condition has other uses.
5188 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
5189}
5190
5191static bool isInv2Pi(const APFloat &APF) {
5192 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
5193 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
5194 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
5195
5196 return APF.bitwiseIsEqual(KF16) ||
5197 APF.bitwiseIsEqual(KF32) ||
5198 APF.bitwiseIsEqual(KF64);
5199}
5200
5201// 0 and 1.0 / (2.0 * pi) do not have negated inline immediates, so there is
5202// an additional cost to negate them.
5205 if (C->isZero())
5206 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
5207
5208 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
5209 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
5210
5212}
5213
5219
5225
5226static unsigned inverseMinMax(unsigned Opc) {
5227 switch (Opc) {
5228 case ISD::FMAXNUM:
5229 return ISD::FMINNUM;
5230 case ISD::FMINNUM:
5231 return ISD::FMAXNUM;
5232 case ISD::FMAXNUM_IEEE:
5233 return ISD::FMINNUM_IEEE;
5234 case ISD::FMINNUM_IEEE:
5235 return ISD::FMAXNUM_IEEE;
5236 case ISD::FMAXIMUM:
5237 return ISD::FMINIMUM;
5238 case ISD::FMINIMUM:
5239 return ISD::FMAXIMUM;
5240 case ISD::FMAXIMUMNUM:
5241 return ISD::FMINIMUMNUM;
5242 case ISD::FMINIMUMNUM:
5243 return ISD::FMAXIMUMNUM;
5244 case AMDGPUISD::FMAX_LEGACY:
5245 return AMDGPUISD::FMIN_LEGACY;
5246 case AMDGPUISD::FMIN_LEGACY:
5247 return AMDGPUISD::FMAX_LEGACY;
5248 default:
5249 llvm_unreachable("invalid min/max opcode");
5250 }
5251}
5252
5253/// \return true if it's profitable to try to push an fneg into its source
5254/// instruction.
5256 // If the input has multiple uses and we can either fold the negate down, or
5257 // the other uses cannot, give up. This both prevents unprofitable
5258 // transformations and infinite loops: we won't repeatedly try to fold around
5259 // a negate that has no 'good' form.
5260 if (N0.hasOneUse()) {
5261 // This may be able to fold into the source, but at a code size cost. Don't
5262 // fold if the fold into the user is free.
5263 if (allUsesHaveSourceMods(N, 0))
5264 return false;
5265 } else {
5266 if (fnegFoldsIntoOp(N0.getNode()) &&
5268 return false;
5269 }
5270
5271 return true;
5272}
5273
5275 DAGCombinerInfo &DCI) const {
5276 SelectionDAG &DAG = DCI.DAG;
5277 SDValue N0 = N->getOperand(0);
5278 EVT VT = N->getValueType(0);
5279
5280 unsigned Opc = N0.getOpcode();
5281
5282 if (!shouldFoldFNegIntoSrc(N, N0))
5283 return SDValue();
5284
5285 SDLoc SL(N);
5286 switch (Opc) {
5287 case ISD::FADD: {
5288 if (!mayIgnoreSignedZero(N0) && !N->getFlags().hasNoSignedZeros())
5289 return SDValue();
5290
5291 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
5292 SDValue LHS = N0.getOperand(0);
5293 SDValue RHS = N0.getOperand(1);
5294
5295 if (LHS.getOpcode() != ISD::FNEG)
5296 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
5297 else
5298 LHS = LHS.getOperand(0);
5299
5300 if (RHS.getOpcode() != ISD::FNEG)
5301 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5302 else
5303 RHS = RHS.getOperand(0);
5304
5305 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
5306 if (Res.getOpcode() != ISD::FADD)
5307 return SDValue(); // Op got folded away.
5308 if (!N0.hasOneUse())
5309 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5310 return Res;
5311 }
5312 case ISD::FMUL:
5313 case AMDGPUISD::FMUL_LEGACY: {
5314 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
5315 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
5316 SDValue LHS = N0.getOperand(0);
5317 SDValue RHS = N0.getOperand(1);
5318
5319 if (LHS.getOpcode() == ISD::FNEG)
5320 LHS = LHS.getOperand(0);
5321 else if (RHS.getOpcode() == ISD::FNEG)
5322 RHS = RHS.getOperand(0);
5323 else
5324 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5325
5326 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
5327 if (Res.getOpcode() != Opc)
5328 return SDValue(); // Op got folded away.
5329 if (!N0.hasOneUse())
5330 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5331 return Res;
5332 }
5333 case ISD::FMA:
5334 case ISD::FMAD: {
5335 // TODO: handle llvm.amdgcn.fma.legacy
5336 if (!mayIgnoreSignedZero(N0) && !N->getFlags().hasNoSignedZeros())
5337 return SDValue();
5338
5339 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
5340 SDValue LHS = N0.getOperand(0);
5341 SDValue MHS = N0.getOperand(1);
5342 SDValue RHS = N0.getOperand(2);
5343
5344 if (LHS.getOpcode() == ISD::FNEG)
5345 LHS = LHS.getOperand(0);
5346 else if (MHS.getOpcode() == ISD::FNEG)
5347 MHS = MHS.getOperand(0);
5348 else
5349 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
5350
5351 if (RHS.getOpcode() != ISD::FNEG)
5352 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5353 else
5354 RHS = RHS.getOperand(0);
5355
5356 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
5357 if (Res.getOpcode() != Opc)
5358 return SDValue(); // Op got folded away.
5359 if (!N0.hasOneUse())
5360 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5361 return Res;
5362 }
5363 case ISD::FMAXNUM:
5364 case ISD::FMINNUM:
5365 case ISD::FMAXNUM_IEEE:
5366 case ISD::FMINNUM_IEEE:
5367 case ISD::FMINIMUM:
5368 case ISD::FMAXIMUM:
5369 case ISD::FMINIMUMNUM:
5370 case ISD::FMAXIMUMNUM:
5371 case AMDGPUISD::FMAX_LEGACY:
5372 case AMDGPUISD::FMIN_LEGACY: {
5373 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
5374 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
5375 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
5376 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
5377
5378 SDValue LHS = N0.getOperand(0);
5379 SDValue RHS = N0.getOperand(1);
5380
5381 // 0 doesn't have a negated inline immediate.
5382 // TODO: This constant check should be generalized to other operations.
5384 return SDValue();
5385
5386 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
5387 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5388 unsigned Opposite = inverseMinMax(Opc);
5389
5390 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
5391 if (Res.getOpcode() != Opposite)
5392 return SDValue(); // Op got folded away.
5393 if (!N0.hasOneUse())
5394 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5395 return Res;
5396 }
5397 case AMDGPUISD::FMED3: {
5398 SDValue Ops[3];
5399 for (unsigned I = 0; I < 3; ++I)
5400 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
5401
5402 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
5403 if (Res.getOpcode() != AMDGPUISD::FMED3)
5404 return SDValue(); // Op got folded away.
5405
5406 if (!N0.hasOneUse()) {
5407 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
5408 DAG.ReplaceAllUsesWith(N0, Neg);
5409
5410 for (SDNode *U : Neg->users())
5411 DCI.AddToWorklist(U);
5412 }
5413
5414 return Res;
5415 }
5416 case ISD::FP_EXTEND:
5417 case ISD::FTRUNC:
5418 case ISD::FRINT:
5419 case ISD::FNEARBYINT: // XXX - Should fround be handled?
5420 case ISD::FROUNDEVEN:
5421 case ISD::FSIN:
5422 case ISD::FCANONICALIZE:
5423 case AMDGPUISD::RCP:
5424 case AMDGPUISD::RCP_LEGACY:
5425 case AMDGPUISD::RCP_IFLAG:
5426 case AMDGPUISD::SIN_HW: {
5427 SDValue CvtSrc = N0.getOperand(0);
5428 if (CvtSrc.getOpcode() == ISD::FNEG) {
5429 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
5430 // (fneg (rcp (fneg x))) -> (rcp x)
5431 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
5432 }
5433
5434 if (!N0.hasOneUse())
5435 return SDValue();
5436
5437 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
5438 // (fneg (rcp x)) -> (rcp (fneg x))
5439 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
5440 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
5441 }
5442 case ISD::FP_ROUND: {
5443 SDValue CvtSrc = N0.getOperand(0);
5444
5445 if (CvtSrc.getOpcode() == ISD::FNEG) {
5446 // (fneg (fp_round (fneg x))) -> (fp_round x)
5447 return DAG.getNode(ISD::FP_ROUND, SL, VT,
5448 CvtSrc.getOperand(0), N0.getOperand(1));
5449 }
5450
5451 if (!N0.hasOneUse())
5452 return SDValue();
5453
5454 // (fneg (fp_round x)) -> (fp_round (fneg x))
5455 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
5456 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
5457 }
5458 case ISD::FP16_TO_FP: {
5459 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
5460 // f16, but legalization of f16 fneg ends up pulling it out of the source.
5461 // Put the fneg back as a legal source operation that can be matched later.
5462 SDLoc SL(N);
5463
5464 SDValue Src = N0.getOperand(0);
5465 EVT SrcVT = Src.getValueType();
5466
5467 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
5468 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
5469 DAG.getConstant(0x8000, SL, SrcVT));
5470 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
5471 }
5472 case ISD::SELECT: {
5473 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
5474 // TODO: Invert conditions of foldFreeOpFromSelect
5475 return SDValue();
5476 }
5477 case ISD::BITCAST: {
5478 SDLoc SL(N);
5479 SDValue BCSrc = N0.getOperand(0);
5480 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
5481 SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
5482 if (VT != MVT::f64 || HighBits.getValueType().getSizeInBits() != 32 ||
5483 !fnegFoldsIntoOp(HighBits.getNode()))
5484 return SDValue();
5485
5486 // f64 fneg only really needs to operate on the high half of of the
5487 // register, so try to force it to an f32 operation to help make use of
5488 // source modifiers.
5489 //
5490 //
5491 // fneg (f64 (bitcast (build_vector x, y))) ->
5492 // f64 (bitcast (build_vector (bitcast i32:x to f32),
5493 // (fneg (bitcast i32:y to f32)))
5494
5495 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
5496 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
5497 SDValue CastBack =
5498 DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);
5499
5501 Ops.back() = CastBack;
5502 DCI.AddToWorklist(NegHi.getNode());
5503 SDValue Build =
5504 DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
5505 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);
5506
5507 if (!N0.hasOneUse())
5508 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
5509 return Result;
5510 }
5511
5512 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
5513 BCSrc.hasOneUse()) {
5514 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
5515 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
5516
5517 // TODO: Cast back result for multiple uses is beneficial in some cases.
5518
5519 SDValue LHS =
5520 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
5521 SDValue RHS =
5522 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
5523
5524 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
5525 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
5526
5527 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
5528 NegRHS);
5529 }
5530
5531 return SDValue();
5532 }
5533 default:
5534 return SDValue();
5535 }
5536}
5537
5539 DAGCombinerInfo &DCI) const {
  // Target combine for ISD::FABS. The only case handled is an fabs whose
  // operand is a single-use FP16_TO_FP: the absolute value is then taken on
  // the integer source by clearing bit 15, ahead of the conversion. Returns an
  // empty SDValue for every other operand.
5540 SelectionDAG &DAG = DCI.DAG;
5541 SDValue N0 = N->getOperand(0);
5542
  // The operand is rewritten, so it must not have other users.
5543 if (!N0.hasOneUse())
5544 return SDValue();
5545
5546 switch (N0.getOpcode()) {
5547 case ISD::FP16_TO_FP: {
5548 assert(!isTypeLegal(MVT::f16) && "should only see if f16 is illegal");
5549 SDLoc SL(N);
5550 SDValue Src = N0.getOperand(0);
5551 EVT SrcVT = Src.getValueType();
5552
5553 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
5554 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
5555 DAG.getConstant(0x7fff, SL, SrcVT));
5556 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
5557 }
5558 default:
5559 return SDValue();
5560 }
5561}
5562
5564 DAGCombinerInfo &DCI) const {
  // Constant-folds a reciprocal: when operand 0 of N is a floating-point
  // constant, the node is replaced by the constant 1.0 / operand, computed in
  // the operand's own floating-point semantics. Non-constant operands are left
  // alone (empty SDValue).
5565 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
5566 if (!CFP)
5567 return SDValue();
5568
5569 // XXX - Should this flush denormals?
5570 const APFloat &Val = CFP->getValueAPF();
5571 APFloat One(Val.getSemantics(), "1.0");
5572 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
5573}
5574
5576 if (!Subtarget->isGCN())
5577 return false;
5578
5581 auto &ST = DAG.getSubtarget<GCNSubtarget>();
5582 const auto *TII = ST.getInstrInfo();
5583
5584 if (!ST.hasVMovB64Inst() || (!SDConstant && !SDFPConstant))
5585 return false;
5586
5587 if (ST.has64BitLiterals())
5588 return true;
5589
5590 if (SDConstant) {
5591 const APInt &APVal = SDConstant->getAPIntValue();
5592 return isUInt<32>(APVal.getZExtValue()) || TII->isInlineConstant(APVal);
5593 }
5594
5595 APInt Val = SDFPConstant->getValueAPF().bitcastToAPInt();
5596 return isUInt<32>(Val.getZExtValue()) || TII->isInlineConstant(Val);
5597}
5598
5600 DAGCombinerInfo &DCI) const {
5601 SelectionDAG &DAG = DCI.DAG;
5602 SDLoc DL(N);
5603
5604 switch(N->getOpcode()) {
5605 default:
5606 break;
5607 case ISD::BITCAST: {
5608 EVT DestVT = N->getValueType(0);
5609
5610 // Push casts through vector builds. This helps avoid emitting a large
5611 // number of copies when materializing floating point vector constants.
5612 //
5613 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
5614 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5615 if (DestVT.isVector()) {
5616 SDValue Src = N->getOperand(0);
5617 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5620 EVT SrcVT = Src.getValueType();
5621 unsigned NElts = DestVT.getVectorNumElements();
5622
5623 if (SrcVT.getVectorNumElements() == NElts) {
5624 EVT DestEltVT = DestVT.getVectorElementType();
5625
5626 SmallVector<SDValue, 8> CastedElts;
5627 SDLoc SL(N);
5628 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5629 SDValue Elt = Src.getOperand(I);
5630 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
5631 }
5632
5633 return DAG.getBuildVector(DestVT, SL, CastedElts);
5634 }
5635 }
5636 }
5637
5638 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5639 break;
5640
5641 // Fold bitcasts of constants.
5642 //
5643 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5644 // TODO: Generalize and move to DAGCombiner
5645 SDValue Src = N->getOperand(0);
5647 SDLoc SL(N);
5648 if (isInt64ImmLegal(C, DAG))
5649 break;
5650 uint64_t CVal = C->getZExtValue();
5651 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5652 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5653 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5654 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
5655 }
5656
5658 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5659 SDLoc SL(N);
5660 if (isInt64ImmLegal(C, DAG))
5661 break;
5662 uint64_t CVal = Val.getZExtValue();
5663 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5664 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5665 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5666
5667 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
5668 }
5669
5670 break;
5671 }
5672 case ISD::SHL:
5673 case ISD::SRA:
5674 case ISD::SRL: {
5675 // Range metadata can be invalidated when loads are converted to legal types
5676 // (e.g. v2i64 -> v4i32).
5677 // Try to convert vector shl/sra/srl before type legalization so that range
5678 // metadata can be utilized.
5679 if (!(N->getValueType(0).isVector() &&
5682 break;
5683 if (N->getOpcode() == ISD::SHL)
5684 return performShlCombine(N, DCI);
5685 if (N->getOpcode() == ISD::SRA)
5686 return performSraCombine(N, DCI);
5687 return performSrlCombine(N, DCI);
5688 }
5689 case ISD::TRUNCATE:
5690 return performTruncateCombine(N, DCI);
5691 case ISD::MUL:
5692 return performMulCombine(N, DCI);
5693 case AMDGPUISD::MUL_U24:
5694 case AMDGPUISD::MUL_I24: {
5695 if (SDValue Simplified = simplifyMul24(N, DCI))
5696 return Simplified;
5697 break;
5698 }
5699 case AMDGPUISD::MULHI_I24:
5700 case AMDGPUISD::MULHI_U24:
5701 return simplifyMul24(N, DCI);
5702 case ISD::SMUL_LOHI:
5703 case ISD::UMUL_LOHI:
5704 return performMulLoHiCombine(N, DCI);
5705 case ISD::MULHS:
5706 return performMulhsCombine(N, DCI);
5707 case ISD::MULHU:
5708 return performMulhuCombine(N, DCI);
5709 case ISD::SELECT:
5710 return performSelectCombine(N, DCI);
5711 case ISD::FNEG:
5712 return performFNegCombine(N, DCI);
5713 case ISD::FABS:
5714 return performFAbsCombine(N, DCI);
5715 case AMDGPUISD::BFE_I32:
5716 case AMDGPUISD::BFE_U32: {
5717 assert(!N->getValueType(0).isVector() &&
5718 "Vector handling of BFE not implemented");
5719 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
5720 if (!Width)
5721 break;
5722
5723 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5724 if (WidthVal == 0)
5725 return DAG.getConstant(0, DL, MVT::i32);
5726
5728 if (!Offset)
5729 break;
5730
5731 SDValue BitsFrom = N->getOperand(0);
5732 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5733
5734 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5735
5736 if (OffsetVal == 0) {
5737 // This is already sign / zero extended, so try to fold away extra BFEs.
5738 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
5739
5740 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
5741 if (OpSignBits >= SignBits)
5742 return BitsFrom;
5743
5744 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
5745 if (Signed) {
5746 // This is a sign_extend_inreg. Replace it to take advantage of existing
5747 // DAG Combines. If not eliminated, we will match back to BFE during
5748 // selection.
5749
5750 // TODO: The sext_inreg of extended types ends, although we could
5751 // handle them in a single BFE.
5752 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
5753 DAG.getValueType(SmallVT));
5754 }
5755
5756 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
5757 }
5758
5759 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
5760 if (Signed) {
5761 return constantFoldBFE<int32_t>(DAG,
5762 CVal->getSExtValue(),
5763 OffsetVal,
5764 WidthVal,
5765 DL);
5766 }
5767
5768 return constantFoldBFE<uint32_t>(DAG,
5769 CVal->getZExtValue(),
5770 OffsetVal,
5771 WidthVal,
5772 DL);
5773 }
5774
5775 if ((OffsetVal + WidthVal) >= 32 &&
5776 !(OffsetVal == 16 && WidthVal == 16 && Subtarget->hasSDWA())) {
5777 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
5778 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
5779 BitsFrom, ShiftVal);
5780 }
5781
5782 if (BitsFrom.hasOneUse()) {
5783 APInt Demanded = APInt::getBitsSet(32,
5784 OffsetVal,
5785 OffsetVal + WidthVal);
5786
5787 KnownBits Known;
5789 !DCI.isBeforeLegalizeOps());
5790 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5791 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
5792 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
5793 DCI.CommitTargetLoweringOpt(TLO);
5794 }
5795 }
5796
5797 break;
5798 }
5799 case ISD::LOAD:
5800 return performLoadCombine(N, DCI);
5801 case ISD::STORE:
5802 return performStoreCombine(N, DCI);
5803 case AMDGPUISD::RCP:
5804 case AMDGPUISD::RCP_IFLAG:
5805 return performRcpCombine(N, DCI);
5806 case ISD::AssertZext:
5807 case ISD::AssertSext:
5808 return performAssertSZExtCombine(N, DCI);
5810 return performIntrinsicWOChainCombine(N, DCI);
5811 case AMDGPUISD::FMAD_FTZ: {
5812 SDValue N0 = N->getOperand(0);
5813 SDValue N1 = N->getOperand(1);
5814 SDValue N2 = N->getOperand(2);
5815 EVT VT = N->getValueType(0);
5816
5817 // FMAD_FTZ is a FMAD + flush denormals to zero.
5818 // We flush the inputs, the intermediate step, and the output.
5822 if (N0CFP && N1CFP && N2CFP) {
5823 const auto FTZ = [](const APFloat &V) {
5824 if (V.isDenormal()) {
5825 APFloat Zero(V.getSemantics(), 0);
5826 return V.isNegative() ? -Zero : Zero;
5827 }
5828 return V;
5829 };
5830
5831 APFloat V0 = FTZ(N0CFP->getValueAPF());
5832 APFloat V1 = FTZ(N1CFP->getValueAPF());
5833 APFloat V2 = FTZ(N2CFP->getValueAPF());
5835 V0 = FTZ(V0);
5837 return DAG.getConstantFP(FTZ(V0), DL, VT);
5838 }
5839 break;
5840 }
5841 }
5842 return SDValue();
5843}
5844
5846 SDValue Op, const APInt &OriginalDemandedBits,
5847 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
5848 unsigned Depth) const {
5849 switch (Op.getOpcode()) {
5851 switch (Op.getConstantOperandVal(0)) {
5852 case Intrinsic::amdgcn_readfirstlane:
5853 case Intrinsic::amdgcn_readlane:
5854 case Intrinsic::amdgcn_set_inactive:
5855 case Intrinsic::amdgcn_wwm: {
5856 if (SimplifyDemandedBits(Op.getOperand(1), OriginalDemandedBits,
5857 OriginalDemandedElts, Known, TLO, Depth + 1))
5858 return true;
5859 break;
5860 }
5861 default:
5862 break;
5863 }
5864 break;
5865 }
5866 default:
5867 break;
5868 }
5869
5870 return false;
5871}
5872
5873//===----------------------------------------------------------------------===//
5874// Helper functions
5875//===----------------------------------------------------------------------===//
5876
5878 const TargetRegisterClass *RC,
5879 Register Reg, EVT VT,
5880 const SDLoc &SL,
5881 bool RawReg) const {
5883 MachineRegisterInfo &MRI = MF.getRegInfo();
5884 Register VReg;
5885
5886 if (!MRI.isLiveIn(Reg)) {
5887 VReg = MRI.createVirtualRegister(RC);
5888 MRI.addLiveIn(Reg, VReg);
5889 } else {
5890 VReg = MRI.getLiveInVirtReg(Reg);
5891 }
5892
5893 if (RawReg)
5894 return DAG.getRegister(VReg, VT);
5895
5896 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
5897}
5898
5899// This may be called multiple times, and nothing prevents creating multiple
5900// objects at the same offset. See if we already defined this object.
5902 int64_t Offset) {
5903 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5904 if (MFI.getObjectOffset(I) == Offset) {
5905 assert(MFI.getObjectSize(I) == Size);
5906 return I;
5907 }
5908 }
5909
5910 return MFI.CreateFixedObject(Size, Offset, true);
5911}
5912
5914 EVT VT,
5915 const SDLoc &SL,
5916 int64_t Offset) const {
5918 MachineFrameInfo &MFI = MF.getFrameInfo();
5919 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
5920
5921 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5922 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
5923
5924 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
5927}
5928
5930 const SDLoc &SL,
5931 SDValue Chain,
5932 SDValue ArgVal,
5933 int64_t Offset) const {
5937
5938 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
5939 // Stores to the argument stack area are relative to the stack pointer.
5940 SDValue SP =
5941 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
5942 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
5943 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
5945 return Store;
5946}
5947
5949 const TargetRegisterClass *RC,
5950 EVT VT, const SDLoc &SL,
5951 const ArgDescriptor &Arg) const {
5952 assert(Arg && "Attempting to load missing argument");
5953
5954 SDValue V = Arg.isRegister() ?
5955 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
5956 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
5957
5958 if (!Arg.isMasked())
5959 return V;
5960
5961 unsigned Mask = Arg.getMask();
5962 unsigned Shift = llvm::countr_zero<unsigned>(Mask);
5963 V = DAG.getNode(ISD::SRL, SL, VT, V,
5964 DAG.getShiftAmountConstant(Shift, VT, SL));
5965 return DAG.getNode(ISD::AND, SL, VT, V,
5966 DAG.getConstant(Mask >> Shift, SL, VT));
5967}
5968
5970 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5971 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5972 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5973 uint64_t ArgOffset =
5974 alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
5975 switch (Param) {
5976 case FIRST_IMPLICIT:
5977 return ArgOffset;
5978 case PRIVATE_BASE:
5980 case SHARED_BASE:
5981 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5982 case QUEUE_PTR:
5983 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5984 }
5985 llvm_unreachable("unexpected implicit parameter type");
5986}
5987
5994
5996 SelectionDAG &DAG, int Enabled,
5997 int &RefinementSteps,
5998 bool &UseOneConstNR,
5999 bool Reciprocal) const {
6000 EVT VT = Operand.getValueType();
6001
6002 if (VT == MVT::f32) {
6003 RefinementSteps = 0;
6004 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
6005 }
6006
6007 // TODO: There is also f64 rsq instruction, but the documentation is less
6008 // clear on its precision.
6009
6010 return SDValue();
6011}
6012
6014 SelectionDAG &DAG, int Enabled,
6015 int &RefinementSteps) const {
6016 EVT VT = Operand.getValueType();
6017
6018 if (VT == MVT::f32) {
6019 // Reciprocal, < 1 ulp error.
6020 //
6021 // This reciprocal approximation converges to < 0.5 ulp error with one
6022 // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
6023
6024 RefinementSteps = 0;
6025 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
6026 }
6027
6028 // TODO: There is also f64 rcp instruction, but the documentation is less
6029 // clear on its precision.
6030
6031 return SDValue();
6032}
6033
6034static unsigned workitemIntrinsicDim(unsigned ID) {
6035 switch (ID) {
6036 case Intrinsic::amdgcn_workitem_id_x:
6037 return 0;
6038 case Intrinsic::amdgcn_workitem_id_y:
6039 return 1;
6040 case Intrinsic::amdgcn_workitem_id_z:
6041 return 2;
6042 default:
6043 llvm_unreachable("not a workitem intrinsic");
6044 }
6045}
6046
6048 const SDValue Op, KnownBits &Known,
6049 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
6050
6051 Known.resetAll(); // Don't know anything.
6052
6053 unsigned Opc = Op.getOpcode();
6054
6055 switch (Opc) {
6056 default:
6057 break;
6058 case AMDGPUISD::CARRY:
6059 case AMDGPUISD::BORROW: {
6060 Known.Zero = APInt::getHighBitsSet(32, 31);
6061 break;
6062 }
6063
6064 case AMDGPUISD::BFE_I32:
6065 case AMDGPUISD::BFE_U32: {
6066 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6067 if (!CWidth)
6068 return;
6069
6070 uint32_t Width = CWidth->getZExtValue() & 0x1f;
6071
6072 if (Opc == AMDGPUISD::BFE_U32)
6073 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
6074
6075 break;
6076 }
6077 case AMDGPUISD::FP_TO_FP16: {
6078 unsigned BitWidth = Known.getBitWidth();
6079
6080 // High bits are zero.
6082 break;
6083 }
6084 case AMDGPUISD::MUL_U24:
6085 case AMDGPUISD::MUL_I24: {
6086 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
6087 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
6088 unsigned BitWidth = Op.getScalarValueSizeInBits();
6089
6090 // Sign/Zero extend from 24 bits.
6091 if (Opc == AMDGPUISD::MUL_I24) {
6092 LHSKnown = LHSKnown.trunc(24).sext(BitWidth);
6093 RHSKnown = RHSKnown.trunc(24).sext(BitWidth);
6094 } else {
6095 LHSKnown = LHSKnown.trunc(24).zext(BitWidth);
6096 RHSKnown = RHSKnown.trunc(24).zext(BitWidth);
6097 }
6098
6099 // TODO: SelfMultiply can be poison, but not undef.
6100 bool SelfMultiply = Op.getOperand(0) == Op.getOperand(1);
6101 if (SelfMultiply)
6102 SelfMultiply &= DAG.isGuaranteedNotToBeUndefOrPoison(
6103 Op.getOperand(0), DemandedElts, UndefPoisonKind::UndefOrPoison,
6104 Depth + 1);
6105
6106 Known = KnownBits::mul(LHSKnown, RHSKnown, SelfMultiply);
6107 break;
6108 }
6109 case AMDGPUISD::PERM: {
6110 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6111 if (!CMask)
6112 return;
6113
6114 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
6115 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
6116 unsigned Sel = CMask->getZExtValue();
6117
6118 for (unsigned I = 0; I < 32; I += 8) {
6119 unsigned SelBits = Sel & 0xff;
6120 if (SelBits < 4) {
6121 SelBits *= 8;
6122 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
6123 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
6124 } else if (SelBits < 7) {
6125 SelBits = (SelBits & 3) * 8;
6126 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
6127 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
6128 } else if (SelBits == 0x0c) {
6129 Known.Zero |= 0xFFull << I;
6130 } else if (SelBits > 0x0c) {
6131 Known.One |= 0xFFull << I;
6132 }
6133 Sel >>= 8;
6134 }
6135 break;
6136 }
6137 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
6138 Known.Zero.setHighBits(24);
6139 break;
6140 }
6141 case AMDGPUISD::BUFFER_LOAD_USHORT: {
6142 Known.Zero.setHighBits(16);
6143 break;
6144 }
6145 case AMDGPUISD::LDS: {
6146 auto *GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
6147 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
6148
6149 Known.Zero.setHighBits(16);
6150 Known.Zero.setLowBits(Log2(Alignment));
6151 break;
6152 }
6153 case AMDGPUISD::SMIN3:
6154 case AMDGPUISD::SMAX3:
6155 case AMDGPUISD::SMED3:
6156 case AMDGPUISD::UMIN3:
6157 case AMDGPUISD::UMAX3:
6158 case AMDGPUISD::UMED3: {
6159 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
6160 if (Known2.isUnknown())
6161 break;
6162
6163 KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
6164 if (Known1.isUnknown())
6165 break;
6166
6167 KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
6168 if (Known0.isUnknown())
6169 break;
6170
6171 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
6172 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
6173 Known.One = Known0.One & Known1.One & Known2.One;
6174 break;
6175 }
6177 unsigned IID = Op.getConstantOperandVal(0);
6178 switch (IID) {
6179 case Intrinsic::amdgcn_workitem_id_x:
6180 case Intrinsic::amdgcn_workitem_id_y:
6181 case Intrinsic::amdgcn_workitem_id_z: {
6182 unsigned MaxValue = Subtarget->getMaxWorkitemID(
6184 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
6185 break;
6186 }
6187 default:
6188 break;
6189 }
6190 }
6191 }
6192}
6193
6195 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
6196 unsigned Depth) const {
6197 switch (Op.getOpcode()) {
6198 case AMDGPUISD::BFE_I32: {
6199 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6200 if (!Width)
6201 return 1;
6202
6203 unsigned SignBits = 32 - (Width->getZExtValue() & 0x1f) + 1;
6204 if (!isNullConstant(Op.getOperand(1)))
6205 return SignBits;
6206
6207 // TODO: Could probably figure something out with non-0 offsets.
6208 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
6209 return std::max(SignBits, Op0SignBits);
6210 }
6211
6212 case AMDGPUISD::BFE_U32: {
6213 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6214 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
6215 }
6216
6217 case AMDGPUISD::CARRY:
6218 case AMDGPUISD::BORROW:
6219 return 31;
6220 case AMDGPUISD::BUFFER_LOAD_BYTE:
6221 return 25;
6222 case AMDGPUISD::BUFFER_LOAD_SHORT:
6223 return 17;
6224 case AMDGPUISD::BUFFER_LOAD_UBYTE:
6225 return 24;
6226 case AMDGPUISD::BUFFER_LOAD_USHORT:
6227 return 16;
6228 case AMDGPUISD::FP_TO_FP16:
6229 return 16;
6230 case AMDGPUISD::SMIN3:
6231 case AMDGPUISD::SMAX3:
6232 case AMDGPUISD::SMED3:
6233 case AMDGPUISD::UMIN3:
6234 case AMDGPUISD::UMAX3:
6235 case AMDGPUISD::UMED3: {
6236 unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
6237 if (Tmp2 == 1)
6238 return 1; // Early out.
6239
6240 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
6241 if (Tmp1 == 1)
6242 return 1; // Early out.
6243
6244 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
6245 if (Tmp0 == 1)
6246 return 1; // Early out.
6247
6248 return std::min({Tmp0, Tmp1, Tmp2});
6249 }
6250 default:
6251 return 1;
6252 }
6253}
6254
6256 GISelValueTracking &Analysis, Register R, const APInt &DemandedElts,
6257 const MachineRegisterInfo &MRI, unsigned Depth) const {
6258 const MachineInstr *MI = MRI.getVRegDef(R);
6259 if (!MI)
6260 return 1;
6261
6262 // TODO: Check range metadata on MMO.
6263 switch (MI->getOpcode()) {
6264 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
6265 return 25;
6266 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
6267 return 17;
6268 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
6269 return 24;
6270 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
6271 return 16;
6272 case AMDGPU::G_AMDGPU_SMED3:
6273 case AMDGPU::G_AMDGPU_UMED3: {
6274 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
6275 unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
6276 if (Tmp2 == 1)
6277 return 1;
6278 unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
6279 if (Tmp1 == 1)
6280 return 1;
6281 unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
6282 if (Tmp0 == 1)
6283 return 1;
6284 return std::min({Tmp0, Tmp1, Tmp2});
6285 }
6286 default:
6287 return 1;
6288 }
6289}
6290
6292 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
6293 UndefPoisonKind Kind, bool ConsiderFlags, unsigned Depth) const {
6294 unsigned Opcode = Op.getOpcode();
6295 switch (Opcode) {
6296 case AMDGPUISD::BFE_I32:
6297 case AMDGPUISD::BFE_U32:
6298 return false;
6299 }
6301 Op, DemandedElts, DAG, Kind, ConsiderFlags, Depth);
6302}
6303
6305 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN,
6306 unsigned Depth) const {
6307 unsigned Opcode = Op.getOpcode();
6308 switch (Opcode) {
6309 case AMDGPUISD::FMIN_LEGACY:
6310 case AMDGPUISD::FMAX_LEGACY: {
6311 if (SNaN)
6312 return true;
6313
6314 // TODO: Can check no nans on one of the operands for each one, but which
6315 // one?
6316 return false;
6317 }
6318 case AMDGPUISD::FMUL_LEGACY:
6319 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
6320 if (SNaN)
6321 return true;
6322 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
6323 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6324 }
6325 case AMDGPUISD::FMED3:
6326 case AMDGPUISD::FMIN3:
6327 case AMDGPUISD::FMAX3:
6328 case AMDGPUISD::FMINIMUM3:
6329 case AMDGPUISD::FMAXIMUM3:
6330 case AMDGPUISD::FMAD_FTZ: {
6331 if (SNaN)
6332 return true;
6333 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
6334 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6335 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6336 }
6337 case AMDGPUISD::CVT_F32_UBYTE0:
6338 case AMDGPUISD::CVT_F32_UBYTE1:
6339 case AMDGPUISD::CVT_F32_UBYTE2:
6340 case AMDGPUISD::CVT_F32_UBYTE3:
6341 return true;
6342
6343 case AMDGPUISD::RCP:
6344 case AMDGPUISD::RSQ:
6345 case AMDGPUISD::RCP_LEGACY:
6346 case AMDGPUISD::RSQ_CLAMP: {
6347 if (SNaN)
6348 return true;
6349
6350 // TODO: Need is known positive check.
6351 return false;
6352 }
6353 case ISD::FLDEXP:
6354 case AMDGPUISD::FRACT: {
6355 if (SNaN)
6356 return true;
6357 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
6358 }
6359 case AMDGPUISD::DIV_SCALE:
6360 case AMDGPUISD::DIV_FMAS:
6361 case AMDGPUISD::DIV_FIXUP:
6362 // TODO: Refine on operands.
6363 return SNaN;
6364 case AMDGPUISD::SIN_HW:
6365 case AMDGPUISD::COS_HW: {
6366 // TODO: Need check for infinity
6367 return SNaN;
6368 }
6370 unsigned IntrinsicID = Op.getConstantOperandVal(0);
6371 // TODO: Handle more intrinsics
6372 switch (IntrinsicID) {
6373 case Intrinsic::amdgcn_cubeid:
6374 case Intrinsic::amdgcn_cvt_off_f32_i4:
6375 return true;
6376
6377 case Intrinsic::amdgcn_frexp_mant: {
6378 if (SNaN)
6379 return true;
6380 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6381 }
6382 case Intrinsic::amdgcn_cvt_pkrtz: {
6383 if (SNaN)
6384 return true;
6385 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6386 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6387 }
6388 case Intrinsic::amdgcn_rcp:
6389 case Intrinsic::amdgcn_rsq:
6390 case Intrinsic::amdgcn_rcp_legacy:
6391 case Intrinsic::amdgcn_rsq_legacy:
6392 case Intrinsic::amdgcn_rsq_clamp:
6393 case Intrinsic::amdgcn_tanh: {
6394 if (SNaN)
6395 return true;
6396
6397 // TODO: Need is known positive check.
6398 return false;
6399 }
6400 case Intrinsic::amdgcn_trig_preop:
6401 case Intrinsic::amdgcn_fdot2:
6402 // TODO: Refine on operand
6403 return SNaN;
6404 case Intrinsic::amdgcn_fma_legacy:
6405 if (SNaN)
6406 return true;
6407 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6408 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
6409 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
6410 default:
6411 return false;
6412 }
6413 }
6414 default:
6415 return false;
6416 }
6417}
6418
6420 Register N0, Register N1) const {
6421 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
6422}
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static LLVM_READONLY bool hasSourceMods(const MachineInstr &MI)
static bool isInv2Pi(const APFloat &APF)
static LLVM_READONLY bool opMustUseVOP3Encoding(const MachineInstr &MI, const MachineRegisterInfo &MRI)
returns true if the operation will definitely need to use a 64-bit encoding, and thus will use a VOP3...
static unsigned inverseMinMax(unsigned Opc)
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static unsigned workitemIntrinsicDim(unsigned ID)
static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, int64_t Offset)
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, uint32_t Width, const SDLoc &DL)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static SDValue getAddOneOp(const SDNode *V)
If V is an add of a constant 1, returns the other operand.
static LLVM_READONLY bool selectSupportsSourceMods(const SDNode *N)
Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the type for ISD::SELECT.
static cl::opt< bool > AMDGPUBypassSlowDiv("amdgpu-bypass-slow-div", cl::desc("Skip 64-bit divide for dynamic 32-bit values"), cl::init(true))
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, SDValue N0, SDValue N1, unsigned Size, bool Signed)
static bool fnegFoldsIntoOp(const SDNode *N)
static bool isI24(SDValue Op, SelectionDAG &DAG)
static bool isCttzOpc(unsigned Opc)
static bool isU24(SDValue Op, SelectionDAG &DAG)
static SDValue peekFPSignOps(SDValue Val)
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, unsigned Op, const SDLoc &SL, SDValue Cond, SDValue N1, SDValue N2)
static SDValue peekFNeg(SDValue Val)
static SDValue simplifyMul24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI)
static bool isCtlzOpc(unsigned Opc)
static LLVM_READNONE bool fnegFoldsIntoOpcode(unsigned Opc)
static bool hasVolatileUser(SDNode *Val)
Interface definition of the TargetLowering class that is common to all AMD GPUs.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_READNONE
Definition Compiler.h:317
#define LLVM_READONLY
Definition Compiler.h:324
Provides analysis for querying information about KnownBits during GISel passes.
const HexagonInstrInfo * TII
static MaybeAlign getAlign(Value *Ptr)
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
#define T
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition SHA256.cpp:34
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
Value * RHS
Value * LHS
BinaryOperator * Mul
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
void recordNumNamedBarriers(uint32_t GVAddr, unsigned BarCnt)
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
Generate Min/Max node.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override
Return the type that should be used to zero or sign extend a zeroext/signext integer return value.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem, unsigned AS) const override
Return true if it is expected to be cheaper to do a store of vector constant with the given size and ...
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool shouldCombineMemoryType(EVT VT) const
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isTruncateFree(EVT Src, EVT Dest) const override
bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override
SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const
TargetLowering::NegatibleCost getConstantNegateCost(const ConstantFPSDNode *C) const
SDValue LowerFLOGUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, bool IsLog10, SDNodeFlags Flags) const
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXPUnsafeImpl(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags, bool IsExp10) const
bool isSDNodeAlwaysUniform(const SDNode *N) const override
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
SDValue LowerFLOG10(SDValue Op, SelectionDAG &DAG) const
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const
unsigned computeNumSignBitsForTargetInstr(GISelValueTracking &Analysis, Register R, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isConstantCheaperToNegate(SDValue N) const
bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, Register N1) const override
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue lowerFEXPF64(SDValue Op, SelectionDAG &DAG) const
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
bool isConstantCostlierToNegate(SDValue N) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const
SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
Emit approx-funcs appropriate lowering for exp10.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtType, EVT ExtVT, std::optional< unsigned > ByteOffset) const override
Return true if it is profitable to reduce a load to a smaller type.
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, UndefPoisonKind Kind, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSelectSupported(SelectSupportKind) const override
bool isZExtFree(Type *Src, Type *Dest) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const
SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool mayIgnoreSignedZero(SDValue Op) const
SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const final
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
std::pair< SDValue, SDValue > splitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HighVT, SelectionDAG &DAG) const
Split a vector value into two parts of types LoVT and HiVT.
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
SDValue LowerFLOGCommon(SDValue Op, SelectionDAG &DAG) const
SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, SDValue N) const
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const
bool isFAbsFree(EVT VT) const override
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
bool isInt64ImmLegal(SDNode *Val, SelectionDAG &DAG) const
Check whether value Val can be supported by v_mov_b64, for the current target.
SDValue loadStackInputValue(SelectionDAG &DAG, EVT VT, const SDLoc &SL, int64_t Offset) const
Similar to CreateLiveInRegister, except value maybe loaded from a stack slot rather than passed in a ...
SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue LowerCTLS(SDValue Op, SelectionDAG &DAG) const
Split a vector store into multiple scalar stores.
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
static SDValue stripBitcast(SDValue Val)
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override
Return a reciprocal estimate value for the input operand.
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
std::pair< SDValue, SDValue > getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, SDValue Op, SDNodeFlags Flags) const
If denormal handling is required, return the scaled input to FLOG2 and the check for denormal range.
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &OriginalDemandedBits, const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue lowerCTLZResults(SDValue Op, SelectionDAG &DAG) const
SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const
SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const
SDValue performMulLoHiCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results) const
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
std::pair< EVT, EVT > getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const
Split a vector type into two parts.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue combineFMinMaxLegacyImpl(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
unsigned getVectorIdxWidth(const DataLayout &) const override
Returns the type to be used for the index operand of vector operations.
static const fltSemantics & IEEEsingle()
Definition APFloat.h:297
static const fltSemantics & IEEEdouble()
Definition APFloat.h:298
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:345
static const fltSemantics & IEEEhalf()
Definition APFloat.h:295
bool bitwiseIsEqual(const APFloat &RHS) const
Definition APFloat.h:1509
opStatus add(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1246
const fltSemantics & getSemantics() const
Definition APFloat.h:1552
opStatus multiply(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1264
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1223
APInt bitcastToAPInt() const
Definition APFloat.h:1436
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1163
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1414
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition APInt.h:207
bool ugt(const APInt &RHS) const
Unsigned greater than comparison.
Definition APInt.h:1189
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:210
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:220
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition APInt.h:1157
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition APInt.h:1411
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
const BlockAddress * getBlockAddress() const
CCState - This class holds information needed while lowering arguments and return values.
static CCValAssign getCustomMem(unsigned ValNo, MVT ValVT, int64_t Offset, MVT LocVT, LocInfo HTP)
const APFloat & getValueAPF() const
bool isNegative() const
Return true if the value is negative.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:64
Diagnostic information for unsupported feature in backend.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:358
iterator_range< arg_iterator > args()
Definition Function.h:892
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:354
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
static auto integer_valuetypes()
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
int getObjectIndexBegin() const
Return the minimum frame object index.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOInvariant
The memory access always returns the same value (or traps).
Flags getFlags() const
Return the raw flags of the source value.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLVM_ABI bool isLiveIn(Register Reg) const
LLVM_ABI Register getLiveInVirtReg(MCRegister PReg) const
getLiveInVirtReg - If PReg is a live-in physical register, return the corresponding live-in virtual r...
void addLiveIn(MCRegister Reg, Register vreg=Register())
addLiveIn - Add the specified register as a live-in.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
ArrayRef< SDUse > ops() const
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
iterator_range< user_iterator > users()
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
const TargetSubtargetInfo & getSubtarget() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false, SDNodeFlags Flags={})
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
bool isConstantValueOfAnyType(SDValue N) const
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isGuaranteedNotToBeUndefOrPoison(SDValue Op, UndefPoisonKind Kind=UndefPoisonKind::UndefOrPoison, unsigned Depth=0) const
Return true if this function can prove that Op is never poison and, Kind can be used to track poison ...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset=std::nullopt) const
Return true if it is profitable to reduce a load to a smaller type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp to/from int conversion the backend supports.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
SelectSupportKind
Enum that describes what type of support for selects the target has.
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL) const
Returns the type for the shift amount of a shift opcode.
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setSupportsUnalignedAtomics(bool UnalignedSupported)
Sets whether unaligned atomic operations are supported.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
unsigned GatherAllAliasesMaxDepth
Depth that GatherAllAliases should continue looking for chain dependencies when trying to find a more...
NegatibleCost
Enum that specifies when a float negation is beneficial.
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, UndefPoisonKind Kind, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
TargetSubtargetInfo - Generic base class for all target subtargets.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
LLVM Value Representation.
Definition Value.h:75
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:319
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
bool isIntrinsicAlwaysUniform(unsigned IntrID)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isUniformMMO(const MachineMemOperand *MMO)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition CallingConv.h:47
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:827
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:600
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:787
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:294
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:522
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:861
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:888
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:584
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ CTLZ_ZERO_POISON
Definition ISDOpcodes.h:796
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:852
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BRIND
BRIND - Indirect branch.
@ BR_JT
BR_JT - Jumptable branch.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:541
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:548
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:804
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:247
@ CTLS
Count leading redundant sign bits.
Definition ISDOpcodes.h:800
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:704
@ STRICT_FP16_TO_FP
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:769
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:614
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition ISDOpcodes.h:48
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:858
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:819
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values, following IEEE-754 definition...
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:896
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:986
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:813
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:934
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:304
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:565
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ CTTZ_ZERO_POISON
Bit counting operators with a poisoned result for zero inputs.
Definition ISDOpcodes.h:795
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:967
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:953
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:864
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:534
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is the same as FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
initializer< Ty > init(const Ty &Val)
constexpr double ln2
constexpr double ln10
constexpr float log2ef
Definition MathExtras.h:51
constexpr double log2e
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:558
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
InstructionCost Cost
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
LLVM_ABI void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:204
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:263
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
CombineLevel
Definition DAGCombine.h:15
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ BeforeLegalizeTypes
Definition DAGCombine.h:16
@ AfterLegalizeTypes
Definition DAGCombine.h:17
To bit_cast(const From &from) noexcept
Definition bit.h:90
@ Mul
Product of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
UndefPoisonKind
Enumeration to track whether we are interested in Undef, Poison, or both.
Definition UndefPoison.h:20
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
static cl::opt< unsigned > CostThreshold("dfa-cost-threshold", cl::desc("Maximum cost accepted for the transformation"), cl::Hidden, cl::init(50))
APFloat neg(APFloat X)
Returns the negated value of the argument.
Definition APFloat.h:1672
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:177
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
MCRegister getRegister() const
unsigned getStackOffset() const
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:418
EVT getPow2VectorType(LLVMContext &Context) const
Widens the length of the given vector EVT up to the nearest power of 2 and returns that type.
Definition ValueTypes.h:508
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:70
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:129
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:307
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:155
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:494
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:396
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:266
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:408
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition ValueTypes.h:453
bool isPow2VectorType() const
Returns true if the given vector type has a power-of-2 number of elements.
Definition ValueTypes.h:501
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:435
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:339
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:61
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:404
EVT getRoundIntegerType(LLVMContext &Context) const
Rounds the bit-width of the given integer EVT up to the nearest power of two (and at least to eight),...
Definition ValueTypes.h:442
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:176
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:346
bool bitsGE(EVT VT) const
Return true if this has no fewer bits than VT.
Definition ValueTypes.h:315
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:351
bool isExtended() const
Test if the given EVT is extended (as opposed to being simple).
Definition ValueTypes.h:150
EVT changeElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:121
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:359
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition ValueTypes.h:331
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:160
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:64
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:165
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:176
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:72
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:310
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:184
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:262
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:146
APInt getMinValue() const
Return the minimal unsigned value possible given these KnownBits.
Definition KnownBits.h:130
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:103
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
These are IR-level optimization flags that may be propagated to SDNodes.
void setAllowContract(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...