1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
30
31using namespace llvm;
32
33#include "AMDGPUGenCallingConv.inc"
34
36 "amdgpu-bypass-slow-div",
37 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
38 cl::init(true));
39
40// Find a larger type to do a load / store of a vector with.
42 unsigned StoreSize = VT.getStoreSizeInBits();
43 if (StoreSize <= 32)
44 return EVT::getIntegerVT(Ctx, StoreSize);
45
46 if (StoreSize % 32 == 0)
47 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
48
49 return VT;
50}
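// Illustrative examples of the mapping above: a 64-bit value such as f64 or
// v2f32 becomes v2i32, a 128-bit value such as v4f32 becomes v4i32, and
// anything of 32 or fewer stored bits (e.g. v2i8) remains a single integer of
// that width.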
51
55
57 // In order for this to be a signed 24-bit value, bit 23 must be a sign
58 // bit.
59 return DAG.ComputeMaxSignificantBits(Op);
60}
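// For example, a value produced by sign-extending from i16 has at most 16
// significant bits, so callers can safely treat it as a signed 24-bit operand
// (e.g. for the 24-bit multiply paths).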
61
63 const TargetSubtargetInfo &STI,
64 const AMDGPUSubtarget &AMDGPUSTI)
65 : TargetLowering(TM, STI), Subtarget(&AMDGPUSTI) {
66 // Always lower memset, memcpy, and memmove intrinsics to load/store
67 // instructions, rather than generating calls to memset, memcpy, or memmove.
71
72 // Enable ganging up loads and stores in the memcpy DAG lowering.
74
75 // Lower floating point store/load to integer store/load to reduce the number
76 // of patterns in tablegen.
78 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
79
81 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
82
84 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
85
87 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
88
90 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
91
93 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
94
96 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
97
99 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
100
102 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
103
104 setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
105 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
106
107 setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
108 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
109
110 setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
111 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
112
113 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
114 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
115
116 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
117 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
118
120 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
121
123 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
124
126 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
127
129 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
130
132 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
133
135 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
136
138 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
139
141 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
142
144 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
145
147 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
148
149 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
150 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
151
152 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
153 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
154
156 AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
157
158 // TODO: Would be better to consume as directly legal
160 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
161
163 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
164
166 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
167
169 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);
170
172 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);
173
175 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);
176
178 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);
179
181 AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);
182
183 // There are no 64-bit extloads. These should be done as a 32-bit extload and
184 // an extension to 64-bit.
185 for (MVT VT : MVT::integer_valuetypes())
187 Expand);
188
189 for (MVT VT : MVT::integer_valuetypes()) {
190 if (VT == MVT::i64)
191 continue;
192
193 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
194 setLoadExtAction(Op, VT, MVT::i1, Promote);
195 setLoadExtAction(Op, VT, MVT::i8, Legal);
196 setLoadExtAction(Op, VT, MVT::i16, Legal);
197 setLoadExtAction(Op, VT, MVT::i32, Expand);
198 }
199 }
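  // Net effect of the loop above, illustrated for an i32 result type:
  // extending loads from i8 and i16 remain legal, loads from i1 are promoted,
  // and an "extending" load from i32 is expanded; i64 results are skipped
  // since 64-bit extloads were already expanded above.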
200
202 for (auto MemVT :
203 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
205 Expand);
206
207 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
208 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
209 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
210 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
211 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
212 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
213 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
214 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
215 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
216 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
217 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
218 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
219 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
220 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);
221
222 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
223 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
224 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
225 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
226 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
227 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
228
229 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
230 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
231 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
232 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
233 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
234 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
235 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
236 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
237 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
238 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
239 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
240 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);
241
243 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
244
246 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
247
249 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
250
252 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
253
255 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
256
258 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
259
261 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
262
264 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
265
267 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
268
270 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
271
273 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
274
276 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
277
279 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
280
282 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
283
285 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
286
288 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
289
291 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
292
294 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
295
297 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
298
300 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
301
303 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
304
306 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
307
309 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
310
312 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
313
315 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
316
318 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
319
321 AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);
322
323 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
324 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
325 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
326 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
327
328 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
329 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
330 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
331 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
332
333 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
334 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
335 setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
336 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
337 setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
338 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
339 setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
340 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
341 setTruncStoreAction(MVT::v6f32, MVT::v6f16, Expand);
342 setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
343 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
344 setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
345 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
346 setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
347 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
348
349 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
350 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
351 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
352
353 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
354 setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
355 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
356
357 setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);
358
359 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
360 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
361 setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
362 setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
363 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
364 setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
365 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
366
367 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
368 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
369 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
370 setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
371 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
372
373 setTruncStoreAction(MVT::v5i32, MVT::v5i1, Expand);
374 setTruncStoreAction(MVT::v5i32, MVT::v5i8, Expand);
375 setTruncStoreAction(MVT::v5i32, MVT::v5i16, Expand);
376
377 setTruncStoreAction(MVT::v6i32, MVT::v6i1, Expand);
378 setTruncStoreAction(MVT::v6i32, MVT::v6i8, Expand);
379 setTruncStoreAction(MVT::v6i32, MVT::v6i16, Expand);
380
381 setTruncStoreAction(MVT::v7i32, MVT::v7i1, Expand);
382 setTruncStoreAction(MVT::v7i32, MVT::v7i8, Expand);
383 setTruncStoreAction(MVT::v7i32, MVT::v7i16, Expand);
384
385 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
386 setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
387 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
388
389 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
390 setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
391 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
392 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
393 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
394 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
395 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
396
397 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
398 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
399
401
402 // For R600, this is totally unsupported; just custom lower to produce an
403 // error.
405
406 // Library functions. These default to Expand, but we have instructions
407 // for them.
410 {MVT::f16, MVT::f32}, Legal);
412
414 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
416 {MVT::f16, MVT::f32, MVT::f64}, Expand);
417
420 Custom);
421
422 setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
423
424 setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
425
426 setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
427 Expand);
428
429 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Expand);
430 setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
432
434 Custom);
435
436 setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
437
438 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
439 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
440 // default unless marked custom/legal.
442 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
443 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
444 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
445 MVT::v16f64},
446 Custom);
447
448 // Expand to fneg + fadd.
450
452 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
453 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
454 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
455 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
456 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
457 Custom);
458
461 {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
462 MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
463 MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
464 MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
465 MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
466 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
467 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
468 Custom);
469
471 Expand);
472 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
473
474 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
475 for (MVT VT : ScalarIntVTs) {
476 // These should use [SU]DIVREM, so set them to expand
478 Expand);
479
480 // GPU does not have divrem function for signed or unsigned.
482
483 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
485
487
488 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
490 }
491
492 // The hardware supports 32-bit FSHR, but not FSHL.
494
495 setOperationAction({ISD::ROTL, ISD::ROTR}, {MVT::i32, MVT::i64}, Expand);
496
498
502 MVT::i64, Custom);
504
506 Legal);
507
510 MVT::i64, Custom);
511
512 for (auto VT : {MVT::i8, MVT::i16})
514
515 static const MVT::SimpleValueType VectorIntTypes[] = {
516 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
517 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
518
519 for (MVT VT : VectorIntTypes) {
520 // Expand the following operations for the current type by default.
533 VT, Expand);
534 }
535
536 static const MVT::SimpleValueType FloatVectorTypes[] = {
537 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
538 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
539
540 for (MVT VT : FloatVectorTypes) {
553 VT, Expand);
554 }
555
556 // This causes an unrolled select operation to be used rather than expansion
557 // with bit operations. This is in general better, but the alternative using
558 // BFI instructions may be better if the select sources are SGPRs.
560 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
561
563 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
564
566 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
567
569 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
570
572 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
573
575 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
576
578 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
579
581 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
582
584 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
585
587 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
588
590 setJumpIsExpensive(true);
591
594
596
597 // We want to find all load dependencies for long chains of stores to enable
598 // merging into very wide vectors. The problem is with vectors with > 4
599 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
600 // vectors are a legal type, even though we have to split the loads
601 // usually. When we can more precisely specify load legality per address
602 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
603 // smarter so that they can figure out what to do in 2 iterations without all
604 // N > 4 stores on the same chain.
606
607 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
608 // about these during lowering.
609 MaxStoresPerMemcpy = 0xffffffff;
610 MaxStoresPerMemmove = 0xffffffff;
611 MaxStoresPerMemset = 0xffffffff;
612
613 // The expansion for 64-bit division is enormous.
615 addBypassSlowDiv(64, 32);
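  // With addBypassSlowDiv(64, 32), a 64-bit integer divide whose operands
  // dynamically fit in 32 bits is rewritten (during CodeGenPrepare) into a
  // guarded 32-bit divide; the "amdgpu-bypass-slow-div" option declared above
  // toggles this.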
616
627
631}
632
634 if (getTargetMachine().Options.NoSignedZerosFPMath)
635 return true;
636
637 const auto Flags = Op.getNode()->getFlags();
638 if (Flags.hasNoSignedZeros())
639 return true;
640
641 return false;
642}
643
644//===----------------------------------------------------------------------===//
645// Target Information
646//===----------------------------------------------------------------------===//
647
649static bool fnegFoldsIntoOpcode(unsigned Opc) {
650 switch (Opc) {
651 case ISD::FADD:
652 case ISD::FSUB:
653 case ISD::FMUL:
654 case ISD::FMA:
655 case ISD::FMAD:
656 case ISD::FMINNUM:
657 case ISD::FMAXNUM:
660 case ISD::FMINIMUM:
661 case ISD::FMAXIMUM:
662 case ISD::FMINIMUMNUM:
663 case ISD::FMAXIMUMNUM:
664 case ISD::SELECT:
665 case ISD::FSIN:
666 case ISD::FTRUNC:
667 case ISD::FRINT:
668 case ISD::FNEARBYINT:
669 case ISD::FROUNDEVEN:
671 case AMDGPUISD::RCP:
672 case AMDGPUISD::RCP_LEGACY:
673 case AMDGPUISD::RCP_IFLAG:
674 case AMDGPUISD::SIN_HW:
675 case AMDGPUISD::FMUL_LEGACY:
676 case AMDGPUISD::FMIN_LEGACY:
677 case AMDGPUISD::FMAX_LEGACY:
678 case AMDGPUISD::FMED3:
679 // TODO: handle llvm.amdgcn.fma.legacy
680 return true;
681 case ISD::BITCAST:
682 llvm_unreachable("bitcast is special cased");
683 default:
684 return false;
685 }
686}
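// Illustrative example: for an opcode in the list above, (fneg (fmul a, b))
// can be absorbed by negating a source modifier, giving (fmul (fneg a), b),
// so no separate negate instruction is needed.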
687
688static bool fnegFoldsIntoOp(const SDNode *N) {
689 unsigned Opc = N->getOpcode();
690 if (Opc == ISD::BITCAST) {
691 // TODO: Is there a benefit to checking the conditions performFNegCombine
692 // does? We don't for the other cases.
693 SDValue BCSrc = N->getOperand(0);
694 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
695 return BCSrc.getNumOperands() == 2 &&
696 BCSrc.getOperand(1).getValueSizeInBits() == 32;
697 }
698
699 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
700 }
701
702 return fnegFoldsIntoOpcode(Opc);
703}
704
705/// \returns true if the operation will definitely need to use a 64-bit
706/// encoding, and thus will use a VOP3 encoding regardless of the source
707/// modifiers.
709static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
710 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
711 VT == MVT::f64;
712}
713
714/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for
715/// the given type when used for ISD::SELECT.
717static bool selectSupportsSourceMods(const SDNode *N) {
718 // TODO: Only applies if select will be vector
719 return N->getValueType(0) == MVT::f32;
720}
721
722// Most FP instructions support source modifiers, but this could be refined
723// slightly.
725static bool hasSourceMods(const SDNode *N) {
726 if (isa<MemSDNode>(N))
727 return false;
728
729 switch (N->getOpcode()) {
730 case ISD::CopyToReg:
731 case ISD::FDIV:
732 case ISD::FREM:
733 case ISD::INLINEASM:
735 case AMDGPUISD::DIV_SCALE:
737
738 // TODO: Should really be looking at the users of the bitcast. These are
739 // problematic because bitcasts are used to legalize all stores to integer
740 // types.
741 case ISD::BITCAST:
742 return false;
744 switch (N->getConstantOperandVal(0)) {
745 case Intrinsic::amdgcn_interp_p1:
746 case Intrinsic::amdgcn_interp_p2:
747 case Intrinsic::amdgcn_interp_mov:
748 case Intrinsic::amdgcn_interp_p1_f16:
749 case Intrinsic::amdgcn_interp_p2_f16:
750 return false;
751 default:
752 return true;
753 }
754 }
755 case ISD::SELECT:
757 default:
758 return true;
759 }
760}
761
763 unsigned CostThreshold) {
764 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
765 // it is truly free to use a source modifier in all cases. If there are
766 // multiple users for which folding would force a VOP3 encoding, there will be
767 // a code size increase. Try to avoid increasing code size unless we know it
768 // will save on the instruction count.
769 unsigned NumMayIncreaseSize = 0;
770 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
771
772 assert(!N->use_empty());
773
774 // XXX - Should this limit number of uses to check?
775 for (const SDNode *U : N->users()) {
776 if (!hasSourceMods(U))
777 return false;
778
779 if (!opMustUseVOP3Encoding(U, VT)) {
780 if (++NumMayIncreaseSize > CostThreshold)
781 return false;
782 }
783 }
784
785 return true;
786}
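// Example of the trade-off above: if a negated value feeds both an FMA (which
// must use a VOP3 encoding anyway) and a plain FADD that could otherwise use
// a shorter encoding, folding the fneg into the FADD's source modifier forces
// the FADD into VOP3; CostThreshold bounds how many such users are tolerated.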
787
789 ISD::NodeType ExtendKind) const {
790 assert(!VT.isVector() && "only scalar expected");
791
792 // Round to the next multiple of 32-bits.
793 unsigned Size = VT.getSizeInBits();
794 if (Size <= 32)
795 return MVT::i32;
796 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
797}
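// Examples of the rounding above: an i8 or i24 return value is extended in an
// i32, while an i33 or i48 value rounds up to i64.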
798
800 return 32;
801}
802
804 return true;
805}
806
807// The backend supports 32 and 64 bit floating point immediates.
808// FIXME: Why are we reporting vectors of FP immediates as legal?
810 bool ForCodeSize) const {
811 return isTypeLegal(VT.getScalarType());
812}
813
814// We don't want to shrink f64 / f32 constants.
816 EVT ScalarVT = VT.getScalarType();
817 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
818}
819
821 SDNode *N, ISD::LoadExtType ExtTy, EVT NewVT,
822 std::optional<unsigned> ByteOffset) const {
823 // TODO: This may be worth removing. Check regression tests for diffs.
824 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT, ByteOffset))
825 return false;
826
827 unsigned NewSize = NewVT.getStoreSizeInBits();
828
829 // If we are reducing to a 32-bit load or a smaller multi-dword load,
830 // this is always better.
831 if (NewSize >= 32)
832 return true;
833
834 EVT OldVT = N->getValueType(0);
835 unsigned OldSize = OldVT.getStoreSizeInBits();
836
838 unsigned AS = MN->getAddressSpace();
839 // Do not shrink an aligned scalar load to sub-dword.
840 // Scalar engine cannot do sub-dword loads.
841 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
842 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
846 MN->isInvariant())) &&
848 return false;
849
850 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
851 // extloads, so doing one requires using a buffer_load. In cases where we
852 // still couldn't use a scalar load, using the wider load shouldn't really
853 // hurt anything.
854
855 // If the old size already had to be an extload, there's no harm in continuing
856 // to reduce the width.
857 return (OldSize < 32);
858}
859
861 const SelectionDAG &DAG,
862 const MachineMemOperand &MMO) const {
863
864 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
865
866 if (LoadTy.getScalarType() == MVT::i32)
867 return false;
868
869 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
870 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
871
872 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
873 return false;
874
875 unsigned Fast = 0;
877 CastTy, MMO, &Fast) &&
878 Fast;
879}
880
881// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
882// profitable with the expansion for 64-bit since it's generally good to
883// speculate things.
885 return true;
886}
887
889 return true;
890}
891
893 switch (N->getOpcode()) {
894 case ISD::EntryToken:
895 case ISD::TokenFactor:
896 return true;
898 unsigned IntrID = N->getConstantOperandVal(0);
900 }
902 unsigned IntrID = N->getConstantOperandVal(1);
904 }
905 case ISD::LOAD:
906 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
908 return true;
909 return false;
910 case AMDGPUISD::SETCC: // ballot-style instruction
911 return true;
912 }
913 return false;
914}
915
917 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
918 NegatibleCost &Cost, unsigned Depth) const {
919
920 switch (Op.getOpcode()) {
921 case ISD::FMA:
922 case ISD::FMAD: {
923 // Negating a fma is not free if it has users without source mods.
924 if (!allUsesHaveSourceMods(Op.getNode()))
925 return SDValue();
926 break;
927 }
928 case AMDGPUISD::RCP: {
929 SDValue Src = Op.getOperand(0);
930 EVT VT = Op.getValueType();
931 SDLoc SL(Op);
932
933 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
934 ForCodeSize, Cost, Depth + 1);
935 if (NegSrc)
936 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
937 return SDValue();
938 }
939 default:
940 break;
941 }
942
943 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
944 ForCodeSize, Cost, Depth);
945}
946
947//===---------------------------------------------------------------------===//
948// Target Properties
949//===---------------------------------------------------------------------===//
950
953
954 // Packed operations do not have a fabs modifier.
955 // Report this based on the end legalized type.
956 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
957}
958
961 // Report this based on the end legalized type.
962 VT = VT.getScalarType();
963 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
964}
965
967 unsigned NumElem,
968 unsigned AS) const {
969 return true;
970}
971
973 // There are few operations which truly have vector input operands. Any vector
974 // operation is going to involve operations on each component, and a
975 // build_vector will be a copy per element, so it always makes sense to use a
976 // build_vector input in place of the extracted element to avoid a copy into a
977 // super register.
978 //
979 // We should probably only do this if all users are extracts only, but this
980 // should be the common case.
981 return true;
982}
983
985 // Truncate is just accessing a subregister.
986
987 unsigned SrcSize = Source.getSizeInBits();
988 unsigned DestSize = Dest.getSizeInBits();
989
990 return DestSize < SrcSize && DestSize % 32 == 0;
991}
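// For example, truncating i64 to i32 just accesses the low 32-bit subregister
// and is reported as free; a truncation whose destination is not a multiple
// of 32 bits wide is not.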
992
994 // Truncate is just accessing a subregister.
995
996 unsigned SrcSize = Source->getScalarSizeInBits();
997 unsigned DestSize = Dest->getScalarSizeInBits();
998
999 if (DestSize == 16 && Subtarget->has16BitInsts())
1000 return SrcSize >= 32;
1001
1002 return DestSize < SrcSize && DestSize % 32 == 0;
1003}
1004
1006 unsigned SrcSize = Src->getScalarSizeInBits();
1007 unsigned DestSize = Dest->getScalarSizeInBits();
1008
1009 if (SrcSize == 16 && Subtarget->has16BitInsts())
1010 return DestSize >= 32;
1011
1012 return SrcSize == 32 && DestSize == 64;
1013}
1014
1016 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
1017 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
1018 // this will enable reducing 64-bit operations to 32-bit, which is always
1019 // good.
1020
1021 if (Src == MVT::i16)
1022 return Dest == MVT::i32 || Dest == MVT::i64;
1023
1024 return Src == MVT::i32 && Dest == MVT::i64;
1025}
1026
1028 EVT DestVT) const {
1029 switch (N->getOpcode()) {
1030 case ISD::ADD:
1031 case ISD::SUB:
1032 case ISD::SHL:
1033 case ISD::SRL:
1034 case ISD::SRA:
1035 case ISD::AND:
1036 case ISD::OR:
1037 case ISD::XOR:
1038 case ISD::MUL:
1039 case ISD::SETCC:
1040 case ISD::SELECT:
1041 case ISD::SMIN:
1042 case ISD::SMAX:
1043 case ISD::UMIN:
1044 case ISD::UMAX:
1045 if (isTypeLegal(MVT::i16) &&
1046 (!DestVT.isVector() ||
1047 !isOperationLegal(ISD::ADD, MVT::v2i16))) { // Check if VOP3P
1048 // Don't narrow back down to i16 if promoted to i32 already.
1049 if (!N->isDivergent() && DestVT.isInteger() &&
1050 DestVT.getScalarSizeInBits() > 1 &&
1051 DestVT.getScalarSizeInBits() <= 16 &&
1052 SrcVT.getScalarSizeInBits() > 16) {
1053 return false;
1054 }
1055 }
1056 return true;
1057 default:
1058 break;
1059 }
1060
1061 // There aren't really 64-bit registers, only pairs of 32-bit ones, and only a
1062 // limited number of native 64-bit operations. Shrinking an operation to fit
1063 // in a single 32-bit register should always be helpful. As currently used,
1064 // this is much less general than the name suggests, and is only used in
1065 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
1066 // not profitable, and may actually be harmful.
1067 if (isa<LoadSDNode>(N))
1068 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
1069
1070 return true;
1071}
1072
1074 const SDNode* N, CombineLevel Level) const {
1075 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
1076 N->getOpcode() == ISD::SRL) &&
1077 "Expected shift op");
1078
1079 SDValue ShiftLHS = N->getOperand(0);
1080 if (!ShiftLHS->hasOneUse())
1081 return false;
1082
1083 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
1084 !ShiftLHS.getOperand(0)->hasOneUse())
1085 return false;
1086
1087 // Always commute pre-type legalization and right shifts.
1088 // We're looking for shl(or(x,y),z) patterns.
1090 N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
1091 return true;
1092
1093 // If only user is a i32 right-shift, then don't destroy a BFE pattern.
1094 if (N->getValueType(0) == MVT::i32 && N->hasOneUse() &&
1095 (N->user_begin()->getOpcode() == ISD::SRA ||
1096 N->user_begin()->getOpcode() == ISD::SRL))
1097 return false;
1098
1099 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
1100 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
1101 if (LHS.getOpcode() != ISD::SHL)
1102 return false;
1103 auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
1104 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
1105 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
1106 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
1107 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
1108 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
1109 };
1110 SDValue LHS = N->getOperand(0).getOperand(0);
1111 SDValue RHS = N->getOperand(0).getOperand(1);
1112 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
1113}
1114
1115//===---------------------------------------------------------------------===//
1116// TargetLowering Callbacks
1117//===---------------------------------------------------------------------===//
1118
1120 bool IsVarArg) {
1121 switch (CC) {
1129 return CC_AMDGPU;
1132 return CC_AMDGPU_CS_CHAIN;
1133 case CallingConv::C:
1134 case CallingConv::Fast:
1135 case CallingConv::Cold:
1136 return CC_AMDGPU_Func;
1139 return CC_SI_Gfx;
1142 default:
1143 reportFatalUsageError("unsupported calling convention for call");
1144 }
1145}
1146
1148 bool IsVarArg) {
1149 switch (CC) {
1152 llvm_unreachable("kernels should not be handled here");
1162 return RetCC_SI_Shader;
1165 return RetCC_SI_Gfx;
1166 case CallingConv::C:
1167 case CallingConv::Fast:
1168 case CallingConv::Cold:
1169 return RetCC_AMDGPU_Func;
1170 default:
1171 reportFatalUsageError("unsupported calling convention");
1172 }
1173}
1174
1175/// The SelectionDAGBuilder will automatically promote function arguments
1176/// with illegal types. However, this does not work for the AMDGPU targets
1177/// since the function arguments are stored in memory as these illegal types.
1178/// In order to handle this properly we need to get the original type sizes
1179/// from the LLVM IR Function and fix up the ISD::InputArg values before
1180/// passing them to AnalyzeFormalArguments()
1181
1182/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1183/// input values across multiple registers. Each item in the Ins array
1184/// represents a single value that will be stored in registers. Ins[x].VT is
1185/// the value type of the value that will be stored in the register, so
1186/// whatever SDNode we lower the argument to needs to be this type.
1187///
1188/// In order to correctly lower the arguments we need to know the size of each
1189/// argument. Since Ins[x].VT gives us the size of the register that will
1190/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1191/// for the original function argument so that we can deduce the correct memory
1192/// type to use for Ins[x]. In most cases the correct memory type will be
1193/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1194/// we have a kernel argument of type v8i8, this argument will be split into
1195/// 8 parts and each part will be represented by its own item in the Ins array.
1196/// For each part, Ins[x].ArgVT will be v8i8, which is the full type of
1197/// the argument before it was split. From this, we deduce that the memory type
1198/// for each individual part is i8. We pass the memory type as LocVT to the
1199/// calling convention analysis function and the register type (Ins[x].VT) as
1200/// the ValVT.
1202 CCState &State,
1203 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1204 const MachineFunction &MF = State.getMachineFunction();
1205 const Function &Fn = MF.getFunction();
1206 LLVMContext &Ctx = Fn.getContext();
1207 const unsigned ExplicitOffset = Subtarget->getExplicitKernelArgOffset();
1209
1210 Align MaxAlign = Align(1);
1211 uint64_t ExplicitArgOffset = 0;
1212 const DataLayout &DL = Fn.getDataLayout();
1213
1214 unsigned InIndex = 0;
1215
1216 for (const Argument &Arg : Fn.args()) {
1217 const bool IsByRef = Arg.hasByRefAttr();
1218 Type *BaseArgTy = Arg.getType();
1219 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1220 Align Alignment = DL.getValueOrABITypeAlignment(
1221 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1222 MaxAlign = std::max(Alignment, MaxAlign);
1223 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1224
1225 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1226 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1227
1228 // We're basically throwing away everything passed into us and starting over
1229 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1230 // to us as computed in Ins.
1231 //
1232 // We also need to figure out what type legalization is trying to do to get
1233 // the correct memory offsets.
1234
1235 SmallVector<EVT, 16> ValueVTs;
1237 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, /*MemVTs=*/nullptr,
1238 &Offsets, ArgOffset);
1239
1240 for (unsigned Value = 0, NumValues = ValueVTs.size();
1241 Value != NumValues; ++Value) {
1242 uint64_t BasePartOffset = Offsets[Value];
1243
1244 EVT ArgVT = ValueVTs[Value];
1245 EVT MemVT = ArgVT;
1246 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1247 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1248
1249 if (NumRegs == 1) {
1250 // This argument is not split, so the IR type is the memory type.
1251 if (ArgVT.isExtended()) {
1252 // We have an extended type, like i24, so we should just use the
1253 // register type.
1254 MemVT = RegisterVT;
1255 } else {
1256 MemVT = ArgVT;
1257 }
1258 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1259 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1260 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1261 // We have a vector value which has been split into a vector with
1262 // the same scalar type, but fewer elements. This should handle
1263 // all the floating-point vector types.
1264 MemVT = RegisterVT;
1265 } else if (ArgVT.isVector() &&
1266 ArgVT.getVectorNumElements() == NumRegs) {
1267 // This arg has been split so that each element is stored in a separate
1268 // register.
1269 MemVT = ArgVT.getScalarType();
1270 } else if (ArgVT.isExtended()) {
1271 // We have an extended type, like i65.
1272 MemVT = RegisterVT;
1273 } else {
1274 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1275 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1276 if (RegisterVT.isInteger()) {
1277 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1278 } else if (RegisterVT.isVector()) {
1279 assert(!RegisterVT.getScalarType().isFloatingPoint());
1280 unsigned NumElements = RegisterVT.getVectorNumElements();
1281 assert(MemoryBits % NumElements == 0);
1282 // This vector type has been split into another vector type with
1283 // a different element size.
1284 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1285 MemoryBits / NumElements);
1286 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1287 } else {
1288 llvm_unreachable("cannot deduce memory type.");
1289 }
1290 }
1291
1292 // Convert one element vectors to scalar.
1293 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1294 MemVT = MemVT.getScalarType();
1295
1296 // Round up vec3/vec5 argument.
1297 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1298 MemVT = MemVT.getPow2VectorType(State.getContext());
1299 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1300 MemVT = MemVT.getRoundIntegerType(State.getContext());
1301 }
1302
1303 unsigned PartOffset = 0;
1304 for (unsigned i = 0; i != NumRegs; ++i) {
1305 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1306 BasePartOffset + PartOffset,
1307 MemVT.getSimpleVT(),
1309 PartOffset += MemVT.getStoreSize();
1310 }
1311 }
1312 }
1313}
1314
1316 SDValue Chain, CallingConv::ID CallConv,
1317 bool isVarArg,
1319 const SmallVectorImpl<SDValue> &OutVals,
1320 const SDLoc &DL, SelectionDAG &DAG) const {
1321 // FIXME: Fails for r600 tests
1322 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1323 // "wave terminate should not have return values");
1324 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1325}
1326
1327//===---------------------------------------------------------------------===//
1328// Target specific lowering
1329//===---------------------------------------------------------------------===//
1330
1331/// Selects the correct CCAssignFn for a given CallingConvention value.
1336
1341
1343 SelectionDAG &DAG,
1344 MachineFrameInfo &MFI,
1345 int ClobberedFI) const {
1346 SmallVector<SDValue, 8> ArgChains;
1347 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1348 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1349
1350 // Include the original chain at the beginning of the list. When this is
1351 // used by target LowerCall hooks, this helps legalize find the
1352 // CALLSEQ_BEGIN node.
1353 ArgChains.push_back(Chain);
1354
1355 // Add a chain value for each stack argument that overlaps the clobbered one.
1356 for (SDNode *U : DAG.getEntryNode().getNode()->users()) {
1357 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1358 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1359 if (FI->getIndex() < 0) {
1360 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1361 int64_t InLastByte = InFirstByte;
1362 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1363
1364 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1365 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1366 ArgChains.push_back(SDValue(L, 1));
1367 }
1368 }
1369 }
1370 }
1371
1372 // Build a tokenfactor for all the chains.
1373 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1374}
1375
1378 StringRef Reason) const {
1379 SDValue Callee = CLI.Callee;
1380 SelectionDAG &DAG = CLI.DAG;
1381
1382 const Function &Fn = DAG.getMachineFunction().getFunction();
1383
1384 StringRef FuncName("<unknown>");
1385
1387 FuncName = G->getSymbol();
1388 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1389 FuncName = G->getGlobal()->getName();
1390
1391 DAG.getContext()->diagnose(
1392 DiagnosticInfoUnsupported(Fn, Reason + FuncName, CLI.DL.getDebugLoc()));
1393
1394 if (!CLI.IsTailCall) {
1395 for (ISD::InputArg &Arg : CLI.Ins)
1396 InVals.push_back(DAG.getPOISON(Arg.VT));
1397 }
1398
1399 // FIXME: Hack because R600 doesn't handle callseq pseudos yet.
1400 if (getTargetMachine().getTargetTriple().getArch() == Triple::r600)
1401 return CLI.Chain;
1402
1403 SDValue Chain = DAG.getCALLSEQ_START(CLI.Chain, 0, 0, CLI.DL);
1404 return DAG.getCALLSEQ_END(Chain, 0, 0, /*InGlue=*/SDValue(), CLI.DL);
1405}
1406
1408 SmallVectorImpl<SDValue> &InVals) const {
1409 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1410}
1411
1413 SelectionDAG &DAG) const {
1414 const Function &Fn = DAG.getMachineFunction().getFunction();
1415
1417 Fn, "unsupported dynamic alloca", SDLoc(Op).getDebugLoc()));
1418 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1419 return DAG.getMergeValues(Ops, SDLoc());
1420}
1421
1423 SelectionDAG &DAG) const {
1424 switch (Op.getOpcode()) {
1425 default:
1426 Op->print(errs(), &DAG);
1427 llvm_unreachable("Custom lowering code for this "
1428 "instruction is not implemented yet!");
1429 break;
1431 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1433 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1434 case ISD::SDIVREM:
1435 return LowerSDIVREM(Op, DAG);
1436 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1437 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1438 case ISD::FRINT: return LowerFRINT(Op, DAG);
1439 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1440 case ISD::FROUNDEVEN:
1441 return LowerFROUNDEVEN(Op, DAG);
1442 case ISD::FROUND: return LowerFROUND(Op, DAG);
1443 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1444 case ISD::FLOG2:
1445 return LowerFLOG2(Op, DAG);
1446 case ISD::FLOG:
1447 case ISD::FLOG10:
1448 return LowerFLOGCommon(Op, DAG);
1449 case ISD::FEXP:
1450 case ISD::FEXP10:
1451 return lowerFEXP(Op, DAG);
1452 case ISD::FEXP2:
1453 return lowerFEXP2(Op, DAG);
1454 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1455 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1456 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1457 case ISD::FP_TO_SINT:
1458 case ISD::FP_TO_UINT:
1459 return LowerFP_TO_INT(Op, DAG);
1460 case ISD::CTTZ:
1462 case ISD::CTLZ:
1464 return LowerCTLZ_CTTZ(Op, DAG);
1466 }
1467 return Op;
1468}
1469
1472 SelectionDAG &DAG) const {
1473 switch (N->getOpcode()) {
1475 // Different parts of legalization seem to interpret which type of
1476 // sign_extend_inreg is the one to check for custom lowering. The extended
1477 // from type is what really matters, but some places check for custom
1478 // lowering of the result type. This results in trying to use
1479 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1480 // nothing here and let the illegal result integer be handled normally.
1481 return;
1482 case ISD::FLOG2:
1483 if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
1484 Results.push_back(Lowered);
1485 return;
1486 case ISD::FLOG:
1487 case ISD::FLOG10:
1488 if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
1489 Results.push_back(Lowered);
1490 return;
1491 case ISD::FEXP2:
1492 if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
1493 Results.push_back(Lowered);
1494 return;
1495 case ISD::FEXP:
1496 case ISD::FEXP10:
1497 if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
1498 Results.push_back(Lowered);
1499 return;
1500 case ISD::CTLZ:
1502 if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
1503 Results.push_back(Lowered);
1504 return;
1505 default:
1506 return;
1507 }
1508}
1509
1511 SDValue Op,
1512 SelectionDAG &DAG) const {
1513
1514 const DataLayout &DL = DAG.getDataLayout();
1516 const GlobalValue *GV = G->getGlobal();
1517
1518 if (!MFI->isModuleEntryFunction()) {
1519 auto IsNamedBarrier = AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV));
1520 if (std::optional<uint32_t> Address =
1522 if (IsNamedBarrier) {
1523 unsigned BarCnt = cast<GlobalVariable>(GV)->getGlobalSize(DL) / 16;
1524 MFI->recordNumNamedBarriers(Address.value(), BarCnt);
1525 }
1526 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1527 } else if (IsNamedBarrier) {
1528 llvm_unreachable("named barrier should have an assigned address");
1529 }
1530 }
1531
1532 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1533 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1534 if (!MFI->isModuleEntryFunction() &&
1535 GV->getName() != "llvm.amdgcn.module.lds" &&
1537 SDLoc DL(Op);
1538 const Function &Fn = DAG.getMachineFunction().getFunction();
1540 Fn, "local memory global used by non-kernel function",
1541 DL.getDebugLoc(), DS_Warning));
1542
1543 // We currently don't have a way to correctly allocate LDS objects that
1544 // aren't directly associated with a kernel. We do force inlining of
1545 // functions that use local objects. However, if these dead functions are
1546 // not eliminated, we don't want a compile time error. Just emit a warning
1547 // and a trap, since there should be no callable path here.
1548 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1549 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1550 Trap, DAG.getRoot());
1551 DAG.setRoot(OutputChain);
1552 return DAG.getPOISON(Op.getValueType());
1553 }
1554
1555 // XXX: What does the value of G->getOffset() mean?
1556 assert(G->getOffset() == 0 &&
1557 "Do not know what to do with an non-zero offset");
1558
1559 // TODO: We could emit code to handle the initialization somewhere.
1560 // We ignore the initializer for now and legalize it to allow selection.
1561 // The initializer will anyway get errored out during assembly emission.
1562 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1563 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1564 }
1565 return SDValue();
1566}
1567
1569 SelectionDAG &DAG) const {
1571 SDLoc SL(Op);
1572
1573 EVT VT = Op.getValueType();
1574 if (VT.getVectorElementType().getSizeInBits() < 32) {
1575 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1576 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1577 unsigned NewNumElt = OpBitSize / 32;
1578 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1580 MVT::i32, NewNumElt);
1581 for (const SDUse &U : Op->ops()) {
1582 SDValue In = U.get();
1583 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
1584 if (NewNumElt > 1)
1585 DAG.ExtractVectorElements(NewIn, Args);
1586 else
1587 Args.push_back(NewIn);
1588 }
1589
1590 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1591 NewNumElt * Op.getNumOperands());
1592 SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1593 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1594 }
1595 }
1596
1597 for (const SDUse &U : Op->ops())
1598 DAG.ExtractVectorElements(U.get(), Args);
1599
1600 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1601}
1602
1604 SelectionDAG &DAG) const {
1605 SDLoc SL(Op);
1607 unsigned Start = Op.getConstantOperandVal(1);
1608 EVT VT = Op.getValueType();
1609 EVT SrcVT = Op.getOperand(0).getValueType();
1610
1611 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1612 unsigned NumElt = VT.getVectorNumElements();
1613 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1614 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1615
1616 // Extract 32-bit registers at a time.
1617 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1618 EVT NewVT = NumElt == 2
1619 ? MVT::i32
1620 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1621 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1622
1623 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
1624 if (NumElt == 2)
1625 Tmp = Args[0];
1626 else
1627 Tmp = DAG.getBuildVector(NewVT, SL, Args);
1628
1629 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1630 }
1631
1632 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1634
1635 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1636}
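// Illustrative example of the 16-bit path above: extracting a <4 x i16>
// subvector starting at an even element of an <8 x i16> source is done by
// bitcasting the source to <4 x i32>, extracting two i32 elements, and
// bitcasting the result back to <4 x i16>.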
1637
1638// TODO: Handle fabs too
1640 if (Val.getOpcode() == ISD::FNEG)
1641 return Val.getOperand(0);
1642
1643 return Val;
1644}
1645
1647 if (Val.getOpcode() == ISD::FNEG)
1648 Val = Val.getOperand(0);
1649 if (Val.getOpcode() == ISD::FABS)
1650 Val = Val.getOperand(0);
1651 if (Val.getOpcode() == ISD::FCOPYSIGN)
1652 Val = Val.getOperand(0);
1653 return Val;
1654}
1655
1657 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1658 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1659 SelectionDAG &DAG = DCI.DAG;
1660 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1661 switch (CCOpcode) {
1662 case ISD::SETOEQ:
1663 case ISD::SETONE:
1664 case ISD::SETUNE:
1665 case ISD::SETNE:
1666 case ISD::SETUEQ:
1667 case ISD::SETEQ:
1668 case ISD::SETFALSE:
1669 case ISD::SETFALSE2:
1670 case ISD::SETTRUE:
1671 case ISD::SETTRUE2:
1672 case ISD::SETUO:
1673 case ISD::SETO:
1674 break;
1675 case ISD::SETULE:
1676 case ISD::SETULT: {
1677 if (LHS == True)
1678 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1679 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1680 }
1681 case ISD::SETOLE:
1682 case ISD::SETOLT:
1683 case ISD::SETLE:
1684 case ISD::SETLT: {
1685 // Ordered. Assume ordered for undefined.
1686
1687 // Only do this after legalization to avoid interfering with other combines
1688 // which might occur.
1690 !DCI.isCalledByLegalizer())
1691 return SDValue();
1692
1693 // We need to permute the operands to get the correct NaN behavior. The
1694 // selected operand is the second one based on the failing compare with NaN,
1695 // so permute it based on the compare type the hardware uses.
1696 if (LHS == True)
1697 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1698 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1699 }
1700 case ISD::SETUGE:
1701 case ISD::SETUGT: {
1702 if (LHS == True)
1703 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1704 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1705 }
1706 case ISD::SETGT:
1707 case ISD::SETGE:
1708 case ISD::SETOGE:
1709 case ISD::SETOGT: {
1711 !DCI.isCalledByLegalizer())
1712 return SDValue();
1713
1714 if (LHS == True)
1715 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1716 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1717 }
1718 case ISD::SETCC_INVALID:
1719 llvm_unreachable("Invalid setcc condcode!");
1720 }
1721 return SDValue();
1722}
1723
1724/// Generate Min/Max node
1726 SDValue LHS, SDValue RHS,
1727 SDValue True, SDValue False,
1728 SDValue CC,
1729 DAGCombinerInfo &DCI) const {
1730 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1731 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1732
1733 SelectionDAG &DAG = DCI.DAG;
1734
1735 // If we can't directly match this, try to see if we can fold an fneg to
1736 // match.
1737
1740 SDValue NegTrue = peekFNeg(True);
1741
1742 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1743 // fmin/fmax.
1744 //
1745 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1746 // -> fneg (fmin_legacy lhs, K)
1747 //
1748 // TODO: Use getNegatedExpression
1749 if (LHS == NegTrue && CFalse && CRHS) {
1750 APFloat NegRHS = neg(CRHS->getValueAPF());
1751 if (NegRHS == CFalse->getValueAPF()) {
1752 SDValue Combined =
1753 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1754 if (Combined)
1755 return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1756 return SDValue();
1757 }
1758 }
1759
1760 return SDValue();
1761}
1762
1763std::pair<SDValue, SDValue>
1765 SDLoc SL(Op);
1766
1767 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1768
1769 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1770 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1771
1772 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1773 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1774
1775 return std::pair(Lo, Hi);
1776}
1777
1779 SDLoc SL(Op);
1780
1781 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1782 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1783 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1784}
1785
1787 SDLoc SL(Op);
1788
1789 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1790 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1791 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1792}
1793
1794// Split a vector type into two parts. The first part is a power of two vector.
1795// The second part is whatever is left over, and is a scalar if it would
1796// otherwise be a 1-vector.
1797std::pair<EVT, EVT>
1799 EVT LoVT, HiVT;
1800 EVT EltVT = VT.getVectorElementType();
1801 unsigned NumElts = VT.getVectorNumElements();
1802 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1803 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1804 HiVT = NumElts - LoNumElts == 1
1805 ? EltVT
1806 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1807 return std::pair(LoVT, HiVT);
1808}
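// Examples of the split above: v3f32 -> (v2f32, f32), v5i32 -> (v4i32, i32),
// v6i16 -> (v4i16, v2i16); the high part is a scalar only when exactly one
// element is left over.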
1809
1810// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1811// scalar.
1812std::pair<SDValue, SDValue>
1814 const EVT &LoVT, const EVT &HiVT,
1815 SelectionDAG &DAG) const {
1816 EVT VT = N.getValueType();
1818 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1819 VT.getVectorNumElements() &&
1820 "More vector elements requested than available!");
1822 DAG.getVectorIdxConstant(0, DL));
1823
1824 unsigned LoNumElts = LoVT.getVectorNumElements();
1825
1826 if (HiVT.isVector()) {
1827 unsigned HiNumElts = HiVT.getVectorNumElements();
1828 if ((VT.getVectorNumElements() % HiNumElts) == 0) {
1829 // Avoid creating an extract_subvector with an index that isn't a multiple
1830 // of the result type's element count.
1832 DAG.getConstant(LoNumElts, DL, MVT::i32));
1833 return {Lo, Hi};
1834 }
1835
1837 DAG.ExtractVectorElements(N, Elts, /*Start=*/LoNumElts,
1838 /*Count=*/HiNumElts);
1839 SDValue Hi = DAG.getBuildVector(HiVT, DL, Elts);
1840 return {Lo, Hi};
1841 }
1842
1844 DAG.getVectorIdxConstant(LoNumElts, DL));
1845 return {Lo, Hi};
1846}
1847
1849 SelectionDAG &DAG) const {
1851 EVT VT = Op.getValueType();
1852 SDLoc SL(Op);
1853
1854
1855 // If this is a 2 element vector, we really want to scalarize and not create
1856 // weird 1 element vectors.
1857 if (VT.getVectorNumElements() == 2) {
1858 SDValue Ops[2];
1859 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1860 return DAG.getMergeValues(Ops, SL);
1861 }
1862
1863 SDValue BasePtr = Load->getBasePtr();
1864 EVT MemVT = Load->getMemoryVT();
1865
1866 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1867
1868 EVT LoVT, HiVT;
1869 EVT LoMemVT, HiMemVT;
1870 SDValue Lo, Hi;
1871
1872 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1873 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1874 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1875
1876 unsigned Size = LoMemVT.getStoreSize();
1877 Align BaseAlign = Load->getAlign();
1878 Align HiAlign = commonAlignment(BaseAlign, Size);
1879
1880 SDValue LoLoad = DAG.getExtLoad(
1881 Load->getExtensionType(), SL, LoVT, Load->getChain(), BasePtr, SrcValue,
1882 LoMemVT, BaseAlign, Load->getMemOperand()->getFlags(), Load->getAAInfo());
1883 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
1884 SDValue HiLoad = DAG.getExtLoad(
1885 Load->getExtensionType(), SL, HiVT, Load->getChain(), HiPtr,
1886 SrcValue.getWithOffset(LoMemVT.getStoreSize()), HiMemVT, HiAlign,
1887 Load->getMemOperand()->getFlags(), Load->getAAInfo());
1888
1889 SDValue Join;
1890 if (LoVT == HiVT) {
1891 // This is the case where the vector length is a power of two, so it was split evenly.
1892 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1893 } else {
1894 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getPOISON(VT), LoLoad,
1895 DAG.getVectorIdxConstant(0, SL));
1896 Join = DAG.getNode(
1897 HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
1898 VT, Join, HiLoad,
1899 DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
1900 }
1901
1902 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1903 LoLoad.getValue(1), HiLoad.getValue(1))};
1904
1905 return DAG.getMergeValues(Ops, SL);
1906}
1907
1908 SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
1909 SelectionDAG &DAG) const {
1910 LoadSDNode *Load = cast<LoadSDNode>(Op);
1911 EVT VT = Op.getValueType();
1912 SDValue BasePtr = Load->getBasePtr();
1913 EVT MemVT = Load->getMemoryVT();
1914 SDLoc SL(Op);
1915 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1916 Align BaseAlign = Load->getAlign();
1917 unsigned NumElements = MemVT.getVectorNumElements();
1918
1919 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1920 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
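// For example, an 8-byte-aligned <3 x i32> load is widened to a <4 x i32> load and
// the first three elements are extracted back out; a 4-byte-aligned <3 x i32> load
// that is not known to be 16-byte dereferenceable is split instead.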
1921 if (NumElements != 3 ||
1922 (BaseAlign < Align(8) &&
1923 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1924 return SplitVectorLoad(Op, DAG);
1925
1926 assert(NumElements == 3);
1927
1928 EVT WideVT =
1929 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
1930 EVT WideMemVT =
1931 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
1932 SDValue WideLoad = DAG.getExtLoad(
1933 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1934 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1935 return DAG.getMergeValues(
1936 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1937 DAG.getVectorIdxConstant(0, SL)),
1938 WideLoad.getValue(1)},
1939 SL);
1940}
1941
1942 SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1943 SelectionDAG &DAG) const {
1944 StoreSDNode *Store = cast<StoreSDNode>(Op);
1945 SDValue Val = Store->getValue();
1946 EVT VT = Val.getValueType();
1947
1948 // If this is a 2 element vector, we really want to scalarize and not create
1949 // weird 1 element vectors.
1950 if (VT.getVectorNumElements() == 2)
1951 return scalarizeVectorStore(Store, DAG);
1952
1953 EVT MemVT = Store->getMemoryVT();
1954 SDValue Chain = Store->getChain();
1955 SDValue BasePtr = Store->getBasePtr();
1956 SDLoc SL(Op);
1957
1958 EVT LoVT, HiVT;
1959 EVT LoMemVT, HiMemVT;
1960 SDValue Lo, Hi;
1961
1962 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1963 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1964 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1965
1966 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1967
1968 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1969 Align BaseAlign = Store->getAlign();
1970 unsigned Size = LoMemVT.getStoreSize();
1971 Align HiAlign = commonAlignment(BaseAlign, Size);
1972
1973 SDValue LoStore =
1974 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1975 Store->getMemOperand()->getFlags(), Store->getAAInfo());
1976 SDValue HiStore = DAG.getTruncStore(
1977 Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size), HiMemVT, HiAlign,
1978 Store->getMemOperand()->getFlags(), Store->getAAInfo());
1979
1980 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1981}
1982
1983// This is a shortcut for integer division because we have fast i32<->f32
1984// conversions, and fast f32 reciprocal instructions. The fractional part of a
1985// float is enough to accurately represent up to a 24-bit signed integer.
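// For example, an i32 division where both operands have at least 9 known sign
// (or leading zero) bits has DivBits <= 24, so the quotient can be formed from a
// single f32 reciprocal plus one integer correction step below.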
1986 SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1987 bool Sign) const {
1988 SDLoc DL(Op);
1989 EVT VT = Op.getValueType();
1990 SDValue LHS = Op.getOperand(0);
1991 SDValue RHS = Op.getOperand(1);
1992 MVT IntVT = MVT::i32;
1993 MVT FltVT = MVT::f32;
1994
1995 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1996 if (LHSSignBits < 9)
1997 return SDValue();
1998
1999 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
2000 if (RHSSignBits < 9)
2001 return SDValue();
2002
2003 unsigned BitSize = VT.getSizeInBits();
2004 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
2005 unsigned DivBits = BitSize - SignBits;
2006 if (Sign)
2007 ++DivBits;
2008
2009 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
2010 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
2011
2012 SDValue jq = DAG.getConstant(1, DL, IntVT);
2013
2014 if (Sign) {
2015 // char|short jq = ia ^ ib;
2016 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
2017
2018 // jq = jq >> (bitsize - 2)
2019 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
2020 DAG.getConstant(BitSize - 2, DL, VT));
2021
2022 // jq = jq | 0x1
2023 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
2024 }
2025
2026 // int ia = (int)LHS;
2027 SDValue ia = LHS;
2028
2029 // int ib = (int)RHS;
2030 SDValue ib = RHS;
2031
2032 // float fa = (float)ia;
2033 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
2034
2035 // float fb = (float)ib;
2036 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
2037
2038 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
2039 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
2040
2041 // fq = trunc(fq);
2042 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
2043
2044 // float fqneg = -fq;
2045 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
2046
2047 MachineFunction &MF = DAG.getMachineFunction();
2048
2049 bool UseFmadFtz = false;
2050 if (Subtarget->isGCN()) {
2051 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2052 UseFmadFtz =
2053 MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
2054 }
2055
2056 // float fr = mad(fqneg, fb, fa);
2057 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2058 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
2059 : (unsigned)ISD::FMAD;
2060 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
2061
2062 // int iq = (int)fq;
2063 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
2064
2065 // fr = fabs(fr);
2066 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
2067
2068 // fb = fabs(fb);
2069 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
2070
2071 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2072
2073 // int cv = fr >= fb;
2074 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
2075
2076 // jq = (cv ? jq : 0);
2077 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
2078
2079 // dst = iq + jq;
2080 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
2081
2082 // Rem needs compensation, it's easier to recompute it
2083 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
2084 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
2085
2086 // Truncate to the number of bits this divide really is.
2087 if (Sign) {
2088 SDValue InRegSize
2089 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
2090 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
2091 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
2092 } else {
2093 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
2094 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
2095 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
2096 }
2097
2098 return DAG.getMergeValues({ Div, Rem }, DL);
2099}
2100
2101 void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
2102 SelectionDAG &DAG,
2103 SmallVectorImpl<SDValue> &Results) const {
2104 SDLoc DL(Op);
2105 EVT VT = Op.getValueType();
2106
2107 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2108
2109 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2110
2111 SDValue One = DAG.getConstant(1, DL, HalfVT);
2112 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
2113
2114 //HiLo split
2115 SDValue LHS_Lo, LHS_Hi;
2116 SDValue LHS = Op.getOperand(0);
2117 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
2118
2119 SDValue RHS_Lo, RHS_Hi;
2120 SDValue RHS = Op.getOperand(1);
2121 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2122
2123 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2124 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
2125
2126 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2127 LHS_Lo, RHS_Lo);
2128
2129 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2130 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2131
2132 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2133 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2134 return;
2135 }
2136
2137 if (isTypeLegal(MVT::i64)) {
2138 // The algorithm here is based on ideas from "Software Integer Division",
2139 // Tom Rodeheffer, August 2008.
2140
2141 MachineFunction &MF = DAG.getMachineFunction();
2142 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2143
2144 // Compute denominator reciprocal.
2145 unsigned FMAD =
2146 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2147 : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
2148 ? (unsigned)ISD::FMAD
2149 : (unsigned)AMDGPUISD::FMAD_FTZ;
2150
2151 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2152 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2153 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2154 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2155 Cvt_Lo);
2156 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2157 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2158 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2159 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2160 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2161 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2162 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2163 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2164 Mul1);
2165 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2166 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2167 SDValue Rcp64 = DAG.getBitcast(VT,
2168 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2169
2170 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2171 SDValue One64 = DAG.getConstant(1, DL, VT);
2172 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2173 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2174
2175 // First round of UNR (Unsigned integer Newton-Raphson).
2176 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2177 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2178 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2179 SDValue Mulhi1_Lo, Mulhi1_Hi;
2180 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2181 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2182 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2183 Mulhi1_Lo, Zero1);
2184 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2185 Mulhi1_Hi, Add1_Lo.getValue(1));
2186 SDValue Add1 = DAG.getBitcast(VT,
2187 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2188
2189 // Second round of UNR.
2190 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2191 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2192 SDValue Mulhi2_Lo, Mulhi2_Hi;
2193 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2194 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2195 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2196 Mulhi2_Lo, Zero1);
2197 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2198 Mulhi2_Hi, Add2_Lo.getValue(1));
2199 SDValue Add2 = DAG.getBitcast(VT,
2200 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2201
2202 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2203
2204 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2205
2206 SDValue Mul3_Lo, Mul3_Hi;
2207 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2208 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2209 Mul3_Lo, Zero1);
2210 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2211 Mul3_Hi, Sub1_Lo.getValue(1));
2212 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2213 SDValue Sub1 = DAG.getBitcast(VT,
2214 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2215
2216 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2217 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2218 ISD::SETUGE);
2219 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2220 ISD::SETUGE);
2221 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2222
2223 // TODO: Here and below portions of the code can be enclosed into if/endif.
2224 // Currently control flow is unconditional and we have 4 selects after
2225 // potential endif to substitute PHIs.
2226
2227 // if C3 != 0 ...
2228 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2229 RHS_Lo, Zero1);
2230 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2231 RHS_Hi, Sub1_Lo.getValue(1));
2232 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2233 Zero, Sub2_Lo.getValue(1));
2234 SDValue Sub2 = DAG.getBitcast(VT,
2235 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2236
2237 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2238
2239 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2240 ISD::SETUGE);
2241 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2242 ISD::SETUGE);
2243 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2244
2245 // if (C6 != 0)
2246 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2247
2248 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2249 RHS_Lo, Zero1);
2250 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2251 RHS_Hi, Sub2_Lo.getValue(1));
2252 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2253 Zero, Sub3_Lo.getValue(1));
2254 SDValue Sub3 = DAG.getBitcast(VT,
2255 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2256
2257 // endif C6
2258 // endif C3
2259
2260 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2261 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2262
2263 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2264 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2265
2266 Results.push_back(Div);
2267 Results.push_back(Rem);
2268
2269 return;
2270 }
2271
2272 // r600 expansion.
2273 // Get Speculative values
2274 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2275 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2276
2277 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2278 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2279 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2280
2281 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2282 SDValue DIV_Lo = Zero;
2283
2284 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2285
2286 for (unsigned i = 0; i < halfBitWidth; ++i) {
2287 const unsigned bitPos = halfBitWidth - i - 1;
2288 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2289 // Get value of high bit
2290 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2291 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2292 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2293
2294 // Shift
2295 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2296 // Add LHS high bit
2297 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2298
2299 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2300 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2301
2302 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2303
2304 // Update REM
2305 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2306 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2307 }
2308
2309 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2310 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2311 Results.push_back(DIV);
2312 Results.push_back(REM);
2313}
2314
2315 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2316 SelectionDAG &DAG) const {
2317 SDLoc DL(Op);
2318 EVT VT = Op.getValueType();
2319
2320 if (VT == MVT::i64) {
2321 SmallVector<SDValue, 2> Results;
2322 LowerUDIVREM64(Op, DAG, Results);
2323 return DAG.getMergeValues(Results, DL);
2324 }
2325
2326 if (VT == MVT::i32) {
2327 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2328 return Res;
2329 }
2330
2331 SDValue X = Op.getOperand(0);
2332 SDValue Y = Op.getOperand(1);
2333
2334 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2335 // algorithm used here.
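// In short: start from an initial estimate z of 2^32 / y (AMDGPUISD::URECIP),
// refine it with one Newton-Raphson step, take q = mulhu(x, z) and r = x - q * y,
// then apply at most two conditional q += 1 / r -= y corrections below.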
2336
2337 // Initial estimate of inv(y).
2338 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2339
2340 // One round of UNR.
2341 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2342 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2343 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2344 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2345
2346 // Quotient/remainder estimate.
2347 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2348 SDValue R =
2349 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2350
2351 // First quotient/remainder refinement.
2352 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2353 SDValue One = DAG.getConstant(1, DL, VT);
2354 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2355 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2356 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2357 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2358 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2359
2360 // Second quotient/remainder refinement.
2361 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2362 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2363 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2364 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2365 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2366
2367 return DAG.getMergeValues({Q, R}, DL);
2368}
2369
2370 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2371 SelectionDAG &DAG) const {
2372 SDLoc DL(Op);
2373 EVT VT = Op.getValueType();
2374
2375 SDValue LHS = Op.getOperand(0);
2376 SDValue RHS = Op.getOperand(1);
2377
2378 SDValue Zero = DAG.getConstant(0, DL, VT);
2379 SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
2380
2381 if (VT == MVT::i32) {
2382 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2383 return Res;
2384 }
2385
2386 if (VT == MVT::i64 &&
2387 DAG.ComputeNumSignBits(LHS) > 32 &&
2388 DAG.ComputeNumSignBits(RHS) > 32) {
2389 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2390
2391 //HiLo split
2392 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2393 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2394 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2395 LHS_Lo, RHS_Lo);
2396 SDValue Res[2] = {
2397 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2398 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2399 };
2400 return DAG.getMergeValues(Res, DL);
2401 }
2402
2403 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2404 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2405 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2406 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2407
2408 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2409 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2410
2411 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2412 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2413
2414 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2415 SDValue Rem = Div.getValue(1);
2416
2417 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2418 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2419
2420 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2421 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2422
2423 SDValue Res[2] = {
2424 Div,
2425 Rem
2426 };
2427 return DAG.getMergeValues(Res, DL);
2428}
2429
2430 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2431 SDLoc SL(Op);
2432 SDValue Src = Op.getOperand(0);
2433
2434 // result = trunc(src)
2435 // if (src > 0.0 && src != result)
2436 // result += 1.0
2437
2438 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2439
2440 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2441 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2442
2443 EVT SetCCVT =
2444 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2445
2446 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2447 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2448 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2449
2450 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2451 // TODO: Should this propagate fast-math-flags?
2452 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2453}
2454
2455 static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2456 SelectionDAG &DAG) {
2457 const unsigned FractBits = 52;
2458 const unsigned ExpBits = 11;
2459
2460 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2461 Hi,
2462 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2463 DAG.getConstant(ExpBits, SL, MVT::i32));
2464 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2465 DAG.getConstant(1023, SL, MVT::i32));
2466
2467 return Exp;
2468}
2469
2470 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2471 SDLoc SL(Op);
2472 SDValue Src = Op.getOperand(0);
2473
2474 assert(Op.getValueType() == MVT::f64);
2475
2476 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2477
2478 // Extract the upper half, since this is where we will find the sign and
2479 // exponent.
2480 SDValue Hi = getHiHalf64(Src, DAG);
2481
2482 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2483
2484 const unsigned FractBits = 52;
2485
2486 // Extract the sign bit.
2487 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2488 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2489
2490 // Extend back to 64-bits.
2491 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2492 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2493
2494 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2495 const SDValue FractMask
2496 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2497
2498 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2499 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2500 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2501
2502 EVT SetCCVT =
2503 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2504
2505 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2506
2507 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2508 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2509
2510 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2511 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2512
2513 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2514}
2515
2516 SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2517 SelectionDAG &DAG) const {
2518 SDLoc SL(Op);
2519 SDValue Src = Op.getOperand(0);
2520
2521 assert(Op.getValueType() == MVT::f64);
2522
2523 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2524 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2525 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2526
2527 // TODO: Should this propagate fast-math-flags?
2528
2529 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2530 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2531
2532 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2533
2534 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2535 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2536
2537 EVT SetCCVT =
2538 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2539 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2540
2541 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2542}
2543
2544 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
2545 SelectionDAG &DAG) const {
2546 // FNEARBYINT and FRINT are the same, except in their handling of FP
2547 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2548 // rint, so just treat them as equivalent.
2549 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2550 Op.getOperand(0));
2551}
2552
2553 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2554 auto VT = Op.getValueType();
2555 auto Arg = Op.getOperand(0u);
2556 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2557}
2558
2559// XXX - May require not supporting f32 denormals?
2560
2561// Don't handle v2f16. The extra instructions to scalarize and repack around the
2562// compare and vselect end up producing worse code than scalarizing the whole
2563// operation.
2564 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2565 SDLoc SL(Op);
2566 SDValue X = Op.getOperand(0);
2567 EVT VT = Op.getValueType();
2568
2569 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2570
2571 // TODO: Should this propagate fast-math-flags?
2572
2573 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2574
2575 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2576
2577 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2578 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2579
2580 EVT SetCCVT =
2581 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2582
2583 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2584 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2585 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2586
2587 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2588 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2589}
2590
2591 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2592 SDLoc SL(Op);
2593 SDValue Src = Op.getOperand(0);
2594
2595 // result = trunc(src);
2596 // if (src < 0.0 && src != result)
2597 // result += -1.0.
2598
2599 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2600
2601 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2602 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2603
2604 EVT SetCCVT =
2605 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2606
2607 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2608 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2609 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2610
2611 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2612 // TODO: Should this propagate fast-math-flags?
2613 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2614}
2615
2616/// Return true if it's known that \p Src can never be an f32 denormal value.
2617 static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2618 switch (Src.getOpcode()) {
2619 case ISD::FP_EXTEND:
2620 return Src.getOperand(0).getValueType() == MVT::f16;
2621 case ISD::FP16_TO_FP:
2622 case ISD::FFREXP:
2623 case ISD::FSQRT:
2624 case AMDGPUISD::LOG:
2625 case AMDGPUISD::EXP:
2626 return true;
2627 case ISD::INTRINSIC_WO_CHAIN: {
2628 unsigned IntrinsicID = Src.getConstantOperandVal(0);
2629 switch (IntrinsicID) {
2630 case Intrinsic::amdgcn_frexp_mant:
2631 case Intrinsic::amdgcn_log:
2632 case Intrinsic::amdgcn_log_clamp:
2633 case Intrinsic::amdgcn_exp2:
2634 case Intrinsic::amdgcn_sqrt:
2635 return true;
2636 default:
2637 return false;
2638 }
2639 }
2640 default:
2641 return false;
2642 }
2643
2644 llvm_unreachable("covered opcode switch");
2645}
2646
2647 static bool allowApproxFunc(const SelectionDAG &DAG,
2648 SDNodeFlags Flags) {
2649 return Flags.hasApproximateFuncs();
2650}
2651
2660
2661 SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2662 SDValue Src,
2663 SDNodeFlags Flags) const {
2664 SDLoc SL(Src);
2665 EVT VT = Src.getValueType();
2666 const fltSemantics &Semantics = VT.getFltSemantics();
2667 SDValue SmallestNormal =
2668 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2669
2670 // Want to scale denormals up, but negatives and 0 work just as well on the
2671 // scaled path.
2672 SDValue IsLtSmallestNormal = DAG.getSetCC(
2673 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2674 SmallestNormal, ISD::SETOLT);
2675
2676 return IsLtSmallestNormal;
2677}
2678
2679 SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
2680 SDNodeFlags Flags) const {
2681 SDLoc SL(Src);
2682 EVT VT = Src.getValueType();
2683 const fltSemantics &Semantics = VT.getFltSemantics();
2684 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2685
2686 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2687 SDValue IsFinite = DAG.getSetCC(
2688 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2689 Inf, ISD::SETOLT);
2690 return IsFinite;
2691}
2692
2693/// If denormal handling is required return the scaled input to FLOG2, and the
2694/// check for denormal range. Otherwise, return null values.
2695std::pair<SDValue, SDValue>
2696 AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc &SL,
2697 SDValue Src, SDNodeFlags Flags) const {
2698 if (!needsDenormHandlingF32(DAG, Src, Flags))
2699 return {};
2700
2701 MVT VT = MVT::f32;
2702 const fltSemantics &Semantics = APFloat::IEEEsingle();
2703 SDValue SmallestNormal =
2704 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2705
2706 SDValue IsLtSmallestNormal = DAG.getSetCC(
2707 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2708 SmallestNormal, ISD::SETOLT);
2709
2710 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2711 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2712 SDValue ScaleFactor =
2713 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2714
2715 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2716 return {ScaledInput, IsLtSmallestNormal};
2717}
2718
2719 SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
2720 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2721 // If we have to handle denormals, scale up the input and adjust the result.
2722
2723 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2724 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
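// Multiplying by 0x1.0p+32 adds exactly 32 to the input's exponent, so
// subtracting 32.0 from the log2 of the scaled value recovers log2(x).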
2725
2726 SDLoc SL(Op);
2727 EVT VT = Op.getValueType();
2728 SDValue Src = Op.getOperand(0);
2729 SDNodeFlags Flags = Op->getFlags();
2730
2731 if (VT == MVT::f16) {
2732 // Nothing in half is a denormal when promoted to f32.
2733 assert(!isTypeLegal(VT));
2734 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2735 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2736 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2737 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2738 }
2739
2740 auto [ScaledInput, IsLtSmallestNormal] =
2741 getScaledLogInput(DAG, SL, Src, Flags);
2742 if (!ScaledInput)
2743 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2744
2745 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2746
2747 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2748 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2749 SDValue ResultOffset =
2750 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2751 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2752}
2753
2754static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2755 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2756 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2757 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2758}
2759
2760 SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
2761 SelectionDAG &DAG) const {
2762 SDValue X = Op.getOperand(0);
2763 EVT VT = Op.getValueType();
2764 SDNodeFlags Flags = Op->getFlags();
2765 SDLoc DL(Op);
2766 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2767 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2768
2769 const auto &Options = getTargetMachine().Options;
2770 if (VT == MVT::f16 || Flags.hasApproximateFuncs()) {
2771
2772 if (VT == MVT::f16 && !isTypeLegal(MVT::f16)) {
2773 // Log and multiply in f32 is good enough for f16.
2774 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2775 }
2776
2777 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2778 if (VT == MVT::f16 && !isTypeLegal(MVT::f16)) {
2779 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2780 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2781 }
2782
2783 return Lowered;
2784 }
2785
2786 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
2787 if (ScaledInput)
2788 X = ScaledInput;
2789
2790 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2791
2792 SDValue R;
2793 if (Subtarget->hasFastFMAF32()) {
2794 // c+cc are ln(2)/ln(10) to more than 49 bits
2795 const float c_log10 = 0x1.344134p-2f;
2796 const float cc_log10 = 0x1.09f79ep-26f;
2797
2798 // c + cc is ln(2) to more than 49 bits
2799 const float c_log = 0x1.62e42ep-1f;
2800 const float cc_log = 0x1.efa39ep-25f;
2801
2802 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2803 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2804 // This adds correction terms for which contraction may lead to an increase
2805 // in the error of the approximation, so disable it.
2806 Flags.setAllowContract(false);
2807 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2808 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2809 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2810 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2811 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2812 } else {
2813 // ch+ct is ln(2)/ln(10) to more than 36 bits
2814 const float ch_log10 = 0x1.344000p-2f;
2815 const float ct_log10 = 0x1.3509f6p-18f;
2816
2817 // ch + ct is ln(2) to more than 36 bits
2818 const float ch_log = 0x1.62e000p-1f;
2819 const float ct_log = 0x1.0bfbe8p-15f;
2820
2821 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2822 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2823
2824 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2825 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2826 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2827 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2828 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2829 // This adds correction terms for which contraction may lead to an increase
2830 // in the error of the approximation, so disable it.
2831 Flags.setAllowContract(false);
2832 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2833 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2834 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
2835 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2836 }
2837
2838 const bool IsFiniteOnly =
2839 (Flags.hasNoNaNs() || Options.NoNaNsFPMath) && Flags.hasNoInfs();
2840
2841 // TODO: Check if known finite from source value.
2842 if (!IsFiniteOnly) {
2843 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2844 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2845 }
2846
2847 if (IsScaled) {
2848 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2849 SDValue ShiftK =
2850 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2851 SDValue Shift =
2852 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2853 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2854 }
2855
2856 return R;
2857}
2858
2862
2863// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a
2864 // promoted f16 operation.
2865 SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
2866 SelectionDAG &DAG, bool IsLog10,
2867 SDNodeFlags Flags) const {
2868 EVT VT = Src.getValueType();
2869 unsigned LogOp =
2870 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2871
2872 double Log2BaseInverted =
2873 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
2874
2875 if (VT == MVT::f32) {
2876 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2877 if (ScaledInput) {
2878 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2879 SDValue ScaledResultOffset =
2880 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2881
2882 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2883
2884 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2885 ScaledResultOffset, Zero, Flags);
2886
2887 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2888
2889 if (Subtarget->hasFastFMAF32())
2890 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2891 Flags);
2892 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
2893 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2894 }
2895 }
2896
2897 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2898 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2899
2900 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2901 Flags);
2902}
2903
2904 SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
2905 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2906 // If we have to handle denormals, scale up the input and adjust the result.
2907
2908 SDLoc SL(Op);
2909 EVT VT = Op.getValueType();
2910 SDValue Src = Op.getOperand(0);
2911 SDNodeFlags Flags = Op->getFlags();
2912
2913 if (VT == MVT::f16) {
2914 // Nothing in half is a denormal when promoted to f32.
2915 assert(!isTypeLegal(MVT::f16));
2916 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2917 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
2918 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2919 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2920 }
2921
2922 assert(VT == MVT::f32);
2923
2924 if (!needsDenormHandlingF32(DAG, Src, Flags))
2925 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
2926
2927 // bool needs_scaling = x < -0x1.f80000p+6f;
2928 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
2929
2930 // -nextafter(128.0, -1)
2931 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
2932
2933 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2934
2935 SDValue NeedsScaling =
2936 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
2937
2938 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2939 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2940
2941 SDValue AddOffset =
2942 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
2943
2944 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
2945 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
2946
2947 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
2948 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2949 SDValue ResultScale =
2950 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
2951
2952 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
2953}
2954
2955 SDValue AMDGPUTargetLowering::lowerFEXPUnsafeImpl(SDValue X, const SDLoc &SL,
2956 SelectionDAG &DAG,
2957 SDNodeFlags Flags,
2958 bool IsExp10) const {
2959 // exp(x) -> exp2(M_LOG2E_F * x);
2960 // exp10(x) -> exp2(log2(10) * x);
2961 EVT VT = X.getValueType();
2962 SDValue Const =
2963 DAG.getConstantFP(IsExp10 ? 0x1.a934f0p+1f : numbers::log2e, SL, VT);
2964
2965 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Const, Flags);
2966 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2967 : (unsigned)ISD::FEXP2,
2968 SL, VT, Mul, Flags);
2969}
2970
2971 SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
2972 SelectionDAG &DAG,
2973 SDNodeFlags Flags) const {
2974 EVT VT = X.getValueType();
2975 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags))
2976 return lowerFEXPUnsafeImpl(X, SL, DAG, Flags, /*IsExp10=*/false);
2977
2978 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2979
2980 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
2981 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2982
2983 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2984
2985 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2986
2987 SDValue AdjustedX =
2988 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2989
2990 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
2991 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
2992
2993 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
2994
2995 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
2996 SDValue AdjustedResult =
2997 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
2998
2999 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
3000 Flags);
3001}
3002
3003/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
3004/// handled correctly.
3005 SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
3006 SelectionDAG &DAG,
3007 SDNodeFlags Flags) const {
3008 const EVT VT = X.getValueType();
3009
3010 const unsigned Exp2Op = VT == MVT::f32 ? static_cast<unsigned>(AMDGPUISD::EXP)
3011 : static_cast<unsigned>(ISD::FEXP2);
3012
3013 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
3014 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3015 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3016 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3017
3018 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
3019 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3020 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
3021 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3022 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
3023 }
3024
3025 // bool s = x < -0x1.2f7030p+5f;
3026 // x += s ? 0x1.0p+5f : 0.0f;
3027 // exp10 = exp2(x * 0x1.a92000p+1f) *
3028 // exp2(x * 0x1.4f0978p-11f) *
3029 // (s ? 0x1.9f623ep-107f : 1.0f);
3030
3031 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3032
3033 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
3034 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
3035
3036 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
3037 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
3038 SDValue AdjustedX =
3039 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
3040
3041 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3042 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3043
3044 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
3045 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3046 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
3047 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3048
3049 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
3050
3051 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
3052 SDValue AdjustedResult =
3053 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
3054
3055 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
3056 Flags);
3057}
3058
3059 SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
3060 EVT VT = Op.getValueType();
3061 SDLoc SL(Op);
3062 SDValue X = Op.getOperand(0);
3063 SDNodeFlags Flags = Op->getFlags();
3064 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
3065
3066 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3067 // library behavior. Also, is known-not-daz source sufficient?
3068 if (allowApproxFunc(DAG, Flags)) { // TODO: Does this really require fast?
3069 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
3070 : lowerFEXPUnsafe(X, SL, DAG, Flags);
3071 }
3072
3073 if (VT.getScalarType() == MVT::f16) {
3074 if (VT.isVector())
3075 return SDValue();
3076
3077 // Nothing in half is a denormal when promoted to f32.
3078 //
3079 // exp(f16 x) ->
3080 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3081 //
3082 // exp10(f16 x) ->
3083 // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
3084 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
3085 SDValue Lowered = lowerFEXPUnsafeImpl(Ext, SL, DAG, Flags, IsExp10);
3086 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
3087 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
3088 }
3089
3090 assert(VT == MVT::f32);
3091
3092 // Algorithm:
3093 //
3094 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3095 //
3096 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3097 // n = 64*m + j, 0 <= j < 64
3098 //
3099 // e^x = 2^((64*m + j + f)/64)
3100 // = (2^m) * (2^(j/64)) * 2^(f/64)
3101 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3102 //
3103 // f = x*(64/ln(2)) - n
3104 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3105 //
3106 // e^x = (2^m) * (2^(j/64)) * e^r
3107 //
3108 // (2^(j/64)) is precomputed
3109 //
3110 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3111 // e^r = 1 + q
3112 //
3113 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3114 //
3115 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
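// The code below follows the same idea but relies on the hardware exp2: the
// product x*log2(e) (or x*log2(10)) is computed in extended precision as PH + PL,
// split into a rounded integer part E and a small remainder, and the result is
// formed as ldexp(exp2(remainder), E).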
3116 SDNodeFlags FlagsNoContract = Flags;
3117 FlagsNoContract.setAllowContract(false);
3118
3119 SDValue PH, PL;
3120 if (Subtarget->hasFastFMAF32()) {
3121 const float c_exp = numbers::log2ef;
3122 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3123 const float c_exp10 = 0x1.a934f0p+1f;
3124 const float cc_exp10 = 0x1.2f346ep-24f;
3125
3126 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3127 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3128
3129 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3130 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
3131 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3132 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3133 } else {
3134 const float ch_exp = 0x1.714000p+0f;
3135 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3136
3137 const float ch_exp10 = 0x1.a92000p+1f;
3138 const float cl_exp10 = 0x1.4f0978p-11f;
3139
3140 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3141 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3142
3143 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3144 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3145 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3146 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3147 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3148
3149 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3150
3151 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3152 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3153 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3154 }
3155
3156 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3157
3158 // It is unsafe to contract this fsub into the PH multiply.
3159 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3160
3161 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3162 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3163 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3164
3165 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3166
3167 SDValue UnderflowCheckConst =
3168 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
3169
3170 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3171 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3172 SDValue Underflow =
3173 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3174
3175 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3176
3177 if (!Flags.hasNoInfs()) {
3178 SDValue OverflowCheckConst =
3179 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
3180 SDValue Overflow =
3181 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
3182 SDValue Inf =
3183 DAG.getConstantFP(APFloat::getInf(APFloat::IEEEsingle()), SL, VT);
3184 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3185 }
3186
3187 return R;
3188}
3189
3190static bool isCtlzOpc(unsigned Opc) {
3191 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3192}
3193
3194static bool isCttzOpc(unsigned Opc) {
3195 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3196}
3197
3198 SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
3199 SelectionDAG &DAG) const {
3200 auto SL = SDLoc(Op);
3201 auto Opc = Op.getOpcode();
3202 auto Arg = Op.getOperand(0u);
3203 auto ResultVT = Op.getValueType();
3204
3205 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3206 return {};
3207
3208 assert(isCtlzOpc(Opc) || isCttzOpc(Opc));
3209 assert(ResultVT == Arg.getValueType());
3210
3211 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3212 SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
3213 SDValue NewOp;
3214
3215 if (Opc == ISD::CTLZ_ZERO_UNDEF) {
3216 NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
3217 NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
3218 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3219 } else {
3220 NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3221 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3222 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
3223 }
3224
3225 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3226}
3227
3228 SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
3229 SDLoc SL(Op);
3230 SDValue Src = Op.getOperand(0);
3231
3232 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3233 bool Ctlz = isCtlzOpc(Op.getOpcode());
3234 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3235
3236 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3237 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3238 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3239
3240 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3241 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3242 // (cttz hi:lo) -> (umin (ffbl src), 32)
3243 // (ctlz_zero_undef src) -> (ffbh src)
3244 // (cttz_zero_undef src) -> (ffbl src)
3245
3246 // The 64-bit scalar version produces a 32-bit result:
3247 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3248 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3249 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3250 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3251 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3252 if (!ZeroUndef) {
3253 const SDValue ConstVal = DAG.getConstant(
3254 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3255 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3256 }
3257 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3258 }
3259
3260 SDValue Lo, Hi;
3261 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3262
3263 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3264 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3265
3266 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3267 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3268 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3269 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
3270
3271 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3272 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3273 if (Ctlz)
3274 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3275 else
3276 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3277
3278 SDValue NewOpr;
3279 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3280 if (!ZeroUndef) {
3281 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3282 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3283 }
3284
3285 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3286}
3287
3288 SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
3289 bool Signed) const {
3290 // The regular method converting a 64-bit integer to float roughly consists of
3291 // 2 steps: normalization and rounding. In fact, after normalization, the
3292 // conversion from a 64-bit integer to a float is essentially the same as the
3293 // one from a 32-bit integer. The only difference is that it has more
3294 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3295 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3296 // converted into the correct float number. The basic steps for the unsigned
3297 // conversion are illustrated in the following pseudo code:
3298 //
3299 // f32 uitofp(i64 u) {
3300 // i32 hi, lo = split(u);
3301 // // Only count the leading zeros in hi as we have native support of the
3302 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3303 // // reduced to a 32-bit one automatically.
3304 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3305 // u <<= shamt;
3306 // hi, lo = split(u);
3307 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3308 // // convert it as a 32-bit integer and scale the result back.
3309 // return uitofp(hi) * 2^(32 - shamt);
3310 // }
3311 //
3312 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3313 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
3314 // converted instead, followed by negation based on its sign bit.
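// Worked example for the unsigned path: u = 0x1'8000'0000. hi = 1, so
// shamt = clz(hi) = 31; after the shift u = 0xC000'0000'0000'0000, giving
// hi = 0xC0000000 and lo = 0, so no rounding adjustment is needed.
// uitofp(0xC0000000) = 3 * 2^30, and scaling by 2^(32 - 31) yields
// 6442450944.0f = (float)0x1'8000'0000 exactly.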
3315
3316 SDLoc SL(Op);
3317 SDValue Src = Op.getOperand(0);
3318
3319 SDValue Lo, Hi;
3320 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3321 SDValue Sign;
3322 SDValue ShAmt;
3323 if (Signed && Subtarget->isGCN()) {
3324 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3325 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3326 // account. That is, the maximal shift is
3327 // - 32 if Lo and Hi have opposite signs;
3328 // - 33 if Lo and Hi have the same sign.
3329 //
3330 // Or, MaxShAmt = 33 + OppositeSign, where
3331 //
3332 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3333 // - -1 if Lo and Hi have opposite signs; and
3334 // - 0 otherwise.
3335 //
3336 // All in all, ShAmt is calculated as
3337 //
3338 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3339 //
3340 // or
3341 //
3342 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3343 //
3344 // to reduce the critical path.
3345 SDValue OppositeSign = DAG.getNode(
3346 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3347 DAG.getConstant(31, SL, MVT::i32));
3348 SDValue MaxShAmt =
3349 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3350 OppositeSign);
3351 // Count the leading sign bits.
3352 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3353 // Different from unsigned conversion, the shift should be one bit less to
3354 // preserve the sign bit.
3355 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3356 DAG.getConstant(1, SL, MVT::i32));
3357 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3358 } else {
3359 if (Signed) {
3360 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3361 // absolute value first.
3362 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3363 DAG.getConstant(63, SL, MVT::i64));
3364 SDValue Abs =
3365 DAG.getNode(ISD::XOR, SL, MVT::i64,
3366 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3367 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3368 }
3369 // Count the leading zeros.
3370 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3371 // The shift amount for signed integers is [0, 32].
3372 }
3373 // Normalize the given 64-bit integer.
3374 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3375 // Split it again.
3376 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3377 // Calculate the adjust bit for rounding.
3378 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3379 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3380 DAG.getConstant(1, SL, MVT::i32), Lo);
3381 // Get the 32-bit normalized integer.
3382 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3383 // Convert the normalized 32-bit integer into f32.
3384
3385 bool UseLDEXP = isOperationLegal(ISD::FLDEXP, MVT::f32);
3386 unsigned Opc = Signed && UseLDEXP ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3387 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3388
3389 // Finally, need to scale back the converted floating number as the original
3390 // 64-bit integer is converted as a 32-bit one.
3391 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3392 ShAmt);
3393 // On GCN, use LDEXP directly.
3394 if (UseLDEXP)
3395 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3396
3397 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3398 // part directly to emulate the multiplication by 2^ShAmt. That 8-bit
3399 // exponent is enough to avoid overflowing into the sign bit.
3400 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3401 DAG.getConstant(23, SL, MVT::i32));
3402 SDValue IVal =
3403 DAG.getNode(ISD::ADD, SL, MVT::i32,
3404 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3405 if (Signed) {
3406 // Set the sign bit.
3407 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3408 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3409 DAG.getConstant(31, SL, MVT::i32));
3410 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3411 }
3412 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3413}
3414
3415 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3416 bool Signed) const {
3417 SDLoc SL(Op);
3418 SDValue Src = Op.getOperand(0);
3419
3420 SDValue Lo, Hi;
3421 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3422
3423 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3424 SL, MVT::f64, Hi);
3425
3426 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3427
3428 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3429 DAG.getConstant(32, SL, MVT::i32));
3430 // TODO: Should this propagate fast-math-flags?
3431 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3432}
3433
3434 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3435 SelectionDAG &DAG) const {
3436 // TODO: Factor out code common with LowerSINT_TO_FP.
3437 EVT DestVT = Op.getValueType();
3438 SDValue Src = Op.getOperand(0);
3439 EVT SrcVT = Src.getValueType();
3440
3441 if (SrcVT == MVT::i16) {
3442 if (DestVT == MVT::f16)
3443 return Op;
3444 SDLoc DL(Op);
3445
3446 // Promote src to i32
3447 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3448 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3449 }
3450
3451 if (DestVT == MVT::bf16) {
3452 SDLoc SL(Op);
3453 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3454 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3455 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3456 }
3457
3458 if (SrcVT != MVT::i64)
3459 return Op;
3460
3461 if (DestVT == MVT::f16 && isTypeLegal(MVT::f16)) {
3462 SDLoc DL(Op);
3463
3464 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3465 SDValue FPRoundFlag =
3466 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3467 SDValue FPRound =
3468 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3469
3470 return FPRound;
3471 }
3472
3473 if (DestVT == MVT::f32)
3474 return LowerINT_TO_FP32(Op, DAG, false);
3475
3476 assert(DestVT == MVT::f64);
3477 return LowerINT_TO_FP64(Op, DAG, false);
3478}
3479
3480SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3481                                               SelectionDAG &DAG) const {
3482 EVT DestVT = Op.getValueType();
3483
3484 SDValue Src = Op.getOperand(0);
3485 EVT SrcVT = Src.getValueType();
3486
3487 if (SrcVT == MVT::i16) {
3488 if (DestVT == MVT::f16)
3489 return Op;
3490
3491 SDLoc DL(Op);
3492 // Promote src to i32
3493 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3494 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3495 }
3496
3497 if (DestVT == MVT::bf16) {
3498 SDLoc SL(Op);
3499 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3500 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3501 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3502 }
3503
3504 if (SrcVT != MVT::i64)
3505 return Op;
3506
3507 // TODO: Factor out code common with LowerUINT_TO_FP.
3508
3509 if (DestVT == MVT::f16 && isTypeLegal(MVT::f16)) {
3510 SDLoc DL(Op);
3511 SDValue Src = Op.getOperand(0);
3512
3513 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3514 SDValue FPRoundFlag =
3515 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3516 SDValue FPRound =
3517 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3518
3519 return FPRound;
3520 }
3521
3522 if (DestVT == MVT::f32)
3523 return LowerINT_TO_FP32(Op, DAG, true);
3524
3525 assert(DestVT == MVT::f64);
3526 return LowerINT_TO_FP64(Op, DAG, true);
3527}
3528
3529SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3530                                                bool Signed) const {
3531 SDLoc SL(Op);
3532
3533 SDValue Src = Op.getOperand(0);
3534 EVT SrcVT = Src.getValueType();
3535
3536 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3537
3538 // The basic idea of converting a floating point number into a pair of 32-bit
3539 // integers is illustrated as follows:
3540 //
3541 // tf := trunc(val);
3542 // hif := floor(tf * 2^-32);
3543 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3544 // hi := fptoi(hif);
3545 // lo := fptoi(lof);
3546 //
3547 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3548 SDValue Sign;
3549 if (Signed && SrcVT == MVT::f32) {
3550    // However, a 32-bit floating point number has only a 23-bit mantissa and
3551    // that is not enough to hold all the significant bits of `lof` if val is
3552    // negative. To avoid the loss of precision, we need to take the absolute
3553    // value after truncating and flip the result back based on the original
3554    // signedness.
3555 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3556 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3557 DAG.getConstant(31, SL, MVT::i32));
3558 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3559 }
3560
3561 SDValue K0, K1;
3562 if (SrcVT == MVT::f64) {
3563 K0 = DAG.getConstantFP(
3564 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3565 SrcVT);
3566 K1 = DAG.getConstantFP(
3567 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3568 SrcVT);
3569 } else {
3570 K0 = DAG.getConstantFP(
3571 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3572 K1 = DAG.getConstantFP(
3573 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3574 }
3575 // TODO: Should this propagate fast-math-flags?
3576 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3577
3578 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3579
3580 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3581
3582  SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3583                                                          : ISD::FP_TO_UINT,
3584                           SL, MVT::i32, FloorMul);
3585 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3586
3587 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3588 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3589
3590 if (Signed && SrcVT == MVT::f32) {
3591 assert(Sign);
3592 // Flip the result based on the signedness, which is either all 0s or 1s.
3593 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3594 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3595 // r := xor(r, sign) - sign;
3596 Result =
3597 DAG.getNode(ISD::SUB, SL, MVT::i64,
3598 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3599 }
3600
3601 return Result;
3602}
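// Illustrative walk-through, not part of the original source: for
// Src = 8589934597.0 (f64, i.e. 2^33 + 5):
//   Trunc    = 8589934597.0
//   Mul      = Trunc * 2^-32 ~= 2.0000000012
//   FloorMul = 2.0
//   Fma      = FloorMul * (-2^32) + Trunc = 5.0
//   Hi = 2, Lo = 5, Result = 0x0000000200000005
// The signed f32 path strips the sign before this sequence and re-applies it
// at the end, because f32 precision cannot represent `lof` exactly for
// negative inputs.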
3603
3604SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3605  SDLoc DL(Op);
3606 SDValue N0 = Op.getOperand(0);
3607
3608 // Convert to target node to get known bits
3609 if (N0.getValueType() == MVT::f32)
3610 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3611
3612 if (Op->getFlags().hasApproximateFuncs()) {
3613 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3614 return SDValue();
3615 }
3616
3617 return LowerF64ToF16Safe(N0, DL, DAG);
3618}
3619
3620// Returns the f64 -> f16 result bits as an i32 node.
3621SDValue AMDGPUTargetLowering::LowerF64ToF16Safe(SDValue Src, const SDLoc &DL,
3622                                                SelectionDAG &DAG) const {
3623 assert(Src.getSimpleValueType() == MVT::f64);
3624
3625 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3626 // TODO: We can generate better code for True16.
3627 const unsigned ExpMask = 0x7ff;
3628 const unsigned ExpBiasf64 = 1023;
3629 const unsigned ExpBiasf16 = 15;
3630 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3631 SDValue One = DAG.getConstant(1, DL, MVT::i32);
3632 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Src);
3633 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3634 DAG.getConstant(32, DL, MVT::i64));
3635 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3636 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3637 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3638 DAG.getConstant(20, DL, MVT::i64));
3639 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3640 DAG.getConstant(ExpMask, DL, MVT::i32));
3641 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3642 // add the f16 bias (15) to get the biased exponent for the f16 format.
3643 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3644 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
3645
3646 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3647 DAG.getConstant(8, DL, MVT::i32));
3648 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3649 DAG.getConstant(0xffe, DL, MVT::i32));
3650
3651 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3652 DAG.getConstant(0x1ff, DL, MVT::i32));
3653 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3654
3655 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3656 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3657
3658 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3659 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3660 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3661 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
3662
3663 // N = M | (E << 12);
3664 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3665 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3666 DAG.getConstant(12, DL, MVT::i32)));
3667
3668 // B = clamp(1-E, 0, 13);
3669 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3670 One, E);
3671 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3672 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3673 DAG.getConstant(13, DL, MVT::i32));
3674
3675 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3676 DAG.getConstant(0x1000, DL, MVT::i32));
3677
3678 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3679 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3680 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3681 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3682
3683 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3684 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3685 DAG.getConstant(0x7, DL, MVT::i32));
3686 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3687 DAG.getConstant(2, DL, MVT::i32));
3688 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3689 One, Zero, ISD::SETEQ);
3690 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3691 One, Zero, ISD::SETGT);
3692 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3693 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3694
3695 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3696 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3697 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3698 I, V, ISD::SETEQ);
3699
3700 // Extract the sign bit.
3701 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3702 DAG.getConstant(16, DL, MVT::i32));
3703 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3704 DAG.getConstant(0x8000, DL, MVT::i32));
3705
3706 return DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3707}
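// Clarifying notes, not part of the original source: E is rebiased as
// E = Ef64 - 1023 + 15, so for example
//   1.0 (Ef64 = 1023)     -> E = 15, the f16 biased exponent for 2^0
//   Inf/NaN (Ef64 = 2047) -> E = 1039, which the final select compares
//                            against to substitute the Inf/NaN pattern I
//   E > 30                -> overflow, clamped to 0x7c00 (f16 infinity)
// The D0/D1/V0/V1 selects implement round-to-nearest-even on the mantissa
// bits that are discarded on the normal and denormal paths.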
3708
3709SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
3710                                              SelectionDAG &DAG) const {
3711 SDValue Src = Op.getOperand(0);
3712 unsigned OpOpcode = Op.getOpcode();
3713 EVT SrcVT = Src.getValueType();
3714 EVT DestVT = Op.getValueType();
3715
3716 // Will be selected natively
3717 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3718 return Op;
3719
3720 if (SrcVT == MVT::bf16) {
3721 SDLoc DL(Op);
3722 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3723 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
3724 }
3725
3726 // Promote i16 to i32
3727 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3728 SDLoc DL(Op);
3729
3730 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3731 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3732 }
3733
3734 if (DestVT != MVT::i64)
3735 return Op;
3736
3737 if (SrcVT == MVT::f16 ||
3738 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3739 SDLoc DL(Op);
3740
3741 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3742    unsigned Ext =
3743        OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3744    return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3745 }
3746
3747 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3748 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3749
3750 return SDValue();
3751}
3752
3753SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3754                                                      SelectionDAG &DAG) const {
3755 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3756 MVT VT = Op.getSimpleValueType();
3757 MVT ScalarVT = VT.getScalarType();
3758
3759 assert(VT.isVector());
3760
3761 SDValue Src = Op.getOperand(0);
3762 SDLoc DL(Op);
3763
3764 // TODO: Don't scalarize on Evergreen?
3765 unsigned NElts = VT.getVectorNumElements();
3766  SmallVector<SDValue, 8> Args;
3767  DAG.ExtractVectorElements(Src, Args, 0, NElts);
3768
3769 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3770 for (unsigned I = 0; I < NElts; ++I)
3771 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
3772
3773 return DAG.getBuildVector(VT, DL, Args);
3774}
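// Illustrative example, not part of the original source: a
// (v2i32 sign_extend_inreg v2i32:x, i8) is scalarized into
//   (build_vector (sign_extend_inreg i32:x0, i8),
//                 (sign_extend_inreg i32:x1, i8))
// so each lane can be matched independently later (e.g. by a bitfield-extract
// style pattern).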
3775
3776//===----------------------------------------------------------------------===//
3777// Custom DAG optimizations
3778//===----------------------------------------------------------------------===//
3779
3780static bool isU24(SDValue Op, SelectionDAG &DAG) {
3781 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3782}
3783
3784static bool isI24(SDValue Op, SelectionDAG &DAG) {
3785 EVT VT = Op.getValueType();
3786 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3787 // as unsigned 24-bit values.
3788         AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3789}
3790
3791SDValue AMDGPUTargetLowering::simplifyMul24(
3792    SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI) const {
3793  SelectionDAG &DAG = DCI.DAG;
3794 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3795 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3796
3797 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
3798 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
3799 unsigned NewOpcode = Node24->getOpcode();
3800 if (IsIntrin) {
3801 unsigned IID = Node24->getConstantOperandVal(0);
3802 switch (IID) {
3803 case Intrinsic::amdgcn_mul_i24:
3804 NewOpcode = AMDGPUISD::MUL_I24;
3805 break;
3806 case Intrinsic::amdgcn_mul_u24:
3807 NewOpcode = AMDGPUISD::MUL_U24;
3808 break;
3809 case Intrinsic::amdgcn_mulhi_i24:
3810 NewOpcode = AMDGPUISD::MULHI_I24;
3811 break;
3812 case Intrinsic::amdgcn_mulhi_u24:
3813 NewOpcode = AMDGPUISD::MULHI_U24;
3814 break;
3815 default:
3816 llvm_unreachable("Expected 24-bit mul intrinsic");
3817 }
3818 }
3819
3820 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
3821
3822 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3823 // the operands to have other uses, but will only perform simplifications that
3824 // involve bypassing some nodes for this user.
3825 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
3826 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
3827 if (DemandedLHS || DemandedRHS)
3828 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
3829 DemandedLHS ? DemandedLHS : LHS,
3830 DemandedRHS ? DemandedRHS : RHS);
3831
3832 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3833 // operands if this node is the only user.
3834 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
3835 return SDValue(Node24, 0);
3836 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
3837 return SDValue(Node24, 0);
3838
3839 return SDValue();
3840}
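// Illustrative example, not part of the original source: since only the low
// 24 bits of each operand are demanded, a mask that a plain i32 multiply
// would need becomes redundant here, e.g.
//   (mul_u24 (and x, 0xffffff), y) --> (mul_u24 x, y)
// either by bypassing the AND for this one use (SimplifyMultipleUseDemandedBits)
// or by rewriting it when this node is the only user (SimplifyDemandedBits).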
3841
3842template <typename IntTy>
3843static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3844                               uint32_t Width, const SDLoc &DL) {
3845 if (Width + Offset < 32) {
3846 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3847 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3848 if constexpr (std::is_signed_v<IntTy>) {
3849 return DAG.getSignedConstant(Result, DL, MVT::i32);
3850 } else {
3851 return DAG.getConstant(Result, DL, MVT::i32);
3852 }
3853 }
3854
3855 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3856}
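// Illustrative example, not part of the original source: folding a bitfield
// extract of Src0 = 0xabcd1234 with Offset = 8, Width = 8:
//   Shl    = 0xabcd1234 << (32 - 8 - 8) = 0x12340000
//   Result = Shl >> (32 - 8)            = 0x12   (bits [15:8] of Src0)
// With a signed IntTy the final shift is arithmetic, so the extracted field
// is sign-extended; when Width + Offset == 32 the plain Src0 >> Offset in the
// fallthrough already yields the correct (sign- or zero-extended) field.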
3857
3858static bool hasVolatileUser(SDNode *Val) {
3859 for (SDNode *U : Val->users()) {
3860 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
3861 if (M->isVolatile())
3862 return true;
3863 }
3864 }
3865
3866 return false;
3867}
3868
3869bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3870  // i32 vectors are the canonical memory type.
3871 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3872 return false;
3873
3874 if (!VT.isByteSized())
3875 return false;
3876
3877 unsigned Size = VT.getStoreSize();
3878
3879 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3880 return false;
3881
3882 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3883 return false;
3884
3885 return true;
3886}
3887
3888// Replace load of an illegal type with a bitcast from a load of a friendlier
3889// type.
3890SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3891                                                 DAGCombinerInfo &DCI) const {
3892 if (!DCI.isBeforeLegalize())
3893 return SDValue();
3894
3895  LoadSDNode *LN = cast<LoadSDNode>(N);
3896  if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
3897 return SDValue();
3898
3899 SDLoc SL(N);
3900 SelectionDAG &DAG = DCI.DAG;
3901 EVT VT = LN->getMemoryVT();
3902
3903 unsigned Size = VT.getStoreSize();
3904 Align Alignment = LN->getAlign();
3905 if (Alignment < Size && isTypeLegal(VT)) {
3906 unsigned IsFast;
3907 unsigned AS = LN->getAddressSpace();
3908
3909 // Expand unaligned loads earlier than legalization. Due to visitation order
3910 // problems during legalization, the emitted instructions to pack and unpack
3911 // the bytes again are not eliminated in the case of an unaligned copy.
3912    if (!allowsMisalignedMemoryAccesses(
3913            VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3914 if (VT.isVector())
3915 return SplitVectorLoad(SDValue(LN, 0), DAG);
3916
3917 SDValue Ops[2];
3918 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3919
3920 return DAG.getMergeValues(Ops, SDLoc(N));
3921 }
3922
3923 if (!IsFast)
3924 return SDValue();
3925 }
3926
3927 if (!shouldCombineMemoryType(VT))
3928 return SDValue();
3929
3930 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3931
3932 SDValue NewLoad
3933 = DAG.getLoad(NewVT, SL, LN->getChain(),
3934 LN->getBasePtr(), LN->getMemOperand());
3935
3936 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3937 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3938 return SDValue(N, 0);
3939}
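// Illustrative example, not part of the original source: an aligned load of
// an illegal type such as i128 becomes
//   (i128 (bitcast (v4i32 (load addr))))
// since i32 vectors are the canonical memory type (see
// shouldCombineMemoryType above).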
3940
3941// Replace store of an illegal type with a store of a bitcast to a friendlier
3942// type.
3943SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3944                                                  DAGCombinerInfo &DCI) const {
3945 if (!DCI.isBeforeLegalize())
3946 return SDValue();
3947
3948  StoreSDNode *SN = cast<StoreSDNode>(N);
3949  if (!SN->isSimple() || !ISD::isNormalStore(SN))
3950 return SDValue();
3951
3952 EVT VT = SN->getMemoryVT();
3953 unsigned Size = VT.getStoreSize();
3954
3955 SDLoc SL(N);
3956 SelectionDAG &DAG = DCI.DAG;
3957 Align Alignment = SN->getAlign();
3958 if (Alignment < Size && isTypeLegal(VT)) {
3959 unsigned IsFast;
3960 unsigned AS = SN->getAddressSpace();
3961
3962 // Expand unaligned stores earlier than legalization. Due to visitation
3963 // order problems during legalization, the emitted instructions to pack and
3964 // unpack the bytes again are not eliminated in the case of an unaligned
3965 // copy.
3966    if (!allowsMisalignedMemoryAccesses(
3967            VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3968 if (VT.isVector())
3969 return SplitVectorStore(SDValue(SN, 0), DAG);
3970
3971 return expandUnalignedStore(SN, DAG);
3972 }
3973
3974 if (!IsFast)
3975 return SDValue();
3976 }
3977
3978 if (!shouldCombineMemoryType(VT))
3979 return SDValue();
3980
3981 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3982 SDValue Val = SN->getValue();
3983
3984 //DCI.AddToWorklist(Val.getNode());
3985
3986 bool OtherUses = !Val.hasOneUse();
3987 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3988 if (OtherUses) {
3989 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3990 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3991 }
3992
3993 return DAG.getStore(SN->getChain(), SL, CastVal,
3994 SN->getBasePtr(), SN->getMemOperand());
3995}
3996
3997// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3998// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3999// issues.
4000SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
4001                                                         DAGCombinerInfo &DCI) const {
4002 SelectionDAG &DAG = DCI.DAG;
4003 SDValue N0 = N->getOperand(0);
4004
4005 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
4006 // (vt2 (truncate (assertzext vt0:x, vt1)))
4007 if (N0.getOpcode() == ISD::TRUNCATE) {
4008 SDValue N1 = N->getOperand(1);
4009 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
4010 SDLoc SL(N);
4011
4012 SDValue Src = N0.getOperand(0);
4013 EVT SrcVT = Src.getValueType();
4014 if (SrcVT.bitsGE(ExtVT)) {
4015 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
4016 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
4017 }
4018 }
4019
4020 return SDValue();
4021}
4022
4023SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
4024    SDNode *N, DAGCombinerInfo &DCI) const {
4025 unsigned IID = N->getConstantOperandVal(0);
4026 switch (IID) {
4027 case Intrinsic::amdgcn_mul_i24:
4028 case Intrinsic::amdgcn_mul_u24:
4029 case Intrinsic::amdgcn_mulhi_i24:
4030 case Intrinsic::amdgcn_mulhi_u24:
4031 return simplifyMul24(N, DCI);
4032 case Intrinsic::amdgcn_fract:
4033 case Intrinsic::amdgcn_rsq:
4034 case Intrinsic::amdgcn_rcp_legacy:
4035 case Intrinsic::amdgcn_rsq_legacy:
4036 case Intrinsic::amdgcn_rsq_clamp:
4037 case Intrinsic::amdgcn_tanh:
4038 case Intrinsic::amdgcn_prng_b32: {
4039 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
4040 SDValue Src = N->getOperand(1);
4041 return Src.isUndef() ? Src : SDValue();
4042 }
4043 case Intrinsic::amdgcn_frexp_exp: {
4044 // frexp_exp (fneg x) -> frexp_exp x
4045 // frexp_exp (fabs x) -> frexp_exp x
4046 // frexp_exp (fneg (fabs x)) -> frexp_exp x
4047 SDValue Src = N->getOperand(1);
4048 SDValue PeekSign = peekFPSignOps(Src);
4049 if (PeekSign == Src)
4050 return SDValue();
4051 return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
4052 0);
4053 }
4054 default:
4055 return SDValue();
4056 }
4057}
4058
4059/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
4060/// binary operation \p Opc to it with the corresponding constant operands.
4061SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
4062    DAGCombinerInfo &DCI, const SDLoc &SL,
4063 unsigned Opc, SDValue LHS,
4064 uint32_t ValLo, uint32_t ValHi) const {
4065 SelectionDAG &DAG = DCI.DAG;
4066 SDValue Lo, Hi;
4067 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
4068
4069 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
4070 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
4071
4072 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
4073 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
4074
4075 // Re-visit the ands. It's possible we eliminated one of them and it could
4076 // simplify the vector.
4077 DCI.AddToWorklist(Lo.getNode());
4078 DCI.AddToWorklist(Hi.getNode());
4079
4080 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
4081 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4082}
4083
4084SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
4085                                                DAGCombinerInfo &DCI) const {
4086 EVT VT = N->getValueType(0);
4087 SDValue LHS = N->getOperand(0);
4088 SDValue RHS = N->getOperand(1);
4089  ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4090  SDLoc SL(N);
4091 SelectionDAG &DAG = DCI.DAG;
4092
4093 unsigned RHSVal;
4094 if (CRHS) {
4095 RHSVal = CRHS->getZExtValue();
4096 if (!RHSVal)
4097 return LHS;
4098
4099 switch (LHS->getOpcode()) {
4100 default:
4101 break;
4102 case ISD::ZERO_EXTEND:
4103 case ISD::SIGN_EXTEND:
4104 case ISD::ANY_EXTEND: {
4105 SDValue X = LHS->getOperand(0);
4106
4107 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
4108 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
4109 // Prefer build_vector as the canonical form if packed types are legal.
4110 // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
4111 SDValue Vec = DAG.getBuildVector(
4112 MVT::v2i16, SL,
4113 {DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0)});
4114 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
4115 }
4116
4117 // shl (ext x) => zext (shl x), if shift does not overflow int
4118 if (VT != MVT::i64)
4119 break;
4120 KnownBits Known = DAG.computeKnownBits(X);
4121 unsigned LZ = Known.countMinLeadingZeros();
4122 if (LZ < RHSVal)
4123 break;
4124 EVT XVT = X.getValueType();
4125 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(CRHS, 0));
4126 return DAG.getZExtOrTrunc(Shl, SL, VT);
4127 }
4128 }
4129 }
4130
4131 if (VT.getScalarType() != MVT::i64)
4132 return SDValue();
4133
4134 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4135 // common case, splitting this into a move and a 32-bit shift is faster and
4136 // the same code size.
4137 KnownBits Known = DAG.computeKnownBits(RHS);
4138
4139 EVT ElementType = VT.getScalarType();
4140 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4141 EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);
4142
4143 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4144 return SDValue();
4145 SDValue ShiftAmt;
4146
4147 if (CRHS) {
4148 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4149 TargetType);
4150 } else {
4151 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4152 const SDValue ShiftMask =
4153 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4154 // This AND instruction will clamp out of bounds shift values.
4155 // It will also be removed during later instruction selection.
4156 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4157 }
4158
4159 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, LHS);
4160 SDValue NewShift =
4161 DAG.getNode(ISD::SHL, SL, TargetType, Lo, ShiftAmt, N->getFlags());
4162
4163 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4164 SDValue Vec;
4165
4166 if (VT.isVector()) {
4167 EVT ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4168 unsigned NElts = TargetType.getVectorNumElements();
4169    SmallVector<SDValue, 8> HiOps;
4170    SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4171
4172 DAG.ExtractVectorElements(NewShift, HiOps, 0, NElts);
4173 for (unsigned I = 0; I != NElts; ++I)
4174 HiAndLoOps[2 * I + 1] = HiOps[I];
4175 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4176 } else {
4177 EVT ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4178 Vec = DAG.getBuildVector(ConcatType, SL, {Zero, NewShift});
4179 }
4180 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4181}
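// Illustrative example, not part of the original source: once the shift
// amount is known to be at least 32, e.g. (shl i64:x, 40), the combine emits
//   lo_32(result) = 0
//   hi_32(result) = (shl (trunc x to i32), 40 - 32)
// packed with a build_vector and bitcast back to i64, so a slow 64-bit shift
// is replaced by a 32-bit one.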
4182
4183SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
4184                                                DAGCombinerInfo &DCI) const {
4185 SDValue RHS = N->getOperand(1);
4186  ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4187  EVT VT = N->getValueType(0);
4188 SDValue LHS = N->getOperand(0);
4189 SelectionDAG &DAG = DCI.DAG;
4190 SDLoc SL(N);
4191
4192 if (VT.getScalarType() != MVT::i64)
4193 return SDValue();
4194
4195 // For C >= 32
4196 // i64 (sra x, C) -> (build_pair (sra hi_32(x), C - 32), sra hi_32(x), 31))
4197
4198 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4199 // common case, splitting this into a move and a 32-bit shift is faster and
4200 // the same code size.
4201 KnownBits Known = DAG.computeKnownBits(RHS);
4202
4203 EVT ElementType = VT.getScalarType();
4204 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4205 EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);
4206
4207 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4208 return SDValue();
4209
4210 SDValue ShiftFullAmt =
4211 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4212 SDValue ShiftAmt;
4213 if (CRHS) {
4214 unsigned RHSVal = CRHS->getZExtValue();
4215 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4216 TargetType);
4217 } else if (Known.getMinValue().getZExtValue() ==
4218 (ElementType.getSizeInBits() - 1)) {
4219 ShiftAmt = ShiftFullAmt;
4220 } else {
4221 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4222 const SDValue ShiftMask =
4223 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4224 // This AND instruction will clamp out of bounds shift values.
4225 // It will also be removed during later instruction selection.
4226 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4227 }
4228
4229 EVT ConcatType;
4230 SDValue Hi;
4231 SDLoc LHSSL(LHS);
4232 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4233 if (VT.isVector()) {
4234 unsigned NElts = TargetType.getVectorNumElements();
4235 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4236 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4237 SmallVector<SDValue, 8> HiOps(NElts);
4238 SmallVector<SDValue, 16> HiAndLoOps;
4239
4240 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, 0, NElts * 2);
4241 for (unsigned I = 0; I != NElts; ++I) {
4242 HiOps[I] = HiAndLoOps[2 * I + 1];
4243 }
4244 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
4245 } else {
4246 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
4247 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4248 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4249 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
4250 }
4251
4252 KnownBits KnownLHS = DAG.computeKnownBits(LHS);
4253 SDValue HiShift;
4254 if (KnownLHS.isNegative()) {
4255 HiShift = DAG.getAllOnesConstant(SL, TargetType);
4256 } else {
4257 Hi = DAG.getFreeze(Hi);
4258 HiShift = DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftFullAmt);
4259 }
4260 SDValue NewShift =
4261 DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4262
4263 SDValue Vec;
4264 if (VT.isVector()) {
4265 unsigned NElts = TargetType.getVectorNumElements();
4266    SmallVector<SDValue, 8> HiOps;
4267    SmallVector<SDValue, 8> LoOps;
4268    SmallVector<SDValue, 16> HiAndLoOps(NElts * 2);
4269
4270 DAG.ExtractVectorElements(HiShift, HiOps, 0, NElts);
4271 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
4272 for (unsigned I = 0; I != NElts; ++I) {
4273 HiAndLoOps[2 * I + 1] = HiOps[I];
4274 HiAndLoOps[2 * I] = LoOps[I];
4275 }
4276 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4277 } else {
4278 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, HiShift});
4279 }
4280 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4281}
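// Illustrative example, not part of the original source: for (sra i64:x, 40)
// the combine produces
//   lo_32(result) = (sra hi_32(x), 40 - 32)
//   hi_32(result) = (sra hi_32(x), 31), or all-ones if x is known negative
// i.e. the low half holds the shifted high word and the high half is the
// replicated sign bit, matching the C >= 32 expansion documented above.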
4282
4283SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
4284                                                DAGCombinerInfo &DCI) const {
4285 SDValue RHS = N->getOperand(1);
4286  ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4287  EVT VT = N->getValueType(0);
4288 SDValue LHS = N->getOperand(0);
4289 SelectionDAG &DAG = DCI.DAG;
4290 SDLoc SL(N);
4291 unsigned RHSVal;
4292
4293 if (CRHS) {
4294 RHSVal = CRHS->getZExtValue();
4295
4296 // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
4297 // this improves the ability to match BFE patterns in isel.
4298 if (LHS.getOpcode() == ISD::AND) {
4299 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
4300 unsigned MaskIdx, MaskLen;
4301 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4302 MaskIdx == RHSVal) {
4303 return DAG.getNode(ISD::AND, SL, VT,
4304 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0),
4305 N->getOperand(1)),
4306 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1),
4307 N->getOperand(1)));
4308 }
4309 }
4310 }
4311 }
4312
4313 if (VT.getScalarType() != MVT::i64)
4314 return SDValue();
4315
4316 // for C >= 32
4317 // i64 (srl x, C) -> (build_pair (srl hi_32(x), C - 32), 0)
4318
4319 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4320 // common case, splitting this into a move and a 32-bit shift is faster and
4321 // the same code size.
4322 KnownBits Known = DAG.computeKnownBits(RHS);
4323
4324 EVT ElementType = VT.getScalarType();
4325 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4326 EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);
4327
4328 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4329 return SDValue();
4330
4331 SDValue ShiftAmt;
4332 if (CRHS) {
4333 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4334 TargetType);
4335 } else {
4336 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4337 const SDValue ShiftMask =
4338 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4339 // This AND instruction will clamp out of bounds shift values.
4340 // It will also be removed during later instruction selection.
4341 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4342 }
4343
4344 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4345 EVT ConcatType;
4346 SDValue Hi;
4347 SDLoc LHSSL(LHS);
4348 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4349 if (VT.isVector()) {
4350 unsigned NElts = TargetType.getVectorNumElements();
4351 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4352 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4353 SmallVector<SDValue, 8> HiOps(NElts);
4354 SmallVector<SDValue, 16> HiAndLoOps;
4355
4356 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, /*Start=*/0, NElts * 2);
4357 for (unsigned I = 0; I != NElts; ++I)
4358 HiOps[I] = HiAndLoOps[2 * I + 1];
4359 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
4360 } else {
4361 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
4362 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4363 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4364 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
4365 }
4366
4367 SDValue NewShift =
4368 DAG.getNode(ISD::SRL, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4369
4370 SDValue Vec;
4371 if (VT.isVector()) {
4372 unsigned NElts = TargetType.getVectorNumElements();
4373    SmallVector<SDValue, 8> LoOps;
4374    SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4375
4376 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
4377 for (unsigned I = 0; I != NElts; ++I)
4378 HiAndLoOps[2 * I] = LoOps[I];
4379 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4380 } else {
4381 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, Zero});
4382 }
4383 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4384}
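// Illustrative example, not part of the original source: for (srl i64:x, 40)
// the combine produces
//   lo_32(result) = (srl hi_32(x), 40 - 32)
//   hi_32(result) = 0
// which is the (build_pair (srl hi_32(x), C - 32), 0) form from the comment
// above, implemented with a 32-bit shift instead of a 64-bit one.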
4385
4386SDValue AMDGPUTargetLowering::performTruncateCombine(
4387    SDNode *N, DAGCombinerInfo &DCI) const {
4388 SDLoc SL(N);
4389 SelectionDAG &DAG = DCI.DAG;
4390 EVT VT = N->getValueType(0);
4391 SDValue Src = N->getOperand(0);
4392
4393 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4394 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4395 SDValue Vec = Src.getOperand(0);
4396 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4397 SDValue Elt0 = Vec.getOperand(0);
4398 EVT EltVT = Elt0.getValueType();
4399 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4400 if (EltVT.isFloatingPoint()) {
4401 Elt0 = DAG.getNode(ISD::BITCAST, SL,
4402 EltVT.changeTypeToInteger(), Elt0);
4403 }
4404
4405 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
4406 }
4407 }
4408 }
4409
4410 // Equivalent of above for accessing the high element of a vector as an
4411 // integer operation.
4412 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
4413 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4414 if (auto *K = isConstOrConstSplat(Src.getOperand(1))) {
4415 SDValue BV = stripBitcast(Src.getOperand(0));
4416 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
4417 EVT SrcEltVT = BV.getOperand(0).getValueType();
4418 unsigned SrcEltSize = SrcEltVT.getSizeInBits();
4419 unsigned BitIndex = K->getZExtValue();
4420 unsigned PartIndex = BitIndex / SrcEltSize;
4421
4422 if (PartIndex * SrcEltSize == BitIndex &&
4423 PartIndex < BV.getNumOperands()) {
4424 if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
4425 SDValue SrcElt =
4426 DAG.getNode(ISD::BITCAST, SL, SrcEltVT.changeTypeToInteger(),
4427 BV.getOperand(PartIndex));
4428 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
4429 }
4430 }
4431 }
4432 }
4433 }
4434
4435 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4436 //
4437 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4438 // i16 (trunc (srl (i32 (trunc x), K)))
4439 if (VT.getScalarSizeInBits() < 32) {
4440 EVT SrcVT = Src.getValueType();
4441 if (SrcVT.getScalarSizeInBits() > 32 &&
4442 (Src.getOpcode() == ISD::SRL ||
4443 Src.getOpcode() == ISD::SRA ||
4444 Src.getOpcode() == ISD::SHL)) {
4445 SDValue Amt = Src.getOperand(1);
4446 KnownBits Known = DAG.computeKnownBits(Amt);
4447
4448 // - For left shifts, do the transform as long as the shift
4449 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4450 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4451 // losing information stored in the high bits when truncating.
4452 const unsigned MaxCstSize =
4453 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4454 if (Known.getMaxValue().ule(MaxCstSize)) {
4455 EVT MidVT = VT.isVector() ?
4456 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4457 VT.getVectorNumElements()) : MVT::i32;
4458
4459 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
4460 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
4461 Src.getOperand(0));
4462 DCI.AddToWorklist(Trunc.getNode());
4463
4464 if (Amt.getValueType() != NewShiftVT) {
4465 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
4466 DCI.AddToWorklist(Amt.getNode());
4467 }
4468
4469 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
4470 Trunc, Amt);
4471 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
4472 }
4473 }
4474 }
4475
4476 return SDValue();
4477}
4478
4479// We need to specifically handle i64 mul here to avoid unnecessary conversion
4480// instructions. If we only match on the legalized i64 mul expansion,
4481// SimplifyDemandedBits will be unable to remove them because there will be
4482// multiple uses due to the separate mul + mulh[su].
4483static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4484 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4485 if (Size <= 32) {
4486 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4487 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
4488 }
4489
4490 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4491 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4492
4493 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
4494 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
4495
4496 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
4497}
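// Illustrative example, not part of the original source: for a 64-bit
// multiply of values already known to fit in 24 bits, e.g. N0 = 0x400000
// (2^22) and N1 = 0x800000 (2^23), the 2^45 product is assembled as
//   MulLo = mul_u24(N0, N1)   = 0x00000000   (bits [31:0])
//   MulHi = mulhi_u24(N0, N1) = 0x00002000   (bits [63:32])
//   result = build_pair(MulLo, MulHi)
// so no 64-bit multiply is needed.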
4498
4499/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4500/// return SDValue().
4501static SDValue getAddOneOp(const SDNode *V) {
4502 if (V->getOpcode() != ISD::ADD)
4503 return SDValue();
4504
4505 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
4506}
4507
4508SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
4509                                                DAGCombinerInfo &DCI) const {
4510 assert(N->getOpcode() == ISD::MUL);
4511 EVT VT = N->getValueType(0);
4512
4513 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4514 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4515 // unnecessarily). isDivergent() is used as an approximation of whether the
4516 // value is in an SGPR.
4517 if (!N->isDivergent())
4518 return SDValue();
4519
4520 unsigned Size = VT.getSizeInBits();
4521 if (VT.isVector() || Size > 64)
4522 return SDValue();
4523
4524 SelectionDAG &DAG = DCI.DAG;
4525 SDLoc DL(N);
4526
4527 SDValue N0 = N->getOperand(0);
4528 SDValue N1 = N->getOperand(1);
4529
4530 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
4531 // matching.
4532
4533 // mul x, (add y, 1) -> add (mul x, y), x
4534 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4535 SDValue AddOp = getAddOneOp(V.getNode());
4536 if (!AddOp)
4537 return SDValue();
4538
4539 if (V.hasOneUse() || all_of(V->users(), [](const SDNode *U) -> bool {
4540 return U->getOpcode() == ISD::MUL;
4541 }))
4542 return AddOp;
4543
4544 return SDValue();
4545 };
4546
4547 // FIXME: The selection pattern is not properly checking for commuted
4548 // operands, so we have to place the mul in the LHS
4549 if (SDValue MulOper = IsFoldableAdd(N0)) {
4550 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
4551 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
4552 }
4553
4554 if (SDValue MulOper = IsFoldableAdd(N1)) {
4555 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
4556 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
4557 }
4558
4559 // There are i16 integer mul/mad.
4560 if (isTypeLegal(MVT::i16) && VT.getScalarType().bitsLE(MVT::i16))
4561 return SDValue();
4562
4563 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4564 // in the source into any_extends if the result of the mul is truncated. Since
4565 // we can assume the high bits are whatever we want, use the underlying value
4566  // to keep the unknown high bits from interfering.
4567 if (N0.getOpcode() == ISD::ANY_EXTEND)
4568 N0 = N0.getOperand(0);
4569
4570 if (N1.getOpcode() == ISD::ANY_EXTEND)
4571 N1 = N1.getOperand(0);
4572
4573 SDValue Mul;
4574
4575 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4576 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4577 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4578 Mul = getMul24(DAG, DL, N0, N1, Size, false);
4579 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4580 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4581 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4582 Mul = getMul24(DAG, DL, N0, N1, Size, true);
4583 } else {
4584 return SDValue();
4585 }
4586
4587 // We need to use sext even for MUL_U24, because MUL_U24 is used
4588 // for signed multiply of 8 and 16-bit types.
4589 return DAG.getSExtOrTrunc(Mul, DL, VT);
4590}
4591
4592SDValue
4593AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
4594                                            DAGCombinerInfo &DCI) const {
4595 if (N->getValueType(0) != MVT::i32)
4596 return SDValue();
4597
4598 SelectionDAG &DAG = DCI.DAG;
4599 SDLoc DL(N);
4600
4601 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
4602 SDValue N0 = N->getOperand(0);
4603 SDValue N1 = N->getOperand(1);
4604
4605 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4606 // in the source into any_extends if the result of the mul is truncated. Since
4607 // we can assume the high bits are whatever we want, use the underlying value
4608  // to keep the unknown high bits from interfering.
4609 if (N0.getOpcode() == ISD::ANY_EXTEND)
4610 N0 = N0.getOperand(0);
4611 if (N1.getOpcode() == ISD::ANY_EXTEND)
4612 N1 = N1.getOperand(0);
4613
4614 // Try to use two fast 24-bit multiplies (one for each half of the result)
4615 // instead of one slow extending multiply.
4616 unsigned LoOpcode = 0;
4617 unsigned HiOpcode = 0;
4618 if (Signed) {
4619 if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4620 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4621 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4622 LoOpcode = AMDGPUISD::MUL_I24;
4623 HiOpcode = AMDGPUISD::MULHI_I24;
4624 }
4625 } else {
4626 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4627 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4628 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4629 LoOpcode = AMDGPUISD::MUL_U24;
4630 HiOpcode = AMDGPUISD::MULHI_U24;
4631 }
4632 }
4633 if (!LoOpcode)
4634 return SDValue();
4635
4636 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
4637 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
4638 DCI.CombineTo(N, Lo, Hi);
4639 return SDValue(N, 0);
4640}
4641
4642SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
4643                                                  DAGCombinerInfo &DCI) const {
4644 EVT VT = N->getValueType(0);
4645
4646 if (!Subtarget->hasMulI24() || VT.isVector())
4647 return SDValue();
4648
4649 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4650 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4651 // unnecessarily). isDivergent() is used as an approximation of whether the
4652 // value is in an SGPR.
4653 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4654 // valu op anyway)
4655 if (Subtarget->hasSMulHi() && !N->isDivergent())
4656 return SDValue();
4657
4658 SelectionDAG &DAG = DCI.DAG;
4659 SDLoc DL(N);
4660
4661 SDValue N0 = N->getOperand(0);
4662 SDValue N1 = N->getOperand(1);
4663
4664 if (!isI24(N0, DAG) || !isI24(N1, DAG))
4665 return SDValue();
4666
4667 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4668 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4669
4670 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
4671 DCI.AddToWorklist(Mulhi.getNode());
4672 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
4673}
4674
4675SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
4676                                                  DAGCombinerInfo &DCI) const {
4677 EVT VT = N->getValueType(0);
4678
4679 if (VT.isVector() || VT.getSizeInBits() > 32 || !Subtarget->hasMulU24())
4680 return SDValue();
4681
4682 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4683 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4684 // unnecessarily). isDivergent() is used as an approximation of whether the
4685 // value is in an SGPR.
4686 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4687 // valu op anyway)
4688 if (!N->isDivergent() && Subtarget->hasSMulHi())
4689 return SDValue();
4690
4691 SelectionDAG &DAG = DCI.DAG;
4692 SDLoc DL(N);
4693
4694 SDValue N0 = N->getOperand(0);
4695 SDValue N1 = N->getOperand(1);
4696
4697 if (!isU24(N0, DAG) || !isU24(N1, DAG))
4698 return SDValue();
4699
4700 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4701 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4702
4703 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
4704 DCI.AddToWorklist(Mulhi.getNode());
4705 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
4706}
4707
4708SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4709 SDValue Op,
4710 const SDLoc &DL,
4711 unsigned Opc) const {
4712 EVT VT = Op.getValueType();
4713 if (VT.bitsGT(MVT::i32))
4714 return SDValue();
4715
4716 if (VT != MVT::i32)
4717 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
4718
4719 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
4720 if (VT != MVT::i32)
4721 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
4722
4723 return FFBX;
4724}
4725
4726// The native instructions return -1 on 0 input. Optimize out a select that
4727// produces -1 on 0.
4728//
4729// TODO: If zero is not undef, we could also do this if the output is compared
4730// against the bitwidth.
4731//
4732// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
4733SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
4734                                                      SDValue LHS, SDValue RHS,
4735 DAGCombinerInfo &DCI) const {
4736 if (!isNullConstant(Cond.getOperand(1)))
4737 return SDValue();
4738
4739 SelectionDAG &DAG = DCI.DAG;
4740 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
4741 SDValue CmpLHS = Cond.getOperand(0);
4742
4743 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4744 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4745 if (CCOpcode == ISD::SETEQ &&
4746 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
4747 RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
4748 unsigned Opc =
4749 isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4750 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4751 }
4752
4753 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4754 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4755 if (CCOpcode == ISD::SETNE &&
4756 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
4757 LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
4758 unsigned Opc =
4759 isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4760
4761 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4762 }
4763
4764 return SDValue();
4765}
4766
4767static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
4768                                         unsigned Op,
4769 const SDLoc &SL,
4770 SDValue Cond,
4771 SDValue N1,
4772 SDValue N2) {
4773 SelectionDAG &DAG = DCI.DAG;
4774 EVT VT = N1.getValueType();
4775
4776 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
4777 N1.getOperand(0), N2.getOperand(0));
4778 DCI.AddToWorklist(NewSelect.getNode());
4779 return DAG.getNode(Op, SL, VT, NewSelect);
4780}
4781
4782// Pull a free FP operation out of a select so it may fold into uses.
4783//
4784// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
4785// select c, (fneg x), k -> fneg (select c, x, (fneg k))
4786//
4787// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
4788// select c, (fabs x), +k -> fabs (select c, x, k)
4789SDValue
4790AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
4791                                           SDValue N) const {
4792 SelectionDAG &DAG = DCI.DAG;
4793 SDValue Cond = N.getOperand(0);
4794 SDValue LHS = N.getOperand(1);
4795 SDValue RHS = N.getOperand(2);
4796
4797 EVT VT = N.getValueType();
4798 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
4799 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
4800    if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
4801      return SDValue();
4802
4803 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
4804 SDLoc(N), Cond, LHS, RHS);
4805 }
4806
4807 bool Inv = false;
4808 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
4809 std::swap(LHS, RHS);
4810 Inv = true;
4811 }
4812
4813 // TODO: Support vector constants.
4814  ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
4815  if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
4816 !selectSupportsSourceMods(N.getNode())) {
4817 SDLoc SL(N);
4818 // If one side is an fneg/fabs and the other is a constant, we can push the
4819 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
4820 SDValue NewLHS = LHS.getOperand(0);
4821 SDValue NewRHS = RHS;
4822
4823 // Careful: if the neg can be folded up, don't try to pull it back down.
4824 bool ShouldFoldNeg = true;
4825
4826 if (NewLHS.hasOneUse()) {
4827 unsigned Opc = NewLHS.getOpcode();
4828 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
4829 ShouldFoldNeg = false;
4830 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
4831 ShouldFoldNeg = false;
4832 }
4833
4834 if (ShouldFoldNeg) {
4835 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
4836 return SDValue();
4837
4838 // We're going to be forced to use a source modifier anyway, there's no
4839      // point in pulling the negate out unless we can get a size reduction by
4840 // negating the constant.
4841 //
4842 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
4843 // about cheaper constants.
4844      if (NewLHS.getOpcode() == ISD::FABS &&
4845          getConstantNegateCost(CRHS) == NegatibleCost::Expensive)
4846        return SDValue();
4847
4848      if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
4849        return SDValue();
4850
4851 if (LHS.getOpcode() == ISD::FNEG)
4852 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4853
4854 if (Inv)
4855 std::swap(NewLHS, NewRHS);
4856
4857 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
4858 Cond, NewLHS, NewRHS);
4859 DCI.AddToWorklist(NewSelect.getNode());
4860 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
4861 }
4862 }
4863
4864 return SDValue();
4865}
4866
4867SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
4868                                                   DAGCombinerInfo &DCI) const {
4869 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
4870 return Folded;
4871
4872 SDValue Cond = N->getOperand(0);
4873 if (Cond.getOpcode() != ISD::SETCC)
4874 return SDValue();
4875
4876 EVT VT = N->getValueType(0);
4877 SDValue LHS = Cond.getOperand(0);
4878 SDValue RHS = Cond.getOperand(1);
4879 SDValue CC = Cond.getOperand(2);
4880
4881 SDValue True = N->getOperand(1);
4882 SDValue False = N->getOperand(2);
4883
4884 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
4885 SelectionDAG &DAG = DCI.DAG;
4886 if (DAG.isConstantValueOfAnyType(True) &&
4887 !DAG.isConstantValueOfAnyType(False)) {
4888 // Swap cmp + select pair to move constant to false input.
4889 // This will allow using VOPC cndmasks more often.
4890 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
4891
4892 SDLoc SL(N);
4893 ISD::CondCode NewCC =
4894 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
4895
4896 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
4897 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
4898 }
4899
4900 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
4901      SDValue MinMax
4902        = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
4903 // Revisit this node so we can catch min3/max3/med3 patterns.
4904 //DCI.AddToWorklist(MinMax.getNode());
4905 return MinMax;
4906 }
4907 }
4908
4909 // There's no reason to not do this if the condition has other uses.
4910 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
4911}
4912
4913static bool isInv2Pi(const APFloat &APF) {
4914 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
4915 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
4916 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
4917
4918 return APF.bitwiseIsEqual(KF16) ||
4919 APF.bitwiseIsEqual(KF32) ||
4920 APF.bitwiseIsEqual(KF64);
4921}
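// Clarifying note, not part of the original source: the three bit patterns
// above encode 1/(2*pi) ~= 0.15915494 as f16 (0x3118), f32 (0x3e22f983) and
// f64 (0x3fc45f306dc9c882); subtargets with the inv-2pi inline immediate can
// encode this value for free, which is what getConstantNegateCost() relies on
// below.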
4922
4923// 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
4924// additional cost to negate them.
4925TargetLowering::NegatibleCost
4926AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
4927  if (C->isZero())
4928 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4929
4930 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
4931 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4932
4933  return NegatibleCost::Neutral;
4934}
4935
4936bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
4937  if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4938    return getConstantNegateCost(C) == NegatibleCost::Expensive;
4939  return false;
4940}
4941
4942bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
4943  if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4944    return getConstantNegateCost(C) == NegatibleCost::Cheaper;
4945  return false;
4946}
4947
4948static unsigned inverseMinMax(unsigned Opc) {
4949 switch (Opc) {
4950 case ISD::FMAXNUM:
4951 return ISD::FMINNUM;
4952 case ISD::FMINNUM:
4953 return ISD::FMAXNUM;
4954 case ISD::FMAXNUM_IEEE:
4955 return ISD::FMINNUM_IEEE;
4956 case ISD::FMINNUM_IEEE:
4957 return ISD::FMAXNUM_IEEE;
4958 case ISD::FMAXIMUM:
4959 return ISD::FMINIMUM;
4960 case ISD::FMINIMUM:
4961 return ISD::FMAXIMUM;
4962 case ISD::FMAXIMUMNUM:
4963 return ISD::FMINIMUMNUM;
4964 case ISD::FMINIMUMNUM:
4965 return ISD::FMAXIMUMNUM;
4966 case AMDGPUISD::FMAX_LEGACY:
4967 return AMDGPUISD::FMIN_LEGACY;
4968 case AMDGPUISD::FMIN_LEGACY:
4969 return AMDGPUISD::FMAX_LEGACY;
4970 default:
4971 llvm_unreachable("invalid min/max opcode");
4972 }
4973}
4974
4975/// \return true if it's profitable to try to push an fneg into its source
4976/// instruction.
4977static bool shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
4978  // If the input has multiple uses and we can either fold the negate down, or
4979 // the other uses cannot, give up. This both prevents unprofitable
4980 // transformations and infinite loops: we won't repeatedly try to fold around
4981 // a negate that has no 'good' form.
4982 if (N0.hasOneUse()) {
4983 // This may be able to fold into the source, but at a code size cost. Don't
4984 // fold if the fold into the user is free.
4985 if (allUsesHaveSourceMods(N, 0))
4986 return false;
4987 } else {
4988 if (fnegFoldsIntoOp(N0.getNode()) &&
4989        (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
4990      return false;
4991 }
4992
4993 return true;
4994}
4995
4996SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
4997                                                 DAGCombinerInfo &DCI) const {
4998 SelectionDAG &DAG = DCI.DAG;
4999 SDValue N0 = N->getOperand(0);
5000 EVT VT = N->getValueType(0);
5001
5002 unsigned Opc = N0.getOpcode();
5003
5004 if (!shouldFoldFNegIntoSrc(N, N0))
5005 return SDValue();
5006
5007 SDLoc SL(N);
5008 switch (Opc) {
5009 case ISD::FADD: {
5010 if (!mayIgnoreSignedZero(N0))
5011 return SDValue();
5012
5013 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
5014 SDValue LHS = N0.getOperand(0);
5015 SDValue RHS = N0.getOperand(1);
5016
5017 if (LHS.getOpcode() != ISD::FNEG)
5018 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
5019 else
5020 LHS = LHS.getOperand(0);
5021
5022 if (RHS.getOpcode() != ISD::FNEG)
5023 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5024 else
5025 RHS = RHS.getOperand(0);
5026
5027 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
5028 if (Res.getOpcode() != ISD::FADD)
5029 return SDValue(); // Op got folded away.
5030 if (!N0.hasOneUse())
5031 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5032 return Res;
5033 }
5034 case ISD::FMUL:
5035 case AMDGPUISD::FMUL_LEGACY: {
5036 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
5037 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
5038 SDValue LHS = N0.getOperand(0);
5039 SDValue RHS = N0.getOperand(1);
5040
5041 if (LHS.getOpcode() == ISD::FNEG)
5042 LHS = LHS.getOperand(0);
5043 else if (RHS.getOpcode() == ISD::FNEG)
5044 RHS = RHS.getOperand(0);
5045 else
5046 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5047
5048 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
5049 if (Res.getOpcode() != Opc)
5050 return SDValue(); // Op got folded away.
5051 if (!N0.hasOneUse())
5052 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5053 return Res;
5054 }
5055 case ISD::FMA:
5056 case ISD::FMAD: {
5057 // TODO: handle llvm.amdgcn.fma.legacy
5058 if (!mayIgnoreSignedZero(N0))
5059 return SDValue();
5060
5061 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
5062 SDValue LHS = N0.getOperand(0);
5063 SDValue MHS = N0.getOperand(1);
5064 SDValue RHS = N0.getOperand(2);
5065
5066 if (LHS.getOpcode() == ISD::FNEG)
5067 LHS = LHS.getOperand(0);
5068 else if (MHS.getOpcode() == ISD::FNEG)
5069 MHS = MHS.getOperand(0);
5070 else
5071 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
5072
5073 if (RHS.getOpcode() != ISD::FNEG)
5074 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5075 else
5076 RHS = RHS.getOperand(0);
5077
5078 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
5079 if (Res.getOpcode() != Opc)
5080 return SDValue(); // Op got folded away.
5081 if (!N0.hasOneUse())
5082 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5083 return Res;
5084 }
5085 case ISD::FMAXNUM:
5086 case ISD::FMINNUM:
5087 case ISD::FMAXNUM_IEEE:
5088 case ISD::FMINNUM_IEEE:
5089 case ISD::FMINIMUM:
5090 case ISD::FMAXIMUM:
5091 case ISD::FMINIMUMNUM:
5092 case ISD::FMAXIMUMNUM:
5093 case AMDGPUISD::FMAX_LEGACY:
5094 case AMDGPUISD::FMIN_LEGACY: {
5095 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
5096 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
5097 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
5098 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
5099
5100 SDValue LHS = N0.getOperand(0);
5101 SDValue RHS = N0.getOperand(1);
5102
5103 // 0 doesn't have a negated inline immediate.
5104 // TODO: This constant check should be generalized to other operations.
5105    if (isConstantCostlierToNegate(RHS))
5106      return SDValue();
5107
5107
5108 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
5109 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5110 unsigned Opposite = inverseMinMax(Opc);
5111
5112 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
5113 if (Res.getOpcode() != Opposite)
5114 return SDValue(); // Op got folded away.
5115 if (!N0.hasOneUse())
5116 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5117 return Res;
5118 }
5119 case AMDGPUISD::FMED3: {
5120 SDValue Ops[3];
5121 for (unsigned I = 0; I < 3; ++I)
5122 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
5123
5124 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
5125 if (Res.getOpcode() != AMDGPUISD::FMED3)
5126 return SDValue(); // Op got folded away.
5127
5128 if (!N0.hasOneUse()) {
5129 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
5130 DAG.ReplaceAllUsesWith(N0, Neg);
5131
5132 for (SDNode *U : Neg->users())
5133 DCI.AddToWorklist(U);
5134 }
5135
5136 return Res;
5137 }
5138 case ISD::FP_EXTEND:
5139 case ISD::FTRUNC:
5140 case ISD::FRINT:
5141 case ISD::FNEARBYINT: // XXX - Should fround be handled?
5142 case ISD::FROUNDEVEN:
5143 case ISD::FSIN:
5144 case ISD::FCANONICALIZE:
5145 case AMDGPUISD::RCP:
5146 case AMDGPUISD::RCP_LEGACY:
5147 case AMDGPUISD::RCP_IFLAG:
5148 case AMDGPUISD::SIN_HW: {
5149 SDValue CvtSrc = N0.getOperand(0);
5150 if (CvtSrc.getOpcode() == ISD::FNEG) {
5151 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
5152 // (fneg (rcp (fneg x))) -> (rcp x)
5153 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
5154 }
5155
5156 if (!N0.hasOneUse())
5157 return SDValue();
5158
5159 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
5160 // (fneg (rcp x)) -> (rcp (fneg x))
5161 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
5162 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
5163 }
5164 case ISD::FP_ROUND: {
5165 SDValue CvtSrc = N0.getOperand(0);
5166
5167 if (CvtSrc.getOpcode() == ISD::FNEG) {
5168 // (fneg (fp_round (fneg x))) -> (fp_round x)
5169 return DAG.getNode(ISD::FP_ROUND, SL, VT,
5170 CvtSrc.getOperand(0), N0.getOperand(1));
5171 }
5172
5173 if (!N0.hasOneUse())
5174 return SDValue();
5175
5176 // (fneg (fp_round x)) -> (fp_round (fneg x))
5177 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
5178 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
5179 }
5180 case ISD::FP16_TO_FP: {
5181 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
5182 // f16, but legalization of f16 fneg ends up pulling it out of the source.
5183 // Put the fneg back as a legal source operation that can be matched later.
5184 SDLoc SL(N);
5185
5186 SDValue Src = N0.getOperand(0);
5187 EVT SrcVT = Src.getValueType();
5188
5189 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
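// e.g. 1.0 in IEEE half precision is 0x3C00; xor'ing the sign bit gives 0xBC00, i.e. -1.0, so flipping bit 15 of the i16 payload is an exact fneg.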
5190 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
5191 DAG.getConstant(0x8000, SL, SrcVT));
5192 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
5193 }
5194 case ISD::SELECT: {
5195 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
5196 // TODO: Invert conditions of foldFreeOpFromSelect
5197 return SDValue();
5198 }
5199 case ISD::BITCAST: {
5200 SDLoc SL(N);
5201 SDValue BCSrc = N0.getOperand(0);
5202 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
5203 SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
5204 if (HighBits.getValueType().getSizeInBits() != 32 ||
5205 !fnegFoldsIntoOp(HighBits.getNode()))
5206 return SDValue();
5207
5208 // f64 fneg only really needs to operate on the high half of the
5209 // register, so try to force it to an f32 operation to help make use of
5210 // source modifiers.
5211 //
5212 //
5213 // fneg (f64 (bitcast (build_vector x, y))) ->
5214 // f64 (bitcast (build_vector (bitcast i32:x to f32),
5215 // (fneg (bitcast i32:y to f32)))
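// (The f64 sign bit is bit 63, which lives entirely in the high 32-bit word, so an f32 fneg of the bitcasted high half flips exactly that bit.)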
5216
5217 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
5218 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
5219 SDValue CastBack =
5220 DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);
5221
5222 SmallVector<SDValue, 8> Ops(BCSrc->ops());
5223 Ops.back() = CastBack;
5224 DCI.AddToWorklist(NegHi.getNode());
5225 SDValue Build =
5226 DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
5227 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);
5228
5229 if (!N0.hasOneUse())
5230 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
5231 return Result;
5232 }
5233
5234 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
5235 BCSrc.hasOneUse()) {
5236 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
5237 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
5238
5239 // TODO: Cast back result for multiple uses is beneficial in some cases.
5240
5241 SDValue LHS =
5242 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
5243 SDValue RHS =
5244 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
5245
5246 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
5247 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
5248
5249 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
5250 NegRHS);
5251 }
5252
5253 return SDValue();
5254 }
5255 default:
5256 return SDValue();
5257 }
5258}
5259
5260 SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
5261 DAGCombinerInfo &DCI) const {
5262 SelectionDAG &DAG = DCI.DAG;
5263 SDValue N0 = N->getOperand(0);
5264
5265 if (!N0.hasOneUse())
5266 return SDValue();
5267
5268 switch (N0.getOpcode()) {
5269 case ISD::FP16_TO_FP: {
5270 assert(!isTypeLegal(MVT::f16) && "should only see if f16 is illegal");
5271 SDLoc SL(N);
5272 SDValue Src = N0.getOperand(0);
5273 EVT SrcVT = Src.getValueType();
5274
5275 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
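// e.g. 0xBC00 (-1.0 in half precision) & 0x7fff = 0x3C00 (+1.0); clearing the sign bit of the i16 payload is an exact fabs.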
5276 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
5277 DAG.getConstant(0x7fff, SL, SrcVT));
5278 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
5279 }
5280 default:
5281 return SDValue();
5282 }
5283}
5284
5285 SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
5286 DAGCombinerInfo &DCI) const {
5287 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
5288 if (!CFP)
5289 return SDValue();
5290
5291 // XXX - Should this flush denormals?
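// e.g. (rcp 2.0) constant-folds to 0.5 here; inexact quotients are rounded by the APFloat division.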
5292 const APFloat &Val = CFP->getValueAPF();
5293 APFloat One(Val.getSemantics(), "1.0");
5294 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
5295}
5296
5297 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
5298 DAGCombinerInfo &DCI) const {
5299 SelectionDAG &DAG = DCI.DAG;
5300 SDLoc DL(N);
5301
5302 switch(N->getOpcode()) {
5303 default:
5304 break;
5305 case ISD::BITCAST: {
5306 EVT DestVT = N->getValueType(0);
5307
5308 // Push casts through vector builds. This helps avoid emitting a large
5309 // number of copies when materializing floating point vector constants.
5310 //
5311 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
5312 // vNt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
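// e.g. v2f32 (bitcast (v2i32 build_vector x, y)) becomes
// build_vector (f32 (bitcast x)), (f32 (bitcast y)).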
5313 if (DestVT.isVector()) {
5314 SDValue Src = N->getOperand(0);
5315 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5316 (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
5317 isOperationLegal(ISD::BUILD_VECTOR, DestVT))) {
5318 EVT SrcVT = Src.getValueType();
5319 unsigned NElts = DestVT.getVectorNumElements();
5320
5321 if (SrcVT.getVectorNumElements() == NElts) {
5322 EVT DestEltVT = DestVT.getVectorElementType();
5323
5324 SmallVector<SDValue, 8> CastedElts;
5325 SDLoc SL(N);
5326 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5327 SDValue Elt = Src.getOperand(I);
5328 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
5329 }
5330
5331 return DAG.getBuildVector(DestVT, SL, CastedElts);
5332 }
5333 }
5334 }
5335
5336 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5337 break;
5338
5339 // Fold bitcasts of constants.
5340 //
5341 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5342 // TODO: Generalize and move to DAGCombiner
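// e.g. the i64 constant 0x0000000100000002 becomes build_vector (i32 2, i32 1), with the low word in element 0.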
5343 SDValue Src = N->getOperand(0);
5344 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
5345 SDLoc SL(N);
5346 uint64_t CVal = C->getZExtValue();
5347 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5348 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5349 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5350 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
5351 }
5352
5353 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
5354 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5355 SDLoc SL(N);
5356 uint64_t CVal = Val.getZExtValue();
5357 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5358 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5359 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5360
5361 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
5362 }
5363
5364 break;
5365 }
5366 case ISD::SHL:
5367 case ISD::SRA:
5368 case ISD::SRL: {
5369 // Range metadata can be invalidated when loads are converted to legal types
5370 // (e.g. v2i64 -> v4i32).
5371 // Try to convert vector shl/sra/srl before type legalization so that range
5372 // metadata can be utilized.
5373 if (!(N->getValueType(0).isVector() &&
5376 break;
5377 if (N->getOpcode() == ISD::SHL)
5378 return performShlCombine(N, DCI);
5379 if (N->getOpcode() == ISD::SRA)
5380 return performSraCombine(N, DCI);
5381 return performSrlCombine(N, DCI);
5382 }
5383 case ISD::TRUNCATE:
5384 return performTruncateCombine(N, DCI);
5385 case ISD::MUL:
5386 return performMulCombine(N, DCI);
5387 case AMDGPUISD::MUL_U24:
5388 case AMDGPUISD::MUL_I24: {
5389 if (SDValue Simplified = simplifyMul24(N, DCI))
5390 return Simplified;
5391 break;
5392 }
5393 case AMDGPUISD::MULHI_I24:
5394 case AMDGPUISD::MULHI_U24:
5395 return simplifyMul24(N, DCI);
5396 case ISD::SMUL_LOHI:
5397 case ISD::UMUL_LOHI:
5398 return performMulLoHiCombine(N, DCI);
5399 case ISD::MULHS:
5400 return performMulhsCombine(N, DCI);
5401 case ISD::MULHU:
5402 return performMulhuCombine(N, DCI);
5403 case ISD::SELECT:
5404 return performSelectCombine(N, DCI);
5405 case ISD::FNEG:
5406 return performFNegCombine(N, DCI);
5407 case ISD::FABS:
5408 return performFAbsCombine(N, DCI);
5409 case AMDGPUISD::BFE_I32:
5410 case AMDGPUISD::BFE_U32: {
5411 assert(!N->getValueType(0).isVector() &&
5412 "Vector handling of BFE not implemented");
5413 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
5414 if (!Width)
5415 break;
5416
5417 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5418 if (WidthVal == 0)
5419 return DAG.getConstant(0, DL, MVT::i32);
5420
5421 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
5422 if (!Offset)
5423 break;
5424
5425 SDValue BitsFrom = N->getOperand(0);
5426 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5427
5428 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5429
5430 if (OffsetVal == 0) {
5431 // This is already sign / zero extended, so try to fold away extra BFEs.
5432 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
5433
5434 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
5435 if (OpSignBits >= SignBits)
5436 return BitsFrom;
5437
5438 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
5439 if (Signed) {
5440 // This is a sign_extend_inreg. Replace it to take advantage of existing
5441 // DAG Combines. If not eliminated, we will match back to BFE during
5442 // selection.
5443
5444 // TODO: The sext_inreg of extended types ends up here, although we could
5445 // handle them in a single BFE.
5446 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
5447 DAG.getValueType(SmallVT));
5448 }
5449
5450 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
5451 }
5452
5453 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
5454 if (Signed) {
5455 return constantFoldBFE<int32_t>(DAG,
5456 CVal->getSExtValue(),
5457 OffsetVal,
5458 WidthVal,
5459 DL);
5460 }
5461
5462 return constantFoldBFE<uint32_t>(DAG,
5463 CVal->getZExtValue(),
5464 OffsetVal,
5465 WidthVal,
5466 DL);
5467 }
5468
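// If the field reaches bit 31 (offset + width >= 32), no masking is needed after the shift, so a plain sra/srl by the offset suffices (e.g. offset 24, width 8).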
5469 if ((OffsetVal + WidthVal) >= 32 &&
5470 !(OffsetVal == 16 && WidthVal == 16 && Subtarget->hasSDWA())) {
5471 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
5472 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
5473 BitsFrom, ShiftVal);
5474 }
5475
5476 if (BitsFrom.hasOneUse()) {
5477 APInt Demanded = APInt::getBitsSet(32,
5478 OffsetVal,
5479 OffsetVal + WidthVal);
5480
5481 KnownBits Known;
5482 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
5483 !DCI.isBeforeLegalizeOps());
5484 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5485 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
5486 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
5487 DCI.CommitTargetLoweringOpt(TLO);
5488 }
5489 }
5490
5491 break;
5492 }
5493 case ISD::LOAD:
5494 return performLoadCombine(N, DCI);
5495 case ISD::STORE:
5496 return performStoreCombine(N, DCI);
5497 case AMDGPUISD::RCP:
5498 case AMDGPUISD::RCP_IFLAG:
5499 return performRcpCombine(N, DCI);
5500 case ISD::AssertZext:
5501 case ISD::AssertSext:
5502 return performAssertSZExtCombine(N, DCI);
5503 case ISD::INTRINSIC_WO_CHAIN:
5504 return performIntrinsicWOChainCombine(N, DCI);
5505 case AMDGPUISD::FMAD_FTZ: {
5506 SDValue N0 = N->getOperand(0);
5507 SDValue N1 = N->getOperand(1);
5508 SDValue N2 = N->getOperand(2);
5509 EVT VT = N->getValueType(0);
5510
5511 // FMAD_FTZ is a FMAD + flush denormals to zero.
5512 // We flush the inputs, the intermediate step, and the output.
5513 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
5514 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
5515 ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
5516 if (N0CFP && N1CFP && N2CFP) {
5517 const auto FTZ = [](const APFloat &V) {
5518 if (V.isDenormal()) {
5519 APFloat Zero(V.getSemantics(), 0);
5520 return V.isNegative() ? -Zero : Zero;
5521 }
5522 return V;
5523 };
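// e.g. if the first operand is a denormal f32 constant it is flushed to a signed zero, so the fold effectively returns FTZ of the addend.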
5524
5525 APFloat V0 = FTZ(N0CFP->getValueAPF());
5526 APFloat V1 = FTZ(N1CFP->getValueAPF());
5527 APFloat V2 = FTZ(N2CFP->getValueAPF());
5528 V0.multiply(V1, APFloat::rmNearestTiesToEven);
5529 V0 = FTZ(V0);
5530 V0.add(V2, APFloat::rmNearestTiesToEven);
5531 return DAG.getConstantFP(FTZ(V0), DL, VT);
5532 }
5533 break;
5534 }
5535 }
5536 return SDValue();
5537}
5538
5539//===----------------------------------------------------------------------===//
5540// Helper functions
5541//===----------------------------------------------------------------------===//
5542
5543 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
5544 const TargetRegisterClass *RC,
5545 Register Reg, EVT VT,
5546 const SDLoc &SL,
5547 bool RawReg) const {
5548 MachineFunction &MF = DAG.getMachineFunction();
5549 MachineRegisterInfo &MRI = MF.getRegInfo();
5550 Register VReg;
5551
5552 if (!MRI.isLiveIn(Reg)) {
5553 VReg = MRI.createVirtualRegister(RC);
5554 MRI.addLiveIn(Reg, VReg);
5555 } else {
5556 VReg = MRI.getLiveInVirtReg(Reg);
5557 }
5558
5559 if (RawReg)
5560 return DAG.getRegister(VReg, VT);
5561
5562 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
5563}
5564
5565// This may be called multiple times, and nothing prevents creating multiple
5566// objects at the same offset. See if we already defined this object.
5567 static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
5568 int64_t Offset) {
5569 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5570 if (MFI.getObjectOffset(I) == Offset) {
5571 assert(MFI.getObjectSize(I) == Size);
5572 return I;
5573 }
5574 }
5575
5576 return MFI.CreateFixedObject(Size, Offset, true);
5577}
5578
5579 SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
5580 EVT VT,
5581 const SDLoc &SL,
5582 int64_t Offset) const {
5583 MachineFunction &MF = DAG.getMachineFunction();
5584 MachineFrameInfo &MFI = MF.getFrameInfo();
5585 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
5586
5587 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5588 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
5589
5590 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
5591 MachineMemOperand::MODereferenceable |
5592 MachineMemOperand::MOInvariant);
5593}
5594
5595 SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
5596 const SDLoc &SL,
5597 SDValue Chain,
5598 SDValue ArgVal,
5599 int64_t Offset) const {
5600 MachineFunction &MF = DAG.getMachineFunction();
5601 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
5602 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5603
5604 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
5605 // Stores to the argument stack area are relative to the stack pointer.
5606 SDValue SP =
5607 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
5608 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
5609 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
5610 MachineMemOperand::MODereferenceable);
5611 return Store;
5612}
5613
5614 SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
5615 const TargetRegisterClass *RC,
5616 EVT VT, const SDLoc &SL,
5617 const ArgDescriptor &Arg) const {
5618 assert(Arg && "Attempting to load missing argument");
5619
5620 SDValue V = Arg.isRegister() ?
5621 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
5622 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
5623
5624 if (!Arg.isMasked())
5625 return V;
5626
5627 unsigned Mask = Arg.getMask();
5628 unsigned Shift = llvm::countr_zero<unsigned>(Mask);
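// e.g. a value packed in bits [10, 20) has Mask = 0x000ffc00; the shift by 10 and the mask 0x3ff below extract it.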
5629 V = DAG.getNode(ISD::SRL, SL, VT, V,
5630 DAG.getShiftAmountConstant(Shift, VT, SL));
5631 return DAG.getNode(ISD::AND, SL, VT, V,
5632 DAG.getConstant(Mask >> Shift, SL, VT));
5633}
5634
5635 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5636 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5637 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5638 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5639 uint64_t ArgOffset =
5640 alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
5641 switch (Param) {
5642 case FIRST_IMPLICIT:
5643 return ArgOffset;
5644 case PRIVATE_BASE:
5645 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
5646 case SHARED_BASE:
5647 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5648 case QUEUE_PTR:
5649 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5650 }
5651 llvm_unreachable("unexpected implicit parameter type");
5652}
5653
5654 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5655 const MachineFunction &MF, const ImplicitParameter Param) const {
5656 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
5657 return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param);
5658 }
5659
5660 SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
5661 SelectionDAG &DAG, int Enabled,
5662 int &RefinementSteps,
5663 bool &UseOneConstNR,
5664 bool Reciprocal) const {
5665 EVT VT = Operand.getValueType();
5666
5667 if (VT == MVT::f32) {
5668 RefinementSteps = 0;
5669 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
5670 }
5671
5672 // TODO: There is also an f64 rsq instruction, but the documentation is less
5673 // clear on its precision.
5674
5675 return SDValue();
5676}
5677
5678 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
5679 SelectionDAG &DAG, int Enabled,
5680 int &RefinementSteps) const {
5681 EVT VT = Operand.getValueType();
5682
5683 if (VT == MVT::f32) {
5684 // Reciprocal, < 1 ulp error.
5685 //
5686 // This reciprocal approximation converges to < 0.5 ulp error with one
5687 // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
5688
5689 RefinementSteps = 0;
5690 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
5691 }
5692
5693 // TODO: There is also an f64 rcp instruction, but the documentation is less
5694 // clear on its precision.
5695
5696 return SDValue();
5697}
5698
5699static unsigned workitemIntrinsicDim(unsigned ID) {
5700 switch (ID) {
5701 case Intrinsic::amdgcn_workitem_id_x:
5702 return 0;
5703 case Intrinsic::amdgcn_workitem_id_y:
5704 return 1;
5705 case Intrinsic::amdgcn_workitem_id_z:
5706 return 2;
5707 default:
5708 llvm_unreachable("not a workitem intrinsic");
5709 }
5710}
5711
5712 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
5713 const SDValue Op, KnownBits &Known,
5714 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
5715
5716 Known.resetAll(); // Don't know anything.
5717
5718 unsigned Opc = Op.getOpcode();
5719
5720 switch (Opc) {
5721 default:
5722 break;
5723 case AMDGPUISD::CARRY:
5724 case AMDGPUISD::BORROW: {
5725 Known.Zero = APInt::getHighBitsSet(32, 31);
5726 break;
5727 }
5728
5729 case AMDGPUISD::BFE_I32:
5730 case AMDGPUISD::BFE_U32: {
5731 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5732 if (!CWidth)
5733 return;
5734
5735 uint32_t Width = CWidth->getZExtValue() & 0x1f;
5736
5737 if (Opc == AMDGPUISD::BFE_U32)
5738 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
5739
5740 break;
5741 }
5742 case AMDGPUISD::FP_TO_FP16: {
5743 unsigned BitWidth = Known.getBitWidth();
5744
5745 // High bits are zero.
5746 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
5747 break;
5748 }
5749 case AMDGPUISD::MUL_U24:
5750 case AMDGPUISD::MUL_I24: {
5751 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5752 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5753 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
5754 RHSKnown.countMinTrailingZeros();
5755 Known.Zero.setLowBits(std::min(TrailZ, 32u));
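// (Trailing zero counts add under multiplication: if both operands are known multiples of 4, the product is a known multiple of 16.)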
5756 // Skip extra check if all bits are known zeros.
5757 if (TrailZ >= 32)
5758 break;
5759
5760 // Truncate to 24 bits.
5761 LHSKnown = LHSKnown.trunc(24);
5762 RHSKnown = RHSKnown.trunc(24);
5763
5764 if (Opc == AMDGPUISD::MUL_I24) {
5765 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
5766 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
5767 unsigned MaxValBits = LHSValBits + RHSValBits;
5768 if (MaxValBits > 32)
5769 break;
5770 unsigned SignBits = 32 - MaxValBits + 1;
5771 bool LHSNegative = LHSKnown.isNegative();
5772 bool LHSNonNegative = LHSKnown.isNonNegative();
5773 bool LHSPositive = LHSKnown.isStrictlyPositive();
5774 bool RHSNegative = RHSKnown.isNegative();
5775 bool RHSNonNegative = RHSKnown.isNonNegative();
5776 bool RHSPositive = RHSKnown.isStrictlyPositive();
5777
5778 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
5779 Known.Zero.setHighBits(SignBits);
5780 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
5781 Known.One.setHighBits(SignBits);
5782 } else {
5783 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
5784 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
5785 unsigned MaxValBits = LHSValBits + RHSValBits;
5786 if (MaxValBits >= 32)
5787 break;
5788 Known.Zero.setBitsFrom(MaxValBits);
5789 }
5790 break;
5791 }
5792 case AMDGPUISD::PERM: {
5793 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5794 if (!CMask)
5795 return;
5796
5797 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5798 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5799 unsigned Sel = CMask->getZExtValue();
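// Each selector byte picks one result byte: values 0-3 take a byte from the second operand, 4-6 from the first, 0x0c forces 0x00 and anything above 0x0c forces 0xff; other encodings are left unknown here.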
5800
5801 for (unsigned I = 0; I < 32; I += 8) {
5802 unsigned SelBits = Sel & 0xff;
5803 if (SelBits < 4) {
5804 SelBits *= 8;
5805 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5806 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5807 } else if (SelBits < 7) {
5808 SelBits = (SelBits & 3) * 8;
5809 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5810 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5811 } else if (SelBits == 0x0c) {
5812 Known.Zero |= 0xFFull << I;
5813 } else if (SelBits > 0x0c) {
5814 Known.One |= 0xFFull << I;
5815 }
5816 Sel >>= 8;
5817 }
5818 break;
5819 }
5820 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
5821 Known.Zero.setHighBits(24);
5822 break;
5823 }
5824 case AMDGPUISD::BUFFER_LOAD_USHORT: {
5825 Known.Zero.setHighBits(16);
5826 break;
5827 }
5828 case AMDGPUISD::LDS: {
5829 auto *GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
5830 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
5831
5832 Known.Zero.setHighBits(16);
5833 Known.Zero.setLowBits(Log2(Alignment));
5834 break;
5835 }
5836 case AMDGPUISD::SMIN3:
5837 case AMDGPUISD::SMAX3:
5838 case AMDGPUISD::SMED3:
5839 case AMDGPUISD::UMIN3:
5840 case AMDGPUISD::UMAX3:
5841 case AMDGPUISD::UMED3: {
5842 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
5843 if (Known2.isUnknown())
5844 break;
5845
5846 KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5847 if (Known1.isUnknown())
5848 break;
5849
5850 KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5851 if (Known0.isUnknown())
5852 break;
5853
5854 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
5855 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
5856 Known.One = Known0.One & Known1.One & Known2.One;
5857 break;
5858 }
5859 case ISD::INTRINSIC_WO_CHAIN: {
5860 unsigned IID = Op.getConstantOperandVal(0);
5861 switch (IID) {
5862 case Intrinsic::amdgcn_workitem_id_x:
5863 case Intrinsic::amdgcn_workitem_id_y:
5864 case Intrinsic::amdgcn_workitem_id_z: {
5865 unsigned MaxValue = Subtarget->getMaxWorkitemID(
5866 DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
5867 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
5868 break;
5869 }
5870 default:
5871 break;
5872 }
5873 }
5874 }
5875}
5876
5877 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
5878 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
5879 unsigned Depth) const {
5880 switch (Op.getOpcode()) {
5881 case AMDGPUISD::BFE_I32: {
5882 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5883 if (!Width)
5884 return 1;
5885
5886 unsigned SignBits = 32 - Width->getZExtValue() + 1;
5887 if (!isNullConstant(Op.getOperand(1)))
5888 return SignBits;
5889
5890 // TODO: Could probably figure something out with non-0 offsets.
5891 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5892 return std::max(SignBits, Op0SignBits);
5893 }
5894
5895 case AMDGPUISD::BFE_U32: {
5896 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5897 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
5898 }
5899
5900 case AMDGPUISD::CARRY:
5901 case AMDGPUISD::BORROW:
5902 return 31;
5903 case AMDGPUISD::BUFFER_LOAD_BYTE:
5904 return 25;
5905 case AMDGPUISD::BUFFER_LOAD_SHORT:
5906 return 17;
5907 case AMDGPUISD::BUFFER_LOAD_UBYTE:
5908 return 24;
5909 case AMDGPUISD::BUFFER_LOAD_USHORT:
5910 return 16;
5911 case AMDGPUISD::FP_TO_FP16:
5912 return 16;
5913 case AMDGPUISD::SMIN3:
5914 case AMDGPUISD::SMAX3:
5915 case AMDGPUISD::SMED3:
5916 case AMDGPUISD::UMIN3:
5917 case AMDGPUISD::UMAX3:
5918 case AMDGPUISD::UMED3: {
5919 unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
5920 if (Tmp2 == 1)
5921 return 1; // Early out.
5922
5923 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
5924 if (Tmp1 == 1)
5925 return 1; // Early out.
5926
5927 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5928 if (Tmp0 == 1)
5929 return 1; // Early out.
5930
5931 return std::min({Tmp0, Tmp1, Tmp2});
5932 }
5933 default:
5934 return 1;
5935 }
5936}
5937
5938 unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
5939 GISelValueTracking &Analysis, Register R, const APInt &DemandedElts,
5940 const MachineRegisterInfo &MRI, unsigned Depth) const {
5941 const MachineInstr *MI = MRI.getVRegDef(R);
5942 if (!MI)
5943 return 1;
5944
5945 // TODO: Check range metadata on MMO.
5946 switch (MI->getOpcode()) {
5947 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
5948 return 25;
5949 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
5950 return 17;
5951 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
5952 return 24;
5953 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
5954 return 16;
5955 case AMDGPU::G_AMDGPU_SMED3:
5956 case AMDGPU::G_AMDGPU_UMED3: {
5957 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
5958 unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
5959 if (Tmp2 == 1)
5960 return 1;
5961 unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
5962 if (Tmp1 == 1)
5963 return 1;
5964 unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
5965 if (Tmp0 == 1)
5966 return 1;
5967 return std::min({Tmp0, Tmp1, Tmp2});
5968 }
5969 default:
5970 return 1;
5971 }
5972}
5973
5974 bool AMDGPUTargetLowering::canCreateUndefOrPoisonForTargetNode(
5975 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
5976 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
5977 unsigned Opcode = Op.getOpcode();
5978 switch (Opcode) {
5979 case AMDGPUISD::BFE_I32:
5980 case AMDGPUISD::BFE_U32:
5981 return false;
5982 }
5983 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
5984 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
5985}
5986
5987 bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(
5988 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN,
5989 unsigned Depth) const {
5990 unsigned Opcode = Op.getOpcode();
5991 switch (Opcode) {
5992 case AMDGPUISD::FMIN_LEGACY:
5993 case AMDGPUISD::FMAX_LEGACY: {
5994 if (SNaN)
5995 return true;
5996
5997 // TODO: Can check no nans on one of the operands for each one, but which
5998 // one?
5999 return false;
6000 }
6001 case AMDGPUISD::FMUL_LEGACY:
6002 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
6003 if (SNaN)
6004 return true;
6005 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
6006 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6007 }
6008 case AMDGPUISD::FMED3:
6009 case AMDGPUISD::FMIN3:
6010 case AMDGPUISD::FMAX3:
6011 case AMDGPUISD::FMINIMUM3:
6012 case AMDGPUISD::FMAXIMUM3:
6013 case AMDGPUISD::FMAD_FTZ: {
6014 if (SNaN)
6015 return true;
6016 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
6017 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6018 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6019 }
6020 case AMDGPUISD::CVT_F32_UBYTE0:
6021 case AMDGPUISD::CVT_F32_UBYTE1:
6022 case AMDGPUISD::CVT_F32_UBYTE2:
6023 case AMDGPUISD::CVT_F32_UBYTE3:
6024 return true;
6025
6026 case AMDGPUISD::RCP:
6027 case AMDGPUISD::RSQ:
6028 case AMDGPUISD::RCP_LEGACY:
6029 case AMDGPUISD::RSQ_CLAMP: {
6030 if (SNaN)
6031 return true;
6032
6033 // TODO: Need is known positive check.
6034 return false;
6035 }
6036 case ISD::FLDEXP:
6037 case AMDGPUISD::FRACT: {
6038 if (SNaN)
6039 return true;
6040 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
6041 }
6042 case AMDGPUISD::DIV_SCALE:
6043 case AMDGPUISD::DIV_FMAS:
6044 case AMDGPUISD::DIV_FIXUP:
6045 // TODO: Refine on operands.
6046 return SNaN;
6047 case AMDGPUISD::SIN_HW:
6048 case AMDGPUISD::COS_HW: {
6049 // TODO: Need check for infinity
6050 return SNaN;
6051 }
6052 case ISD::INTRINSIC_WO_CHAIN: {
6053 unsigned IntrinsicID = Op.getConstantOperandVal(0);
6054 // TODO: Handle more intrinsics
6055 switch (IntrinsicID) {
6056 case Intrinsic::amdgcn_cubeid:
6057 case Intrinsic::amdgcn_cvt_off_f32_i4:
6058 return true;
6059
6060 case Intrinsic::amdgcn_frexp_mant: {
6061 if (SNaN)
6062 return true;
6063 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6064 }
6065 case Intrinsic::amdgcn_cvt_pkrtz: {
6066 if (SNaN)
6067 return true;
6068 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6069 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6070 }
6071 case Intrinsic::amdgcn_rcp:
6072 case Intrinsic::amdgcn_rsq:
6073 case Intrinsic::amdgcn_rcp_legacy:
6074 case Intrinsic::amdgcn_rsq_legacy:
6075 case Intrinsic::amdgcn_rsq_clamp:
6076 case Intrinsic::amdgcn_tanh: {
6077 if (SNaN)
6078 return true;
6079
6080 // TODO: Need is known positive check.
6081 return false;
6082 }
6083 case Intrinsic::amdgcn_trig_preop:
6084 case Intrinsic::amdgcn_fdot2:
6085 // TODO: Refine on operand
6086 return SNaN;
6087 case Intrinsic::amdgcn_fma_legacy:
6088 if (SNaN)
6089 return true;
6090 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6091 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
6092 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
6093 default:
6094 return false;
6095 }
6096 }
6097 default:
6098 return false;
6099 }
6100}
6101
6102 bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
6103 Register N0, Register N1) const {
6104 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
6105}
unsigned const MachineRegisterInfo * MRI
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static LLVM_READONLY bool hasSourceMods(const MachineInstr &MI)
static bool isInv2Pi(const APFloat &APF)
static LLVM_READONLY bool opMustUseVOP3Encoding(const MachineInstr &MI, const MachineRegisterInfo &MRI)
returns true if the operation will definitely need to use a 64-bit encoding, and thus will use a VOP3...
static unsigned inverseMinMax(unsigned Opc)
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static unsigned workitemIntrinsicDim(unsigned ID)
static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, int64_t Offset)
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, uint32_t Width, const SDLoc &DL)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static SDValue getAddOneOp(const SDNode *V)
If V is an add of a constant 1, returns the other operand.
static LLVM_READONLY bool selectSupportsSourceMods(const SDNode *N)
Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the type for ISD::SELECT.
static cl::opt< bool > AMDGPUBypassSlowDiv("amdgpu-bypass-slow-div", cl::desc("Skip 64-bit divide for dynamic 32-bit values"), cl::init(true))
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, SDValue N0, SDValue N1, unsigned Size, bool Signed)
static bool fnegFoldsIntoOp(const SDNode *N)
static bool isI24(SDValue Op, SelectionDAG &DAG)
static bool isCttzOpc(unsigned Opc)
static bool isU24(SDValue Op, SelectionDAG &DAG)
static SDValue peekFPSignOps(SDValue Val)
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, unsigned Op, const SDLoc &SL, SDValue Cond, SDValue N1, SDValue N2)
static SDValue peekFNeg(SDValue Val)
static SDValue simplifyMul24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI)
static bool isCtlzOpc(unsigned Opc)
static LLVM_READNONE bool fnegFoldsIntoOpcode(unsigned Opc)
static bool hasVolatileUser(SDNode *Val)
Interface definition of the TargetLowering class that is common to all AMD GPUs.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_READNONE
Definition Compiler.h:315
#define LLVM_READONLY
Definition Compiler.h:322
Provides analysis for querying information about KnownBits during GISel passes.
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
#define T
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition SHA256.cpp:34
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
Value * RHS
Value * LHS
BinaryOperator * Mul
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
void recordNumNamedBarriers(uint32_t GVAddr, unsigned BarCnt)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
Generate Min/Max node.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override
Return the type that should be used to zero or sign extend a zeroext/signext integer return value.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem, unsigned AS) const override
Return true if it is expected to be cheaper to do a store of vector constant with the given size and ...
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool shouldCombineMemoryType(EVT VT) const
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isTruncateFree(EVT Src, EVT Dest) const override
bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override
SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const
Split a vector store into multiple scalar stores.
TargetLowering::NegatibleCost getConstantNegateCost(const ConstantFPSDNode *C) const
SDValue LowerFLOGUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, bool IsLog10, SDNodeFlags Flags) const
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXPUnsafeImpl(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags, bool IsExp10) const
bool isSDNodeAlwaysUniform(const SDNode *N) const override
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
SDValue LowerFLOG10(SDValue Op, SelectionDAG &DAG) const
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const
unsigned computeNumSignBitsForTargetInstr(GISelValueTracking &Analysis, Register R, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isConstantCheaperToNegate(SDValue N) const
bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, Register N1) const override
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
bool isConstantCostlierToNegate(SDValue N) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const
SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
Emit approx-funcs appropriate lowering for exp10.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtType, EVT ExtVT, std::optional< unsigned > ByteOffset) const override
Return true if it is profitable to reduce a load to a smaller type.
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSelectSupported(SelectSupportKind) const override
bool isZExtFree(Type *Src, Type *Dest) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const
SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool mayIgnoreSignedZero(SDValue Op) const
SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const final
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
std::pair< SDValue, SDValue > splitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HighVT, SelectionDAG &DAG) const
Split a vector value into two parts of types LoVT and HiVT.
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
SDValue LowerFLOGCommon(SDValue Op, SelectionDAG &DAG) const
SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, SDValue N) const
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const
bool isFAbsFree(EVT VT) const override
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
SDValue loadStackInputValue(SelectionDAG &DAG, EVT VT, const SDLoc &SL, int64_t Offset) const
Similar to CreateLiveInRegister, except value maybe loaded from a stack slot rather than passed in a ...
SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
static SDValue stripBitcast(SDValue Val)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override
Return a reciprocal estimate value for the input operand.
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
std::pair< SDValue, SDValue > getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, SDValue Op, SDNodeFlags Flags) const
If denormal handling is required return the scaled input to FLOG2, and the check for denormal range.
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue lowerCTLZResults(SDValue Op, SelectionDAG &DAG) const
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const
SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const
SDValue performMulLoHiCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results) const
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
std::pair< EVT, EVT > getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const
Split a vector type into two parts.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue combineFMinMaxLegacyImpl(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
unsigned getVectorIdxWidth(const DataLayout &) const override
Returns the type to be used for the index operand vector operations.
static const fltSemantics & IEEEsingle()
Definition APFloat.h:296
static const fltSemantics & IEEEdouble()
Definition APFloat.h:297
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
bool bitwiseIsEqual(const APFloat &RHS) const
Definition APFloat.h:1477
opStatus add(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1244
const fltSemantics & getSemantics() const
Definition APFloat.h:1520
opStatus multiply(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1262
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1221
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1161
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1549
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1400
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1394
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition APInt.h:1151
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition APInt.h:1397
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
CCState - This class holds information needed while lowering arguments and return values.
static CCValAssign getCustomMem(unsigned ValNo, MVT ValVT, int64_t Offset, MVT LocVT, LocInfo HTP)
const APFloat & getValueAPF() const
bool isNegative() const
Return true if the value is negative.
uint64_t getZExtValue() const
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
Diagnostic information for unsupported feature in backend.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:362
iterator_range< arg_iterator > args()
Definition Function.h:896
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
static auto integer_valuetypes()
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
int getObjectIndexBegin() const
Return the minimum frame object index.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOInvariant
The memory access always returns the same value (or traps).
Flags getFlags() const
Return the raw flags of the source value,.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isSimple() const
Returns true if the memory operation is neither atomic or volatile.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
ArrayRef< SDUse > ops() const
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
iterator_range< user_iterator > users()
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
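A minimal sketch of how this helper is commonly invoked (the function name, pointer info, and 4-byte alignment are assumptions for illustration, not taken from this file):
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
// Sketch: emit a plain, unindexed store of Val through Ptr on the given Chain.
static SDValue emitStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
                         SDValue Val, SDValue Ptr) {
  return DAG.getStore(Chain, DL, Val, Ptr, MachinePointerInfo(), Align(4));
}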
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, which starts a new call frame in which InSize bytes are set up inside ...
bool isConstantValueOfAnyType(SDValue N) const
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
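As an illustrative sketch only (the helper name and the signed less-than condition are chosen for the example), getSelectCC folds a compare and a select into a single node:
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
// Sketch: select the smaller of A and B using a signed less-than compare.
static SDValue selectSMin(SelectionDAG &DAG, const SDLoc &DL, SDValue A, SDValue B) {
  return DAG.getSelectCC(DL, A, B, A, B, ISD::SETLT);
}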
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
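For instance (an assumed sketch, not taken from this file), a combine might ask whether the upper 16 bits of a 32-bit operand are already zero before narrowing it:
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
// Sketch: true if bits [16,31] of the i32 value Op are known to be zero.
static bool upperHalfIsZero(const SelectionDAG &DAG, SDValue Op) {
  APInt HighMask = APInt::getHighBitsSet(32, 16);
  return DAG.MaskedValueIsZero(Op, HighMask);
}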
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
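A hedged example of the EXTRACT_ELEMENT-based split described above (the helper name is invented for illustration): breaking a 64-bit value into its 32-bit halves.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
// Sketch: split an i64 SDValue into {Lo, Hi} i32 parts via EXTRACT_ELEMENT.
static std::pair<SDValue, SDValue> splitI64(SelectionDAG &DAG, const SDLoc &DL,
                                            SDValue Val) {
  return DAG.SplitScalar(Val, DL, MVT::i32, MVT::i32);
}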
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset=std::nullopt) const
Return true if it is profitable to reduce a load to a smaller type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp to/from int conversion the backend supports.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
SelectSupportKind
Enum that describes what type of support for selects the target has.
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL) const
Returns the type for the shift amount of a shift opcode.
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setSupportsUnalignedAtomics(bool UnalignedSupported)
Sets whether unaligned atomic operations are supported.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
unsigned GatherAllAliasesMaxDepth
Depth that GatherAllAliases should continue looking for chain dependencies when trying to find a more...
NegatibleCost
Enum that specifies when a float negation is beneficial.
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
TargetOptions Options
TargetSubtargetInfo - Generic base class for all target subtargets.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
LLVM Value Representation.
Definition Value.h:75
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
bool isIntrinsicAlwaysUniform(unsigned IntrID)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isUniformMMO(const MachineMemOperand *MMO)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition CallingConv.h:47
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:819
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:788
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:600
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:779
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:294
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:522
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:853
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:880
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:584
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:993
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:787
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BRIND
BRIND - Indirect branch.
@ BR_JT
BR_JT - Jumptable branch.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:541
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:548
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:796
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:247
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:704
@ STRICT_FP16_TO_FP
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:614
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition ISDOpcodes.h:48
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:811
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values, following IEEE-754 definition...
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:888
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:805
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:926
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:304
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:565
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:959
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition ISDOpcodes.h:997
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:534
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that are the same as FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
initializer< Ty > init(const Ty &Val)
constexpr double ln2
constexpr double ln10
constexpr float log2ef
Definition MathExtras.h:51
constexpr double log2e
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
@ Offset
Definition DWP.cpp:532
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
MaybeAlign getAlign(const CallInst &I, unsigned Index)
InstructionCost Cost
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:202
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant, stopping at the first 1.
Definition bit.h:236
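A small standalone illustration of these bit utilities (the input values are chosen arbitrarily for the example):
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
int main() {
  assert(llvm::countr_zero(8u) == 3);    // 0b1000 has three trailing zeros.
  assert(llvm::countl_zero(1u) == 31);   // a 32-bit 1 has 31 leading zeros.
  assert(llvm::PowerOf2Ceil(48) == 64);  // round 48 up to the next power of two.
  return 0;
}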
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
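For example (a standalone sketch, with an arbitrary immediate chosen for the demonstration), splitting a 64-bit constant into the two 32-bit halves a pair of 32-bit moves would consume:
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
int main() {
  uint64_t Imm = 0x123456789abcdef0ULL;
  assert(llvm::Hi_32(Imm) == 0x12345678u); // upper 32 bits
  assert(llvm::Lo_32(Imm) == 0x9abcdef0u); // lower 32 bits
  return 0;
}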
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
CombineLevel
Definition DAGCombine.h:15
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ BeforeLegalizeTypes
Definition DAGCombine.h:16
@ AfterLegalizeTypes
Definition DAGCombine.h:17
To bit_cast(const From &from) noexcept
Definition bit.h:90
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
DWARFExpression::Operation Op
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
APFloat neg(APFloat X)
Returns the negated value of the argument.
Definition APFloat.h:1632
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
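A quick standalone illustration of the alignment helpers listed above (the sizes and offsets are arbitrary example values):
#include "llvm/Support/Alignment.h"
#include <cassert>
int main() {
  llvm::Align A(16);
  assert(llvm::alignTo(40, A) == 48);                 // round 40 up to a multiple of 16.
  assert(llvm::Log2(A) == 4);                         // 16 == 1 << 4.
  assert(llvm::commonAlignment(A, 8).value() == 8);   // an offset of 8 limits the alignment to 8.
  return 0;
}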
static cl::opt< unsigned > CostThreshold("dfa-cost-threshold", cl::desc("Maximum cost accepted for the transformation"), cl::Hidden, cl::init(50))
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:177
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
MCRegister getRegister() const
unsigned getStackOffset() const
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
EVT getPow2VectorType(LLVMContext &Context) const
Widens the length of the given vector EVT up to the nearest power of 2 and returns that type.
Definition ValueTypes.h:477
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition ValueTypes.h:430
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
EVT getRoundIntegerType(LLVMContext &Context) const
Rounds the bit-width of the given integer EVT up to the nearest power of two (and at least to eight),...
Definition ValueTypes.h:419
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition ValueTypes.h:292
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isExtended() const
Test if the given EVT is extended (as opposed to being simple).
Definition ValueTypes.h:142
EVT changeElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:113
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition ValueTypes.h:308
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
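As an assumed, standalone sketch of how a few of these EVT queries compose (the context object and the specific types are invented for the example):
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>
int main() {
  llvm::LLVMContext Ctx;
  // An odd-sized integer type rounds up to the next power-of-two width.
  llvm::EVT I48 = llvm::EVT::getIntegerVT(Ctx, 48);
  assert(I48.getRoundIntegerType(Ctx) == llvm::MVT::i64);
  // A 3-element vector is not a power-of-two vector; widening fixes that.
  llvm::EVT V3I32 = llvm::EVT::getVectorVT(Ctx, llvm::MVT::i32, 3);
  assert(!V3I32.isPow2VectorType());
  assert(V3I32.getPow2VectorType(Ctx).getVectorNumElements() == 4);
  return 0;
}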
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition KnownBits.h:108
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:255
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:164
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:309
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:261
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:148
APInt getMinValue() const
Return the minimal unsigned value possible given these KnownBits.
Definition KnownBits.h:132
bool isStrictlyPositive() const
Returns true if this value is known to be positive.
Definition KnownBits.h:114
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:105
unsigned countMaxSignificantBits() const
Returns the maximum number of bits needed to represent all possible signed values with these known bi...
Definition KnownBits.h:282
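A hedged sketch (the helper name and the 16-bit threshold are assumptions, not from this file) of how these KnownBits queries are commonly driven from a DAG:
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
// Sketch: query known bits of Op and test whether 16 signed bits suffice.
static bool fitsInSigned16(const SelectionDAG &DAG, SDValue Op) {
  KnownBits Known = DAG.computeKnownBits(Op);
  return Known.countMaxSignificantBits() <= 16;
}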
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
These are IR-level optimization flags that may be propagated to SDNodes.
void setAllowContract(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...