AMDGPUISelLowering.cpp (LLVM 19.0.0git)
1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
24#include "llvm/IR/IntrinsicsAMDGPU.h"
29
30using namespace llvm;
31
32#include "AMDGPUGenCallingConv.inc"
33
35 "amdgpu-bypass-slow-div",
36 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
37 cl::init(true));
38
39// Find a larger type to do a load / store of a vector with.
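// For example, a v4i16 vector (64-bit store size) maps to v2i32, a v2i8
// vector (16-bit store size) maps to i16, and a v3i16 vector (48-bit store
// size) is returned unchanged because 48 is not a multiple of 32.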
40EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
41  unsigned StoreSize = VT.getStoreSizeInBits();
42 if (StoreSize <= 32)
43 return EVT::getIntegerVT(Ctx, StoreSize);
44
45 if (StoreSize % 32 == 0)
46 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
47
48 return VT;
49}
50
51unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
52  return DAG.computeKnownBits(Op).countMaxActiveBits();
53}
54
55unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
56  // In order for this to be a signed 24-bit value, bit 23 must
57  // be a sign bit.
58  return DAG.ComputeMaxSignificantBits(Op);
59}
60
61AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
62                                           const AMDGPUSubtarget &STI)
63    : TargetLowering(TM), Subtarget(&STI) {
64  // Always lower memset, memcpy, and memmove intrinsics to load/store
65  // instructions, rather than generating calls to memset, memcpy, or memmove.
69
70 // Lower floating point store/load to integer store/load to reduce the number
71 // of patterns in tablegen.
73 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
74
76 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
77
79 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
80
82 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
83
85 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
86
88 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
89
91 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
92
94 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
95
97 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
98
100 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
101
102 setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
103 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
104
105 setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
106 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
107
108 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
109 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
110
111 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
112 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
113
115 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
116
118 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
119
121 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
122
124 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
125
127 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
128
130 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
131
133 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
134
136 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
137
139 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
140
142 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
143
144 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
145 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
146
147 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
148 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
149
151 AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
152
153 // TODO: Would be better to consume as directly legal
155 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
156
158 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
159
161 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
162
164 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);
165
167 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);
168
170 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);
171
173 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);
174
176 AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);
177
178 // There are no 64-bit extloads. These should be done as a 32-bit extload and
179 // an extension to 64-bit.
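  // For example, a 64-bit sextload from i16 is done as a 32-bit sextload from
  // i16 followed by a SIGN_EXTEND of the result to i64.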
180 for (MVT VT : MVT::integer_valuetypes())
182 Expand);
183
184 for (MVT VT : MVT::integer_valuetypes()) {
185 if (VT == MVT::i64)
186 continue;
187
188 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
189 setLoadExtAction(Op, VT, MVT::i1, Promote);
190 setLoadExtAction(Op, VT, MVT::i8, Legal);
191 setLoadExtAction(Op, VT, MVT::i16, Legal);
192 setLoadExtAction(Op, VT, MVT::i32, Expand);
193 }
194 }
195
197 for (auto MemVT :
198 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
200 Expand);
201
202 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
203 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
204 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
205 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
206 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
207 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
208 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
209 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
210 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
211 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
212 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
213 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
214 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
215 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);
216
217 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
218 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
219 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
220 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
221 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
222 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
223
224 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
225 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
226 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
227 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
228 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
229 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
230 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
231 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
232 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
233 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
234 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
235 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);
236
238 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
239
241 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
242
244 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
245
247 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
248
250 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
251
253 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
254
256 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
257
259 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
260
262 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
263
265 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
266
268 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
269
271 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
272
274 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
275
277 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
278
280 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
281
283 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
284
286 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
287
289 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
290
292 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
293
295 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
296
298 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
299
301 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
302
304 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
305
307 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
308
310 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
311
313 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
314
316 AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);
317
318 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
319 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
320 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
321 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
322
323 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
324 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
325 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
326 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
327
328 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
329 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
330 setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
331 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
332 setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
333 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
334 setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
335 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
336 setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
337 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
338 setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
339 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
340 setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
341 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
342
343 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
344 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
345 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
346
347 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
348 setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
349 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
350
351 setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);
352
353 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
354 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
355 setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
356 setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
357 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
358 setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
359 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
360
361 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
362 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
363 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
364 setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
365 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
366
367 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
368 setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
369 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
370
371 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
372 setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
373 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
374  setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
376  setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
378 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
379
380 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
381 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
382
384
385  // For R600, this is totally unsupported; just custom lower it to produce an
386  // error.
388
389 // Library functions. These default to Expand, but we have instructions
390 // for them.
393 MVT::f32, Legal);
394
396 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
397
400 Custom);
401
402 setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
403
404 setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
405
406 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
407
408 if (Subtarget->has16BitInsts())
409 setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
410 else {
411 setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
413 }
414
416 Custom);
417
418 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
419 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
420 // default unless marked custom/legal.
423 {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16, MVT::v2f32, MVT::v3f32,
424 MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
425 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64, MVT::v16f64},
426 Custom);
427
428 // Expand to fneg + fadd.
430
432 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
433 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
434 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
435 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
436 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
437 Custom);
438
439 // FIXME: Why is v8f16/v8bf16 missing?
442 {MVT::v2f16, MVT::v2bf16, MVT::v2i16, MVT::v4f16, MVT::v4bf16,
443 MVT::v4i16, MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32,
444 MVT::v4f32, MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32,
445 MVT::v6i32, MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32,
446 MVT::v9f32, MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32,
447 MVT::v11f32, MVT::v12i32, MVT::v12f32, MVT::v16f16, MVT::v16bf16,
448 MVT::v16i16, MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
449 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
450 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64,
451 MVT::v32i16, MVT::v32f16, MVT::v32bf16},
452 Custom);
453
455 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
456
457 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
458 for (MVT VT : ScalarIntVTs) {
459 // These should use [SU]DIVREM, so set them to expand
461 Expand);
462
463    // GPU does not have divrem functions for signed or unsigned.
465
466 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
468
470
471 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
473 }
474
475 // The hardware supports 32-bit FSHR, but not FSHL.
477
478 // The hardware supports 32-bit ROTR, but not ROTL.
479 setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
481
483
487 MVT::i64, Custom);
489
491 Legal);
492
495 MVT::i64, Custom);
496
497 for (auto VT : {MVT::i8, MVT::i16})
499
500 static const MVT::SimpleValueType VectorIntTypes[] = {
501 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
502 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
503
504 for (MVT VT : VectorIntTypes) {
505 // Expand the following operations for the current type by default.
517 ISD::SETCC},
518 VT, Expand);
519 }
520
521 static const MVT::SimpleValueType FloatVectorTypes[] = {
522 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
523 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
524
525 for (MVT VT : FloatVectorTypes) {
538 VT, Expand);
539 }
540
541  // This causes the use of an unrolled select operation rather than expansion
542  // with bit operations. This is in general better, but the alternative using
543  // BFI instructions may be better if the select sources are SGPRs.
545 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
546
548 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
549
551 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
552
554 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
555
557 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
558
560 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
561
563 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
564
566 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
567
569 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
570
572 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
573
574 // Disable most libcalls.
575 for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) {
576 if (I < RTLIB::ATOMIC_LOAD || I > RTLIB::ATOMIC_FETCH_NAND_16)
577 setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
578 }
579
581 setJumpIsExpensive(true);
582
583 // FIXME: This is only partially true. If we have to do vector compares, any
584 // SGPR pair can be a condition register. If we have a uniform condition, we
585 // are better off doing SALU operations, where there is only one SCC. For now,
586 // we don't have a way of knowing during instruction selection if a condition
587 // will be uniform and we always use vector compares. Assume we are using
588 // vector compares until that is fixed.
590
593
595
596 // We want to find all load dependencies for long chains of stores to enable
597 // merging into very wide vectors. The problem is with vectors with > 4
598 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
599 // vectors are a legal type, even though we have to split the loads
600 // usually. When we can more precisely specify load legality per address
601 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
602 // smarter so that they can figure out what to do in 2 iterations without all
603 // N > 4 stores on the same chain.
605
606 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
607 // about these during lowering.
608 MaxStoresPerMemcpy = 0xffffffff;
609 MaxStoresPerMemmove = 0xffffffff;
610 MaxStoresPerMemset = 0xffffffff;
611
612 // The expansion for 64-bit division is enormous.
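  // addBypassSlowDiv(64, 32) requests a runtime bypass: when both i64 operands
  // actually fit in 32 bits, a 32-bit divide is used instead of the full
  // 64-bit expansion (implemented by CodeGenPrepare's slow-division bypass).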
613  if (AMDGPUBypassSlowDiv)
614    addBypassSlowDiv(64, 32);
615
626
630}
631
633 if (getTargetMachine().Options.NoSignedZerosFPMath)
634 return true;
635
636 const auto Flags = Op.getNode()->getFlags();
637 if (Flags.hasNoSignedZeros())
638 return true;
639
640 return false;
641}
642
643//===----------------------------------------------------------------------===//
644// Target Information
645//===----------------------------------------------------------------------===//
646
648static bool fnegFoldsIntoOpcode(unsigned Opc) {
649 switch (Opc) {
650 case ISD::FADD:
651 case ISD::FSUB:
652 case ISD::FMUL:
653 case ISD::FMA:
654 case ISD::FMAD:
655 case ISD::FMINNUM:
656 case ISD::FMAXNUM:
659 case ISD::FMINIMUM:
660 case ISD::FMAXIMUM:
661 case ISD::SELECT:
662 case ISD::FSIN:
663 case ISD::FTRUNC:
664 case ISD::FRINT:
665 case ISD::FNEARBYINT:
666 case ISD::FROUNDEVEN:
668 case AMDGPUISD::RCP:
675 case AMDGPUISD::FMED3:
676 // TODO: handle llvm.amdgcn.fma.legacy
677 return true;
678 case ISD::BITCAST:
679 llvm_unreachable("bitcast is special cased");
680 default:
681 return false;
682 }
683}
684
685static bool fnegFoldsIntoOp(const SDNode *N) {
686 unsigned Opc = N->getOpcode();
687 if (Opc == ISD::BITCAST) {
688 // TODO: Is there a benefit to checking the conditions performFNegCombine
689 // does? We don't for the other cases.
690 SDValue BCSrc = N->getOperand(0);
691 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
692 return BCSrc.getNumOperands() == 2 &&
693 BCSrc.getOperand(1).getValueSizeInBits() == 32;
694 }
695
696 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
697 }
698
699 return fnegFoldsIntoOpcode(Opc);
700}
701
702/// \returns true if the operation will definitely need to use a 64-bit
703/// encoding, and thus will use a VOP3 encoding regardless of the source
704/// modifiers.
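/// For example, FMA and FMAD take three source operands, so they already
/// require the 64-bit VOP3 encoding and a folded fneg/fabs source modifier is
/// free for them.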
706static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
707 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
708 VT == MVT::f64;
709}
710
711/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers
712/// for the given type when used for ISD::SELECT.
714static bool selectSupportsSourceMods(const SDNode *N) {
715 // TODO: Only applies if select will be vector
716 return N->getValueType(0) == MVT::f32;
717}
718
719// Most FP instructions support source modifiers, but this could be refined
720// slightly.
722static bool hasSourceMods(const SDNode *N) {
723 if (isa<MemSDNode>(N))
724 return false;
725
726 switch (N->getOpcode()) {
727 case ISD::CopyToReg:
728 case ISD::FDIV:
729 case ISD::FREM:
730 case ISD::INLINEASM:
734
735 // TODO: Should really be looking at the users of the bitcast. These are
736 // problematic because bitcasts are used to legalize all stores to integer
737 // types.
738 case ISD::BITCAST:
739 return false;
741 switch (N->getConstantOperandVal(0)) {
742 case Intrinsic::amdgcn_interp_p1:
743 case Intrinsic::amdgcn_interp_p2:
744 case Intrinsic::amdgcn_interp_mov:
745 case Intrinsic::amdgcn_interp_p1_f16:
746 case Intrinsic::amdgcn_interp_p2_f16:
747 return false;
748 default:
749 return true;
750 }
751 }
752 case ISD::SELECT:
754 default:
755 return true;
756 }
757}
758
760 unsigned CostThreshold) {
761  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
762  // it is truly free to use a source modifier in all cases. If there are
763  // multiple users and each one would have to switch to the VOP3 encoding,
764  // there will be a code size increase. Try to avoid increasing code size
765  // unless we know it will save on the instruction count.
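  // For example, an fneg feeding both an FMA (already VOP3, so the modifier is
  // free) and a plain FADD (which would have to move from the 32-bit VOP2
  // encoding to VOP3 to take the modifier) counts only the FADD use against
  // CostThreshold.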
766 unsigned NumMayIncreaseSize = 0;
767 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
768
769 assert(!N->use_empty());
770
771 // XXX - Should this limit number of uses to check?
772 for (const SDNode *U : N->uses()) {
773 if (!hasSourceMods(U))
774 return false;
775
776 if (!opMustUseVOP3Encoding(U, VT)) {
777 if (++NumMayIncreaseSize > CostThreshold)
778 return false;
779 }
780 }
781
782 return true;
783}
784
786 ISD::NodeType ExtendKind) const {
787 assert(!VT.isVector() && "only scalar expected");
788
789 // Round to the next multiple of 32-bits.
790 unsigned Size = VT.getSizeInBits();
791 if (Size <= 32)
792 return MVT::i32;
793 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
794}
795
797 return MVT::i32;
798}
799
801 return true;
802}
803
804// The backend supports 32 and 64 bit floating point immediates.
805// FIXME: Why are we reporting vectors of FP immediates as legal?
807 bool ForCodeSize) const {
808 EVT ScalarVT = VT.getScalarType();
809 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
810 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
811}
812
813// We don't want to shrink f64 / f32 constants.
815 EVT ScalarVT = VT.getScalarType();
816 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
817}
818
820 ISD::LoadExtType ExtTy,
821 EVT NewVT) const {
822 // TODO: This may be worth removing. Check regression tests for diffs.
824 return false;
825
826 unsigned NewSize = NewVT.getStoreSizeInBits();
827
828 // If we are reducing to a 32-bit load or a smaller multi-dword load,
829 // this is always better.
830 if (NewSize >= 32)
831 return true;
832
833 EVT OldVT = N->getValueType(0);
834 unsigned OldSize = OldVT.getStoreSizeInBits();
835
836 MemSDNode *MN = cast<MemSDNode>(N);
837 unsigned AS = MN->getAddressSpace();
838 // Do not shrink an aligned scalar load to sub-dword.
839 // Scalar engine cannot do sub-dword loads.
840 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
841 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
844 (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
845 MN->isInvariant())) &&
847 return false;
848
849 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
850 // extloads, so doing one requires using a buffer_load. In cases where we
851 // still couldn't use a scalar load, using the wider load shouldn't really
852 // hurt anything.
853
854 // If the old size already had to be an extload, there's no harm in continuing
855 // to reduce the width.
856 return (OldSize < 32);
857}
858
860 const SelectionDAG &DAG,
861 const MachineMemOperand &MMO) const {
862
863 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
864
865 if (LoadTy.getScalarType() == MVT::i32)
866 return false;
867
868 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
869 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
870
871 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
872 return false;
873
874 unsigned Fast = 0;
876 CastTy, MMO, &Fast) &&
877 Fast;
878}
879
880// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
881// profitable with the expansion for 64-bit since it's generally good to
882// speculate things.
884 return true;
885}
886
888 return true;
889}
890
892 switch (N->getOpcode()) {
893 case ISD::EntryToken:
894 case ISD::TokenFactor:
895 return true;
897 unsigned IntrID = N->getConstantOperandVal(0);
899 }
900 case ISD::LOAD:
901 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
903 return true;
904 return false;
905 case AMDGPUISD::SETCC: // ballot-style instruction
906 return true;
907 }
908 return false;
909}
910
912 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
913 NegatibleCost &Cost, unsigned Depth) const {
914
915 switch (Op.getOpcode()) {
916 case ISD::FMA:
917 case ISD::FMAD: {
918 // Negating a fma is not free if it has users without source mods.
919 if (!allUsesHaveSourceMods(Op.getNode()))
920 return SDValue();
921 break;
922 }
923 case AMDGPUISD::RCP: {
924 SDValue Src = Op.getOperand(0);
925 EVT VT = Op.getValueType();
926 SDLoc SL(Op);
927
928 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
929 ForCodeSize, Cost, Depth + 1);
930 if (NegSrc)
931 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
932 return SDValue();
933 }
934 default:
935 break;
936 }
937
938 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
939 ForCodeSize, Cost, Depth);
940}
941
942//===---------------------------------------------------------------------===//
943// Target Properties
944//===---------------------------------------------------------------------===//
945
948
949 // Packed operations do not have a fabs modifier.
950 return VT == MVT::f32 || VT == MVT::f64 ||
951 (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16));
952}
953
956 // Report this based on the end legalized type.
957 VT = VT.getScalarType();
958 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
959}
960
962 unsigned NumElem,
963 unsigned AS) const {
964 return true;
965}
966
968 // There are few operations which truly have vector input operands. Any vector
969 // operation is going to involve operations on each component, and a
970 // build_vector will be a copy per element, so it always makes sense to use a
971 // build_vector input in place of the extracted element to avoid a copy into a
972 // super register.
973 //
974 // We should probably only do this if all users are extracts only, but this
975 // should be the common case.
976 return true;
977}
978
980 // Truncate is just accessing a subregister.
981
982 unsigned SrcSize = Source.getSizeInBits();
983 unsigned DestSize = Dest.getSizeInBits();
984
985  return DestSize < SrcSize && DestSize % 32 == 0;
986}
987
989 // Truncate is just accessing a subregister.
990
991 unsigned SrcSize = Source->getScalarSizeInBits();
992 unsigned DestSize = Dest->getScalarSizeInBits();
993
994  if (DestSize == 16 && Subtarget->has16BitInsts())
995 return SrcSize >= 32;
996
997 return DestSize < SrcSize && DestSize % 32 == 0;
998}
999
1001 unsigned SrcSize = Src->getScalarSizeInBits();
1002 unsigned DestSize = Dest->getScalarSizeInBits();
1003
1004 if (SrcSize == 16 && Subtarget->has16BitInsts())
1005 return DestSize >= 32;
1006
1007 return SrcSize == 32 && DestSize == 64;
1008}
1009
1011  // Any register load of a 64-bit value really requires 2 32-bit moves. For
1012  // all practical purposes, the extra mov 0 to load a 64-bit value is free. As
1013  // used, this will enable reducing 64-bit operations to 32-bit, which is
1014  // always good.
1015
1016 if (Src == MVT::i16)
1017    return Dest == MVT::i32 || Dest == MVT::i64;
1018
1019 return Src == MVT::i32 && Dest == MVT::i64;
1020}
1021
1023 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
1024 // limited number of native 64-bit operations. Shrinking an operation to fit
1025 // in a single 32-bit register should always be helpful. As currently used,
1026 // this is much less general than the name suggests, and is only used in
1027 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
1028 // not profitable, and may actually be harmful.
1029 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
1030}
1031
1033 const SDNode* N, CombineLevel Level) const {
1034 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
1035 N->getOpcode() == ISD::SRL) &&
1036 "Expected shift op");
1037 // Always commute pre-type legalization and right shifts.
1038 // We're looking for shl(or(x,y),z) patterns.
1040 N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
1041 return true;
1042
1043  // If the only user is an i32 right-shift, then don't destroy a BFE pattern.
1044 if (N->getValueType(0) == MVT::i32 && N->use_size() == 1 &&
1045 (N->use_begin()->getOpcode() == ISD::SRA ||
1046 N->use_begin()->getOpcode() == ISD::SRL))
1047 return false;
1048
1049 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
1050 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
1051 if (LHS.getOpcode() != ISD::SHL)
1052 return false;
1053 auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
1054 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
1055 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
1056 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
1057 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
1058 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
1059 };
1060 SDValue LHS = N->getOperand(0).getOperand(0);
1061 SDValue RHS = N->getOperand(0).getOperand(1);
1062 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
1063}
1064
1065//===---------------------------------------------------------------------===//
1066// TargetLowering Callbacks
1067//===---------------------------------------------------------------------===//
1068
1070 bool IsVarArg) {
1071 switch (CC) {
1079 return CC_AMDGPU;
1082 return CC_AMDGPU_CS_CHAIN;
1083 case CallingConv::C:
1084 case CallingConv::Fast:
1085 case CallingConv::Cold:
1086 return CC_AMDGPU_Func;
1088 return CC_SI_Gfx;
1091 default:
1092 report_fatal_error("Unsupported calling convention for call");
1093 }
1094}
1095
1097 bool IsVarArg) {
1098 switch (CC) {
1101 llvm_unreachable("kernels should not be handled here");
1111 return RetCC_SI_Shader;
1113 return RetCC_SI_Gfx;
1114 case CallingConv::C:
1115 case CallingConv::Fast:
1116 case CallingConv::Cold:
1117 return RetCC_AMDGPU_Func;
1118 default:
1119 report_fatal_error("Unsupported calling convention.");
1120 }
1121}
1122
1123/// The SelectionDAGBuilder will automatically promote function arguments
1124/// with illegal types. However, this does not work for the AMDGPU targets
1125/// since the function arguments are stored in memory as these illegal types.
1126/// In order to handle this properly we need to get the original types sizes
1127/// from the LLVM IR Function and fix up the ISD::InputArg values before
1128/// passing them to AnalyzeFormalArguments()
1129
1130/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1131/// input values across multiple registers. Each item in the Ins array
1132/// represents a single value that will be stored in registers. Ins[x].VT is
1133/// the value type of the value that will be stored in the register, so
1134/// whatever SDNode we lower the argument to needs to be this type.
1135///
1136/// In order to correctly lower the arguments we need to know the size of each
1137/// argument. Since Ins[x].VT gives us the size of the register that will
1138/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1139/// for the original function argument so that we can deduce the correct memory
1140/// type to use for Ins[x]. In most cases the correct memory type will be
1141/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1142/// we have a kernel argument of type v8i8, this argument will be split into
1143/// 8 parts and each part will be represented by its own item in the Ins array.
1144/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1145/// the argument before it was split. From this, we deduce that the memory type
1146/// for each individual part is i8. We pass the memory type as LocVT to the
1147/// calling convention analysis function and the register type (Ins[x].VT) as
1148/// the ValVT.
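/// For example, following the v8i8 case above: each of the 8 parts is added
/// below as its own CCValAssign with LocVT = i8 and a memory offset that
/// advances by one byte per part.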
1150 CCState &State,
1151 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1152 const MachineFunction &MF = State.getMachineFunction();
1153 const Function &Fn = MF.getFunction();
1154 LLVMContext &Ctx = Fn.getParent()->getContext();
1155 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
1156 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
1158
1159 Align MaxAlign = Align(1);
1160 uint64_t ExplicitArgOffset = 0;
1161 const DataLayout &DL = Fn.getParent()->getDataLayout();
1162
1163 unsigned InIndex = 0;
1164
1165 for (const Argument &Arg : Fn.args()) {
1166 const bool IsByRef = Arg.hasByRefAttr();
1167 Type *BaseArgTy = Arg.getType();
1168 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1169 Align Alignment = DL.getValueOrABITypeAlignment(
1170 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1171 MaxAlign = std::max(Alignment, MaxAlign);
1172 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1173
1174 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1175 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
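    // For example, an i32 argument followed by a double: the i32 occupies
    // bytes [0,4) of the explicit kernarg area, the double is then aligned up
    // to 8 and occupies [8,16), and ExplicitArgOffset advances 0 -> 4 -> 16.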
1176
1177 // We're basically throwing away everything passed into us and starting over
1178 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1179 // to us as computed in Ins.
1180 //
1181 // We also need to figure out what type legalization is trying to do to get
1182 // the correct memory offsets.
1183
1184 SmallVector<EVT, 16> ValueVTs;
1186 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
1187
1188 for (unsigned Value = 0, NumValues = ValueVTs.size();
1189 Value != NumValues; ++Value) {
1190 uint64_t BasePartOffset = Offsets[Value];
1191
1192 EVT ArgVT = ValueVTs[Value];
1193 EVT MemVT = ArgVT;
1194 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1195 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1196
1197 if (NumRegs == 1) {
1198 // This argument is not split, so the IR type is the memory type.
1199 if (ArgVT.isExtended()) {
1200 // We have an extended type, like i24, so we should just use the
1201 // register type.
1202 MemVT = RegisterVT;
1203 } else {
1204 MemVT = ArgVT;
1205 }
1206 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1207 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1208 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1209 // We have a vector value which has been split into a vector with
1210 // the same scalar type, but fewer elements. This should handle
1211 // all the floating-point vector types.
1212 MemVT = RegisterVT;
1213 } else if (ArgVT.isVector() &&
1214 ArgVT.getVectorNumElements() == NumRegs) {
1215 // This arg has been split so that each element is stored in a separate
1216 // register.
1217 MemVT = ArgVT.getScalarType();
1218 } else if (ArgVT.isExtended()) {
1219 // We have an extended type, like i65.
1220 MemVT = RegisterVT;
1221 } else {
1222 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1223 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1224 if (RegisterVT.isInteger()) {
1225 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1226 } else if (RegisterVT.isVector()) {
1227 assert(!RegisterVT.getScalarType().isFloatingPoint());
1228 unsigned NumElements = RegisterVT.getVectorNumElements();
1229 assert(MemoryBits % NumElements == 0);
1230          // This vector type has been split into another vector type with
1231          // a different element size.
1232 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1233 MemoryBits / NumElements);
1234 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1235 } else {
1236 llvm_unreachable("cannot deduce memory type.");
1237 }
1238 }
1239
1240 // Convert one element vectors to scalar.
1241 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1242 MemVT = MemVT.getScalarType();
1243
1244 // Round up vec3/vec5 argument.
1245 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1246 assert(MemVT.getVectorNumElements() == 3 ||
1247 MemVT.getVectorNumElements() == 5 ||
1248 (MemVT.getVectorNumElements() >= 9 &&
1249 MemVT.getVectorNumElements() <= 12));
1250 MemVT = MemVT.getPow2VectorType(State.getContext());
1251 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1252 MemVT = MemVT.getRoundIntegerType(State.getContext());
1253 }
1254
1255 unsigned PartOffset = 0;
1256 for (unsigned i = 0; i != NumRegs; ++i) {
1257 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1258 BasePartOffset + PartOffset,
1259 MemVT.getSimpleVT(),
1261 PartOffset += MemVT.getStoreSize();
1262 }
1263 }
1264 }
1265}
1266
1268 SDValue Chain, CallingConv::ID CallConv,
1269 bool isVarArg,
1271 const SmallVectorImpl<SDValue> &OutVals,
1272 const SDLoc &DL, SelectionDAG &DAG) const {
1273 // FIXME: Fails for r600 tests
1274 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1275 // "wave terminate should not have return values");
1276 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1277}
1278
1279//===---------------------------------------------------------------------===//
1280// Target specific lowering
1281//===---------------------------------------------------------------------===//
1282
1283/// Selects the correct CCAssignFn for a given CallingConvention value.
1285 bool IsVarArg) {
1286 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1287}
1288
1290 bool IsVarArg) {
1292}
1293
1295 SelectionDAG &DAG,
1296 MachineFrameInfo &MFI,
1297 int ClobberedFI) const {
1298 SmallVector<SDValue, 8> ArgChains;
1299 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1300 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1301
1302 // Include the original chain at the beginning of the list. When this is
1303 // used by target LowerCall hooks, this helps legalize find the
1304 // CALLSEQ_BEGIN node.
1305 ArgChains.push_back(Chain);
1306
1307  // Add a chain value for each stack argument that may overlap the clobbered object.
1308 for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
1309 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1310 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1311 if (FI->getIndex() < 0) {
1312 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1313 int64_t InLastByte = InFirstByte;
1314 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1315
1316 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1317 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1318 ArgChains.push_back(SDValue(L, 1));
1319 }
1320 }
1321 }
1322 }
1323
1324 // Build a tokenfactor for all the chains.
1325 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1326}
1327
1330 StringRef Reason) const {
1331 SDValue Callee = CLI.Callee;
1332 SelectionDAG &DAG = CLI.DAG;
1333
1334 const Function &Fn = DAG.getMachineFunction().getFunction();
1335
1336 StringRef FuncName("<unknown>");
1337
1338 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1339 FuncName = G->getSymbol();
1340 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1341 FuncName = G->getGlobal()->getName();
1342
1344 Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1345 DAG.getContext()->diagnose(NoCalls);
1346
1347 if (!CLI.IsTailCall) {
1348 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
1349 InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
1350 }
1351
1352 return DAG.getEntryNode();
1353}
1354
1356 SmallVectorImpl<SDValue> &InVals) const {
1357 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1358}
1359
1361 SelectionDAG &DAG) const {
1362 const Function &Fn = DAG.getMachineFunction().getFunction();
1363
1364 DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1365 SDLoc(Op).getDebugLoc());
1366 DAG.getContext()->diagnose(NoDynamicAlloca);
1367 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1368 return DAG.getMergeValues(Ops, SDLoc());
1369}
1370
1372 SelectionDAG &DAG) const {
1373 switch (Op.getOpcode()) {
1374 default:
1375 Op->print(errs(), &DAG);
1376 llvm_unreachable("Custom lowering code for this "
1377 "instruction is not implemented yet!");
1378 break;
1380 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1382 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1383 case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1384 case ISD::FREM: return LowerFREM(Op, DAG);
1385 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1386 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1387 case ISD::FRINT: return LowerFRINT(Op, DAG);
1388 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1389 case ISD::FROUNDEVEN:
1390 return LowerFROUNDEVEN(Op, DAG);
1391 case ISD::FROUND: return LowerFROUND(Op, DAG);
1392 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1393 case ISD::FLOG2:
1394 return LowerFLOG2(Op, DAG);
1395 case ISD::FLOG:
1396 case ISD::FLOG10:
1397 return LowerFLOGCommon(Op, DAG);
1398 case ISD::FEXP:
1399 case ISD::FEXP10:
1400 return lowerFEXP(Op, DAG);
1401 case ISD::FEXP2:
1402 return lowerFEXP2(Op, DAG);
1403 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1404 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1405 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1406 case ISD::FP_TO_SINT:
1407 case ISD::FP_TO_UINT:
1408 return LowerFP_TO_INT(Op, DAG);
1409 case ISD::CTTZ:
1411 case ISD::CTLZ:
1413 return LowerCTLZ_CTTZ(Op, DAG);
1415 }
1416 return Op;
1417}
1418
1421 SelectionDAG &DAG) const {
1422 switch (N->getOpcode()) {
1424 // Different parts of legalization seem to interpret which type of
1425 // sign_extend_inreg is the one to check for custom lowering. The extended
1426 // from type is what really matters, but some places check for custom
1427 // lowering of the result type. This results in trying to use
1428 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1429 // nothing here and let the illegal result integer be handled normally.
1430 return;
1431 case ISD::FLOG2:
1432 if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
1433 Results.push_back(Lowered);
1434 return;
1435 case ISD::FLOG:
1436 case ISD::FLOG10:
1437 if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
1438 Results.push_back(Lowered);
1439 return;
1440 case ISD::FEXP2:
1441 if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
1442 Results.push_back(Lowered);
1443 return;
1444 case ISD::FEXP:
1445 case ISD::FEXP10:
1446 if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
1447 Results.push_back(Lowered);
1448 return;
1449 case ISD::CTLZ:
1451 if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
1452 Results.push_back(Lowered);
1453 return;
1454 default:
1455 return;
1456 }
1457}
1458
1460 SDValue Op,
1461 SelectionDAG &DAG) const {
1462
1463 const DataLayout &DL = DAG.getDataLayout();
1464 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1465 const GlobalValue *GV = G->getGlobal();
1466
1467 if (!MFI->isModuleEntryFunction()) {
1468 if (std::optional<uint32_t> Address =
1470 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1471 }
1472 }
1473
1474 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1475 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1476 if (!MFI->isModuleEntryFunction() &&
1477 GV->getName() != "llvm.amdgcn.module.lds") {
1478 SDLoc DL(Op);
1479 const Function &Fn = DAG.getMachineFunction().getFunction();
1480 DiagnosticInfoUnsupported BadLDSDecl(
1481 Fn, "local memory global used by non-kernel function",
1482 DL.getDebugLoc(), DS_Warning);
1483 DAG.getContext()->diagnose(BadLDSDecl);
1484
1485 // We currently don't have a way to correctly allocate LDS objects that
1486 // aren't directly associated with a kernel. We do force inlining of
1487 // functions that use local objects. However, if these dead functions are
1488 // not eliminated, we don't want a compile time error. Just emit a warning
1489 // and a trap, since there should be no callable path here.
1490 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1491 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1492 Trap, DAG.getRoot());
1493 DAG.setRoot(OutputChain);
1494 return DAG.getUNDEF(Op.getValueType());
1495 }
1496
1497 // XXX: What does the value of G->getOffset() mean?
1498 assert(G->getOffset() == 0 &&
1499 "Do not know what to do with an non-zero offset");
1500
1501 // TODO: We could emit code to handle the initialization somewhere.
1502 // We ignore the initializer for now and legalize it to allow selection.
1503 // The initializer will anyway get errored out during assembly emission.
1504 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1505 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1506 }
1507 return SDValue();
1508}
1509
1511 SelectionDAG &DAG) const {
1513 SDLoc SL(Op);
1514
1515 EVT VT = Op.getValueType();
1516 if (VT.getVectorElementType().getSizeInBits() < 32) {
1517 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1518 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1519 unsigned NewNumElt = OpBitSize / 32;
1520 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1521                                     : EVT::getVectorVT(*DAG.getContext(),
1522                                                        MVT::i32, NewNumElt);
1523 for (const SDUse &U : Op->ops()) {
1524 SDValue In = U.get();
1525 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
1526 if (NewNumElt > 1)
1527 DAG.ExtractVectorElements(NewIn, Args);
1528 else
1529 Args.push_back(NewIn);
1530 }
1531
1532 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1533 NewNumElt * Op.getNumOperands());
1534 SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1535 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1536 }
1537 }
1538
1539 for (const SDUse &U : Op->ops())
1540 DAG.ExtractVectorElements(U.get(), Args);
1541
1542 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1543}
1544
1546 SelectionDAG &DAG) const {
1547 SDLoc SL(Op);
1549 unsigned Start = Op.getConstantOperandVal(1);
1550 EVT VT = Op.getValueType();
1551 EVT SrcVT = Op.getOperand(0).getValueType();
1552
1553 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1554 unsigned NumElt = VT.getVectorNumElements();
1555 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1556 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1557
1558 // Extract 32-bit registers at a time.
1559 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1560 EVT NewVT = NumElt == 2
1561 ? MVT::i32
1562 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1563 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1564
1565 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
1566 if (NumElt == 2)
1567 Tmp = Args[0];
1568 else
1569 Tmp = DAG.getBuildVector(NewVT, SL, Args);
1570
1571 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1572 }
1573
1574 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1576
1577 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1578}
1579
1580// TODO: Handle fabs too
1581static SDValue peekFNeg(SDValue Val) {
1582  if (Val.getOpcode() == ISD::FNEG)
1583 return Val.getOperand(0);
1584
1585 return Val;
1586}
1587
1589 if (Val.getOpcode() == ISD::FNEG)
1590 Val = Val.getOperand(0);
1591 if (Val.getOpcode() == ISD::FABS)
1592 Val = Val.getOperand(0);
1593 if (Val.getOpcode() == ISD::FCOPYSIGN)
1594 Val = Val.getOperand(0);
1595 return Val;
1596}
1597
1599 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1600 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1601 SelectionDAG &DAG = DCI.DAG;
1602 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1603 switch (CCOpcode) {
1604 case ISD::SETOEQ:
1605 case ISD::SETONE:
1606 case ISD::SETUNE:
1607 case ISD::SETNE:
1608 case ISD::SETUEQ:
1609 case ISD::SETEQ:
1610 case ISD::SETFALSE:
1611 case ISD::SETFALSE2:
1612 case ISD::SETTRUE:
1613 case ISD::SETTRUE2:
1614 case ISD::SETUO:
1615 case ISD::SETO:
1616 break;
1617 case ISD::SETULE:
1618 case ISD::SETULT: {
1619 if (LHS == True)
1620 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1621 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1622 }
1623 case ISD::SETOLE:
1624 case ISD::SETOLT:
1625 case ISD::SETLE:
1626 case ISD::SETLT: {
1627 // Ordered. Assume ordered for undefined.
1628
1629 // Only do this after legalization to avoid interfering with other combines
1630 // which might occur.
1632 !DCI.isCalledByLegalizer())
1633 return SDValue();
1634
1635 // We need to permute the operands to get the correct NaN behavior. The
1636 // selected operand is the second one based on the failing compare with NaN,
1637 // so permute it based on the compare type the hardware uses.
1638 if (LHS == True)
1639 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1640 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1641 }
1642 case ISD::SETUGE:
1643 case ISD::SETUGT: {
1644 if (LHS == True)
1645 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1646 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1647 }
1648 case ISD::SETGT:
1649 case ISD::SETGE:
1650 case ISD::SETOGE:
1651 case ISD::SETOGT: {
1653 !DCI.isCalledByLegalizer())
1654 return SDValue();
1655
1656 if (LHS == True)
1657 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1658 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1659 }
1660 case ISD::SETCC_INVALID:
1661 llvm_unreachable("Invalid setcc condcode!");
1662 }
1663 return SDValue();
1664}
1665
1666/// Generate Min/Max node
1668 SDValue LHS, SDValue RHS,
1669 SDValue True, SDValue False,
1670 SDValue CC,
1671 DAGCombinerInfo &DCI) const {
1672 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1673 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1674
1675 SelectionDAG &DAG = DCI.DAG;
1676
1677 // If we can't directly match this, try to see if we can fold an fneg to
1678 // match.
1679
1680 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
1681 ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False);
1682 SDValue NegTrue = peekFNeg(True);
1683
1684 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1685 // fmin/fmax.
1686 //
1687 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1688 // -> fneg (fmin_legacy lhs, K)
1689 //
1690 // TODO: Use getNegatedExpression
1691 if (LHS == NegTrue && CFalse && CRHS) {
1692 APFloat NegRHS = neg(CRHS->getValueAPF());
1693 if (NegRHS == CFalse->getValueAPF()) {
1694 SDValue Combined =
1695 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1696 if (Combined)
1697 return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1698 return SDValue();
1699 }
1700 }
1701
1702 return SDValue();
1703}
1704
1705std::pair<SDValue, SDValue>
1707 SDLoc SL(Op);
1708
1709 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1710
1711 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1712 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1713
1714 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1715 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1716
1717 return std::pair(Lo, Hi);
1718}
1719
1721 SDLoc SL(Op);
1722
1723 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1724 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1725 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1726}
1727
1729 SDLoc SL(Op);
1730
1731 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1732 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1733 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1734}
1735
1736// Split a vector type into two parts. The first part is a power of two vector.
1737// The second part is whatever is left over, and is a scalar if it would
1738// otherwise be a 1-vector.
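// For example, v3i32 splits into (v2i32, i32) and v7i32 splits into
// (v4i32, v3i32).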
1739std::pair<EVT, EVT>
1741 EVT LoVT, HiVT;
1742 EVT EltVT = VT.getVectorElementType();
1743 unsigned NumElts = VT.getVectorNumElements();
1744 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1745 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1746 HiVT = NumElts - LoNumElts == 1
1747 ? EltVT
1748 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1749 return std::pair(LoVT, HiVT);
1750}
1751
1752// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1753// scalar.
1754std::pair<SDValue, SDValue>
1756 const EVT &LoVT, const EVT &HiVT,
1757 SelectionDAG &DAG) const {
1759 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1760 N.getValueType().getVectorNumElements() &&
1761 "More vector elements requested than available!");
1763 DAG.getVectorIdxConstant(0, DL));
1764 SDValue Hi = DAG.getNode(
1766 HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
1767 return std::pair(Lo, Hi);
1768}
1769
1771 SelectionDAG &DAG) const {
1772 LoadSDNode *Load = cast<LoadSDNode>(Op);
1773 EVT VT = Op.getValueType();
1774 SDLoc SL(Op);
1775
1776
1777 // If this is a 2 element vector, we really want to scalarize and not create
1778 // weird 1 element vectors.
1779 if (VT.getVectorNumElements() == 2) {
1780 SDValue Ops[2];
1781 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1782 return DAG.getMergeValues(Ops, SL);
1783 }
1784
1785 SDValue BasePtr = Load->getBasePtr();
1786 EVT MemVT = Load->getMemoryVT();
1787
1788 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1789
1790 EVT LoVT, HiVT;
1791 EVT LoMemVT, HiMemVT;
1792 SDValue Lo, Hi;
1793
1794 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1795 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1796 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1797
1798 unsigned Size = LoMemVT.getStoreSize();
1799 Align BaseAlign = Load->getAlign();
1800 Align HiAlign = commonAlignment(BaseAlign, Size);
1801
1802 SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1803 Load->getChain(), BasePtr, SrcValue, LoMemVT,
1804 BaseAlign, Load->getMemOperand()->getFlags());
1805 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
1806 SDValue HiLoad =
1807 DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1808 HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1809 HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1810
1811 SDValue Join;
1812 if (LoVT == HiVT) {
1813 // This is the case that the vector is power of two so was evenly split.
1814 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1815 } else {
1816 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1817 DAG.getVectorIdxConstant(0, SL));
1818 Join = DAG.getNode(
1820 VT, Join, HiLoad,
1822 }
1823
1824 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1825 LoLoad.getValue(1), HiLoad.getValue(1))};
1826
1827 return DAG.getMergeValues(Ops, SL);
1828}
1829
1831 SelectionDAG &DAG) const {
1832 LoadSDNode *Load = cast<LoadSDNode>(Op);
1833 EVT VT = Op.getValueType();
1834 SDValue BasePtr = Load->getBasePtr();
1835 EVT MemVT = Load->getMemoryVT();
1836 SDLoc SL(Op);
1837 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1838 Align BaseAlign = Load->getAlign();
1839 unsigned NumElements = MemVT.getVectorNumElements();
1840
1841 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1842 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
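  // For example, an 8-byte-aligned v3i32 load is widened to a v4i32 load and
  // the extra element is dropped again with the EXTRACT_SUBVECTOR below.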
1843 if (NumElements != 3 ||
1844 (BaseAlign < Align(8) &&
1845 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1846 return SplitVectorLoad(Op, DAG);
1847
1848 assert(NumElements == 3);
1849
1850 EVT WideVT =
1852 EVT WideMemVT =
1854 SDValue WideLoad = DAG.getExtLoad(
1855 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1856 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1857 return DAG.getMergeValues(
1858 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1859 DAG.getVectorIdxConstant(0, SL)),
1860 WideLoad.getValue(1)},
1861 SL);
1862}
1863
1864SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1865                                               SelectionDAG &DAG) const {
1866 StoreSDNode *Store = cast<StoreSDNode>(Op);
1867 SDValue Val = Store->getValue();
1868 EVT VT = Val.getValueType();
1869
1870 // If this is a 2 element vector, we really want to scalarize and not create
1871 // weird 1 element vectors.
1872 if (VT.getVectorNumElements() == 2)
1873 return scalarizeVectorStore(Store, DAG);
1874
1875 EVT MemVT = Store->getMemoryVT();
1876 SDValue Chain = Store->getChain();
1877 SDValue BasePtr = Store->getBasePtr();
1878 SDLoc SL(Op);
1879
1880 EVT LoVT, HiVT;
1881 EVT LoMemVT, HiMemVT;
1882 SDValue Lo, Hi;
1883
1884 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1885 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1886 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1887
1888 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1889
1890 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1891 Align BaseAlign = Store->getAlign();
1892 unsigned Size = LoMemVT.getStoreSize();
1893 Align HiAlign = commonAlignment(BaseAlign, Size);
1894
1895 SDValue LoStore =
1896 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1897 Store->getMemOperand()->getFlags());
1898 SDValue HiStore =
1899 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1900 HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1901
1902 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1903}
1904
1905// This is a shortcut for integer division because we have fast i32<->f32
1906// conversions, and fast f32 reciprocal instructions. The fractional part of a
1907// float is enough to accurately represent up to a 24-bit signed integer.
1908SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1909                                            bool Sign) const {
1910 SDLoc DL(Op);
1911 EVT VT = Op.getValueType();
1912 SDValue LHS = Op.getOperand(0);
1913 SDValue RHS = Op.getOperand(1);
1914 MVT IntVT = MVT::i32;
1915 MVT FltVT = MVT::f32;
1916
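  // Requiring at least 9 sign bits in each 32-bit operand guarantees both
  // values fit in 24 bits, which the f32 fast path below can represent
  // exactly.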
1917 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1918 if (LHSSignBits < 9)
1919 return SDValue();
1920
1921 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1922 if (RHSSignBits < 9)
1923 return SDValue();
1924
1925 unsigned BitSize = VT.getSizeInBits();
1926 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1927 unsigned DivBits = BitSize - SignBits;
1928 if (Sign)
1929 ++DivBits;
1930
1931  ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1932  ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1933
1934 SDValue jq = DAG.getConstant(1, DL, IntVT);
1935
1936 if (Sign) {
1937 // char|short jq = ia ^ ib;
1938 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1939
1940 // jq = jq >> (bitsize - 2)
1941 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1942 DAG.getConstant(BitSize - 2, DL, VT));
1943
1944 // jq = jq | 0x1
1945 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1946 }
1947
1948 // int ia = (int)LHS;
1949 SDValue ia = LHS;
1950
1951  // int ib = (int)RHS;
1952 SDValue ib = RHS;
1953
1954 // float fa = (float)ia;
1955 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1956
1957 // float fb = (float)ib;
1958 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1959
1960 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1961 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1962
1963 // fq = trunc(fq);
1964 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1965
1966 // float fqneg = -fq;
1967 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1968
1969  MachineFunction &MF = DAG.getMachineFunction();
1970
1971  bool UseFmadFtz = false;
1972  if (Subtarget->isGCN()) {
1973    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1974    UseFmadFtz =
1975        MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
1976  }
1977
1978 // float fr = mad(fqneg, fb, fa);
1979  unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
1980                    : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
1981                                 : (unsigned)ISD::FMAD;
1982  SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1983
1984 // int iq = (int)fq;
1985 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1986
1987 // fr = fabs(fr);
1988 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1989
1990 // fb = fabs(fb);
1991 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1992
1993 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1994
1995 // int cv = fr >= fb;
1996 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1997
1998 // jq = (cv ? jq : 0);
1999 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
2000
2001 // dst = iq + jq;
2002 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
2003
2004 // Rem needs compensation, it's easier to recompute it
2005 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
2006 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
2007
2008 // Truncate to number of bits this divide really is.
2009 if (Sign) {
2010 SDValue InRegSize
2011 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
2012 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
2013 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
2014 } else {
2015 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
2016 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
2017 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
2018 }
2019
2020 return DAG.getMergeValues({ Div, Rem }, DL);
2021}
2022
2023void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
2024 SelectionDAG &DAG,
2025                                      SmallVectorImpl<SDValue> &Results) const {
2026  SDLoc DL(Op);
2027 EVT VT = Op.getValueType();
2028
2029 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2030
2031 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2032
2033 SDValue One = DAG.getConstant(1, DL, HalfVT);
2034 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
2035
2036 //HiLo split
2037 SDValue LHS_Lo, LHS_Hi;
2038 SDValue LHS = Op.getOperand(0);
2039 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
2040
2041 SDValue RHS_Lo, RHS_Hi;
2042 SDValue RHS = Op.getOperand(1);
2043 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2044
2045  if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2046      DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
2047
2048 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2049 LHS_Lo, RHS_Lo);
2050
2051 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2052 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2053
2054 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2055 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2056 return;
2057 }
2058
2059 if (isTypeLegal(MVT::i64)) {
2060 // The algorithm here is based on ideas from "Software Integer Division",
2061 // Tom Rodeheffer, August 2008.
2062
2063    MachineFunction &MF = DAG.getMachineFunction();
2064    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2065
2066    // Compute denominator reciprocal.
2067    unsigned FMAD =
2068        !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2069        : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
2070            ? (unsigned)ISD::FMAD
2071            : (unsigned)AMDGPUISD::FMAD_FTZ;
2072
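    // f32 bit patterns used for the reciprocal estimate: 0x4f800000 is 2.0^32,
    // 0x5f7ffffc is just below 2.0^64, 0x2f800000 is 2.0^-32 and 0xcf800000 is
    // -2.0^32.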
2073 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2074 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2075 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2076 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2077 Cvt_Lo);
2078 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2079 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2080 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2081 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2082 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2083 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2084 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2085 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2086 Mul1);
2087 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2088 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2089 SDValue Rcp64 = DAG.getBitcast(VT,
2090 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2091
2092 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2093 SDValue One64 = DAG.getConstant(1, DL, VT);
2094 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2095 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2096
2097 // First round of UNR (Unsigned integer Newton-Raphson).
2098 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2099 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2100 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2101 SDValue Mulhi1_Lo, Mulhi1_Hi;
2102 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2103 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2104 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2105 Mulhi1_Lo, Zero1);
2106 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2107 Mulhi1_Hi, Add1_Lo.getValue(1));
2108 SDValue Add1 = DAG.getBitcast(VT,
2109 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2110
2111 // Second round of UNR.
2112 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2113 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2114 SDValue Mulhi2_Lo, Mulhi2_Hi;
2115 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2116 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2117 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2118 Mulhi2_Lo, Zero1);
2119 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2120 Mulhi2_Hi, Add2_Lo.getValue(1));
2121 SDValue Add2 = DAG.getBitcast(VT,
2122 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2123
2124 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2125
2126 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2127
2128 SDValue Mul3_Lo, Mul3_Hi;
2129 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2130 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2131 Mul3_Lo, Zero1);
2132 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2133 Mul3_Hi, Sub1_Lo.getValue(1));
2134 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2135 SDValue Sub1 = DAG.getBitcast(VT,
2136 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2137
2138 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2139 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2140 ISD::SETUGE);
2141 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2142 ISD::SETUGE);
2143 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2144
2145 // TODO: Here and below portions of the code can be enclosed into if/endif.
2146 // Currently control flow is unconditional and we have 4 selects after
2147 // potential endif to substitute PHIs.
2148
2149 // if C3 != 0 ...
2150 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2151 RHS_Lo, Zero1);
2152 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2153 RHS_Hi, Sub1_Lo.getValue(1));
2154 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2155 Zero, Sub2_Lo.getValue(1));
2156 SDValue Sub2 = DAG.getBitcast(VT,
2157 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2158
2159 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2160
2161 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2162 ISD::SETUGE);
2163 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2164 ISD::SETUGE);
2165 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2166
2167 // if (C6 != 0)
2168 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2169
2170 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2171 RHS_Lo, Zero1);
2172 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2173 RHS_Hi, Sub2_Lo.getValue(1));
2174 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2175 Zero, Sub3_Lo.getValue(1));
2176 SDValue Sub3 = DAG.getBitcast(VT,
2177 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2178
2179 // endif C6
2180 // endif C3
2181
2182 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2183 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2184
2185 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2186 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2187
2188 Results.push_back(Div);
2189 Results.push_back(Rem);
2190
2191 return;
2192 }
2193
2194  // r600 expansion.
2195 // Get Speculative values
2196 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2197 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2198
2199 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2200 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2201 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2202
2203 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2204 SDValue DIV_Lo = Zero;
2205
2206 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2207
2208 for (unsigned i = 0; i < halfBitWidth; ++i) {
2209 const unsigned bitPos = halfBitWidth - i - 1;
2210 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2211 // Get value of high bit
2212 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2213 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2214 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2215
2216 // Shift
2217 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2218 // Add LHS high bit
2219 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2220
2221 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2222 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2223
2224 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2225
2226 // Update REM
2227 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2228 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2229 }
2230
2231 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2232 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2233 Results.push_back(DIV);
2234 Results.push_back(REM);
2235}
2236
2237SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2238                                            SelectionDAG &DAG) const {
2239 SDLoc DL(Op);
2240 EVT VT = Op.getValueType();
2241
2242  if (VT == MVT::i64) {
2243    SmallVector<SDValue, 2> Results;
2244    LowerUDIVREM64(Op, DAG, Results);
2245 return DAG.getMergeValues(Results, DL);
2246 }
2247
2248 if (VT == MVT::i32) {
2249 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2250 return Res;
2251 }
2252
2253 SDValue X = Op.getOperand(0);
2254 SDValue Y = Op.getOperand(1);
2255
2256 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2257 // algorithm used here.
2258
2259 // Initial estimate of inv(y).
2260 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2261
2262 // One round of UNR.
2263 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2264 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2265 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2266 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2267
2268 // Quotient/remainder estimate.
2269 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2270 SDValue R =
2271 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2272
2273 // First quotient/remainder refinement.
2274 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2275 SDValue One = DAG.getConstant(1, DL, VT);
2276 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2277 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2278 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2279 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2280 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2281
2282 // Second quotient/remainder refinement.
2283 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2284 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2285 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2286 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2287 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2288
2289 return DAG.getMergeValues({Q, R}, DL);
2290}
2291
2292SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2293                                           SelectionDAG &DAG) const {
2294 SDLoc DL(Op);
2295 EVT VT = Op.getValueType();
2296
2297 SDValue LHS = Op.getOperand(0);
2298 SDValue RHS = Op.getOperand(1);
2299
2300 SDValue Zero = DAG.getConstant(0, DL, VT);
2301 SDValue NegOne = DAG.getConstant(-1, DL, VT);
2302
2303 if (VT == MVT::i32) {
2304 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2305 return Res;
2306 }
2307
2308 if (VT == MVT::i64 &&
2309 DAG.ComputeNumSignBits(LHS) > 32 &&
2310 DAG.ComputeNumSignBits(RHS) > 32) {
2311 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2312
2313 //HiLo split
2314 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2315 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2316 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2317 LHS_Lo, RHS_Lo);
2318 SDValue Res[2] = {
2319 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2320 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2321 };
2322 return DAG.getMergeValues(Res, DL);
2323 }
2324
2325 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2326 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2327 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2328 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2329
2330 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2331 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2332
2333 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2334 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2335
2336 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2337 SDValue Rem = Div.getValue(1);
2338
2339 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2340 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2341
2342 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2343 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2344
2345 SDValue Res[2] = {
2346 Div,
2347 Rem
2348 };
2349 return DAG.getMergeValues(Res, DL);
2350}
2351
2352// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
2353SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2354  SDLoc SL(Op);
2355 EVT VT = Op.getValueType();
2356 auto Flags = Op->getFlags();
2357 SDValue X = Op.getOperand(0);
2358 SDValue Y = Op.getOperand(1);
2359
2360 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2361 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2362 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2363 // TODO: For f32 use FMAD instead if !hasFastFMA32?
2364 return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2365}
2366
2367SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2368  SDLoc SL(Op);
2369 SDValue Src = Op.getOperand(0);
2370
2371 // result = trunc(src)
2372 // if (src > 0.0 && src != result)
2373 // result += 1.0
2374
2375 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2376
2377 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2378 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2379
2380 EVT SetCCVT =
2381 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2382
2383 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2384 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2385 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2386
2387 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2388 // TODO: Should this propagate fast-math-flags?
2389 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2390}
2391
2392static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2393                                  SelectionDAG &DAG) {
2394 const unsigned FractBits = 52;
2395 const unsigned ExpBits = 11;
2396
2397 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2398 Hi,
2399 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2400 DAG.getConstant(ExpBits, SL, MVT::i32));
2401 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2402 DAG.getConstant(1023, SL, MVT::i32));
2403
2404 return Exp;
2405}
2406
2407SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2408  SDLoc SL(Op);
2409 SDValue Src = Op.getOperand(0);
2410
2411 assert(Op.getValueType() == MVT::f64);
2412
2413 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2414
2415 // Extract the upper half, since this is where we will find the sign and
2416 // exponent.
2417 SDValue Hi = getHiHalf64(Src, DAG);
2418
2419 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2420
2421 const unsigned FractBits = 52;
2422
2423 // Extract the sign bit.
2424 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2425 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2426
2427 // Extend back to 64-bits.
2428 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2429 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2430
2431 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2432 const SDValue FractMask
2433 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2434
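  // Arithmetically shifting the fraction mask right by the unbiased exponent
  // leaves set bits only in the fractional positions; clearing those bits in
  // the source truncates it toward zero.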
2435 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2436 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2437 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2438
2439 EVT SetCCVT =
2440 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2441
2442 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2443
2444 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2445 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2446
2447 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2448 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2449
2450 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2451}
2452
2453SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2454                                              SelectionDAG &DAG) const {
2455 SDLoc SL(Op);
2456 SDValue Src = Op.getOperand(0);
2457
2458 assert(Op.getValueType() == MVT::f64);
2459
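  // Adding and then subtracting 2^52 with the sign of the input forces the
  // value to be rounded to an integer in the current rounding mode, since f64
  // values of that magnitude have no fraction bits. Inputs whose magnitude
  // already exceeds 0x1.fffffffffffffp+51 are returned unchanged below.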
2460 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2461 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2462 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2463
2464 // TODO: Should this propagate fast-math-flags?
2465
2466 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2467 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2468
2469 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2470
2471 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2472 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2473
2474 EVT SetCCVT =
2475 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2476 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2477
2478 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2479}
2480
2481SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
2482                                              SelectionDAG &DAG) const {
2483 // FNEARBYINT and FRINT are the same, except in their handling of FP
2484 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2485 // rint, so just treat them as equivalent.
2486 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2487 Op.getOperand(0));
2488}
2489
2490SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2491  auto VT = Op.getValueType();
2492 auto Arg = Op.getOperand(0u);
2493 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2494}
2495
2496// XXX - May require not supporting f32 denormals?
2497
2498// Don't handle v2f16. The extra instructions to scalarize and repack around the
2499// compare and vselect end up producing worse code than scalarizing the whole
2500// operation.
2501SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2502  SDLoc SL(Op);
2503 SDValue X = Op.getOperand(0);
2504 EVT VT = Op.getValueType();
2505
2506 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2507
2508 // TODO: Should this propagate fast-math-flags?
2509
2510 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2511
2512 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2513
2514 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2515 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2516
2517 EVT SetCCVT =
2518 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2519
2520 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2521 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2522 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2523
2524 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2525 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2526}
2527
2528SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2529  SDLoc SL(Op);
2530 SDValue Src = Op.getOperand(0);
2531
2532 // result = trunc(src);
2533 // if (src < 0.0 && src != result)
2534 // result += -1.0.
2535
2536 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2537
2538 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2539 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2540
2541 EVT SetCCVT =
2542 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2543
2544 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2545 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2546 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2547
2548 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2549 // TODO: Should this propagate fast-math-flags?
2550 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2551}
2552
2553/// Return true if it's known that \p Src can never be an f32 denormal value.
2554static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2555  switch (Src.getOpcode()) {
2556 case ISD::FP_EXTEND:
2557 return Src.getOperand(0).getValueType() == MVT::f16;
2558 case ISD::FP16_TO_FP:
2559 case ISD::FFREXP:
2560    return true;
2561  case ISD::INTRINSIC_WO_CHAIN: {
2562    unsigned IntrinsicID = Src.getConstantOperandVal(0);
2563 switch (IntrinsicID) {
2564 case Intrinsic::amdgcn_frexp_mant:
2565 return true;
2566 default:
2567 return false;
2568 }
2569 }
2570 default:
2571 return false;
2572 }
2573
2574 llvm_unreachable("covered opcode switch");
2575}
2576
2577static bool allowApproxFunc(const SelectionDAG &DAG,
2578                             SDNodeFlags Flags) {
2579 if (Flags.hasApproximateFuncs())
2580 return true;
2581 auto &Options = DAG.getTarget().Options;
2582 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
2583}
2584
2585static bool needsDenormHandlingF32(const SelectionDAG &DAG,
2586                                       SDValue Src,
2587 SDNodeFlags Flags) {
2588 return !valueIsKnownNeverF32Denorm(Src) &&
2589 DAG.getMachineFunction()
2590             .getDenormalMode(APFloat::IEEEsingle())
2591             .Input != DenormalMode::PreserveSign;
2592}
2593
2594SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2595                                                      SDValue Src,
2596 SDNodeFlags Flags) const {
2597 SDLoc SL(Src);
2598  EVT VT = Src.getValueType();
2599  const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
2600  SDValue SmallestNormal =
2601 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2602
2603 // Want to scale denormals up, but negatives and 0 work just as well on the
2604 // scaled path.
2605 SDValue IsLtSmallestNormal = DAG.getSetCC(
2606 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2607 SmallestNormal, ISD::SETOLT);
2608
2609 return IsLtSmallestNormal;
2610}
2611
2612SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
2613                                          SDNodeFlags Flags) const {
2614 SDLoc SL(Src);
2615  EVT VT = Src.getValueType();
2616  const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
2617  SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2618
2619 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2620 SDValue IsFinite = DAG.getSetCC(
2621 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2622 Inf, ISD::SETOLT);
2623 return IsFinite;
2624}
2625
2626/// If denormal handling is required return the scaled input to FLOG2, and the
2627/// check for denormal range. Otherwise, return null values.
2628std::pair<SDValue, SDValue>
2629AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc SL,
2630                                        SDValue Src, SDNodeFlags Flags) const {
2631 if (!needsDenormHandlingF32(DAG, Src, Flags))
2632 return {};
2633
2634 MVT VT = MVT::f32;
2635 const fltSemantics &Semantics = APFloat::IEEEsingle();
2636 SDValue SmallestNormal =
2637 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2638
2639 SDValue IsLtSmallestNormal = DAG.getSetCC(
2640 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2641 SmallestNormal, ISD::SETOLT);
2642
2643 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2644 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2645 SDValue ScaleFactor =
2646 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2647
2648 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2649 return {ScaledInput, IsLtSmallestNormal};
2650}
2651
2652SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
2653  // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2654 // If we have to handle denormals, scale up the input and adjust the result.
2655
2656 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2657 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
2658
2659 SDLoc SL(Op);
2660 EVT VT = Op.getValueType();
2661 SDValue Src = Op.getOperand(0);
2662 SDNodeFlags Flags = Op->getFlags();
2663
2664 if (VT == MVT::f16) {
2665 // Nothing in half is a denormal when promoted to f32.
2666 assert(!Subtarget->has16BitInsts());
2667 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2668 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2669 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2670 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2671 }
2672
2673 auto [ScaledInput, IsLtSmallestNormal] =
2674 getScaledLogInput(DAG, SL, Src, Flags);
2675 if (!ScaledInput)
2676 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2677
2678 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2679
2680 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2681 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2682 SDValue ResultOffset =
2683 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2684 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2685}
2686
2687static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2688 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2689 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2690 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2691}
2692
2693SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
2694                                              SelectionDAG &DAG) const {
2695 SDValue X = Op.getOperand(0);
2696 EVT VT = Op.getValueType();
2697 SDNodeFlags Flags = Op->getFlags();
2698 SDLoc DL(Op);
2699
2700 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2701 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2702
2703 const auto &Options = getTargetMachine().Options;
2704 if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
2705 Options.ApproxFuncFPMath || Options.UnsafeFPMath) {
2706
2707 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2708 // Log and multiply in f32 is good enough for f16.
2709 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2710 }
2711
2712 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2713 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2714 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2715 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2716 }
2717
2718 return Lowered;
2719 }
2720
2721 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
2722 if (ScaledInput)
2723 X = ScaledInput;
2724
2725 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2726
2727 SDValue R;
2728 if (Subtarget->hasFastFMAF32()) {
2729 // c+cc are ln(2)/ln(10) to more than 49 bits
2730 const float c_log10 = 0x1.344134p-2f;
2731 const float cc_log10 = 0x1.09f79ep-26f;
2732
2733 // c + cc is ln(2) to more than 49 bits
2734 const float c_log = 0x1.62e42ep-1f;
2735 const float cc_log = 0x1.efa39ep-25f;
2736
2737 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2738 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2739
2740 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2741 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2742 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2743 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2744 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2745 } else {
2746 // ch+ct is ln(2)/ln(10) to more than 36 bits
2747 const float ch_log10 = 0x1.344000p-2f;
2748 const float ct_log10 = 0x1.3509f6p-18f;
2749
2750 // ch + ct is ln(2) to more than 36 bits
2751 const float ch_log = 0x1.62e000p-1f;
2752 const float ct_log = 0x1.0bfbe8p-15f;
2753
2754 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2755 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2756
2757 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2758 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2759 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2760 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2761 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2762
2763 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2764 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2765 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
2766 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2767 }
2768
2769 const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
2770 (Flags.hasNoInfs() || Options.NoInfsFPMath);
2771
2772 // TODO: Check if known finite from source value.
2773 if (!IsFiniteOnly) {
2774 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2775 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2776 }
2777
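  // On the scaled path the input was multiplied by 2^32, so subtract
  // 32*log10(2) (about 0x1.344136p+3) or 32*ln(2) (about 0x1.62e430p+4) from
  // the result.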
2778 if (IsScaled) {
2779 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2780 SDValue ShiftK =
2781 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2782 SDValue Shift =
2783 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2784 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2785 }
2786
2787 return R;
2788}
2789
2790SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const {
2791  return LowerFLOGCommon(Op, DAG);
2792}
2793
2794// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a
2795// promoted f16 operation.
2796SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
2797                                              SelectionDAG &DAG, bool IsLog10,
2798 SDNodeFlags Flags) const {
2799 EVT VT = Src.getValueType();
2800 unsigned LogOp =
2801 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2802
2803  double Log2BaseInverted =
2804      IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
2805
2806 if (VT == MVT::f32) {
2807 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2808 if (ScaledInput) {
2809 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2810 SDValue ScaledResultOffset =
2811 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2812
2813 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2814
2815 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2816 ScaledResultOffset, Zero, Flags);
2817
2818 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2819
2820 if (Subtarget->hasFastFMAF32())
2821 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2822 Flags);
2823 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
2824 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2825 }
2826 }
2827
2828 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2829 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2830
2831 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2832 Flags);
2833}
2834
2835SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
2836  // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2837 // If we have to handle denormals, scale up the input and adjust the result.
2838
2839 SDLoc SL(Op);
2840 EVT VT = Op.getValueType();
2841 SDValue Src = Op.getOperand(0);
2842 SDNodeFlags Flags = Op->getFlags();
2843
2844 if (VT == MVT::f16) {
2845 // Nothing in half is a denormal when promoted to f32.
2846 assert(!Subtarget->has16BitInsts());
2847 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2848 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
2849 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2850 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2851 }
2852
2853 assert(VT == MVT::f32);
2854
2855 if (!needsDenormHandlingF32(DAG, Src, Flags))
2856 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
2857
2858 // bool needs_scaling = x < -0x1.f80000p+6f;
2859 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
2860
2861 // -nextafter(128.0, -1)
2862 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
2863
2864 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2865
2866 SDValue NeedsScaling =
2867 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
2868
2869 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2870 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2871
2872 SDValue AddOffset =
2873 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
2874
2875 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
2876 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
2877
2878 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
2879 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2880 SDValue ResultScale =
2881 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
2882
2883 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
2884}
2885
2886SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
2887                                              SelectionDAG &DAG,
2888 SDNodeFlags Flags) const {
2889 EVT VT = X.getValueType();
2890 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
2891
2892 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2893 // exp2(M_LOG2E_F * f);
2894 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags);
2895 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2896 : (unsigned)ISD::FEXP2,
2897 SL, VT, Mul, Flags);
2898 }
2899
2900 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2901
2902 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
2903 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2904
2905 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2906
2907 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2908
2909 SDValue AdjustedX =
2910 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2911
2912 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
2913
2914 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
2915
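  // 0x1.969d48p-93f is approximately e^-64, undoing the +64 offset added to
  // scaled inputs above.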
2916 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
2917 SDValue AdjustedResult =
2918 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
2919
2920 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
2921 Flags);
2922}
2923
2924/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
2925/// handled correctly.
2926SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
2927                                                SelectionDAG &DAG,
2928 SDNodeFlags Flags) const {
2929 const EVT VT = X.getValueType();
2930 const unsigned Exp2Op = VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2;
2931
2932 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2933 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
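    // 0x1.a92000p+1f + 0x1.4f0978p-11f is log2(10) split into high and low
    // parts; taking exp2 of each product separately preserves the low-order
    // bits.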
2934 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
2935 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
2936
2937 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
2938 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
2939 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
2940 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
2941 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
2942 }
2943
2944 // bool s = x < -0x1.2f7030p+5f;
2945 // x += s ? 0x1.0p+5f : 0.0f;
2946 // exp10 = exp2(x * 0x1.a92000p+1f) *
2947 // exp2(x * 0x1.4f0978p-11f) *
2948 // (s ? 0x1.9f623ep-107f : 1.0f);
2949
2950 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2951
2952 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
2953 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2954
2955 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
2956 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2957 SDValue AdjustedX =
2958 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2959
2960 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
2961 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
2962
2963 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
2964 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
2965 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
2966 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
2967
2968 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
2969
2970 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
2971 SDValue AdjustedResult =
2972 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
2973
2974 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
2975 Flags);
2976}
2977
2978SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2979  EVT VT = Op.getValueType();
2980 SDLoc SL(Op);
2981 SDValue X = Op.getOperand(0);
2982 SDNodeFlags Flags = Op->getFlags();
2983 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
2984
2985 if (VT.getScalarType() == MVT::f16) {
2986 // v_exp_f16 (fmul x, log2e)
2987 if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
2988 return lowerFEXPUnsafe(X, SL, DAG, Flags);
2989
2990 if (VT.isVector())
2991 return SDValue();
2992
2993 // exp(f16 x) ->
2994 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
2995
2996 // Nothing in half is a denormal when promoted to f32.
2997 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
2998 SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags);
2999 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
3000 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
3001 }
3002
3003 assert(VT == MVT::f32);
3004
3005 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3006 // library behavior. Also, is known-not-daz source sufficient?
3007 if (allowApproxFunc(DAG, Flags)) {
3008 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
3009 : lowerFEXPUnsafe(X, SL, DAG, Flags);
3010 }
3011
3012 // Algorithm:
3013 //
3014 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3015 //
3016 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3017 // n = 64*m + j, 0 <= j < 64
3018 //
3019 // e^x = 2^((64*m + j + f)/64)
3020 // = (2^m) * (2^(j/64)) * 2^(f/64)
3021 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3022 //
3023 // f = x*(64/ln(2)) - n
3024 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3025 //
3026 // e^x = (2^m) * (2^(j/64)) * e^r
3027 //
3028 // (2^(j/64)) is precomputed
3029 //
3030 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3031 // e^r = 1 + q
3032 //
3033 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3034 //
3035 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3036 SDNodeFlags FlagsNoContract = Flags;
3037 FlagsNoContract.setAllowContract(false);
3038
3039 SDValue PH, PL;
3040 if (Subtarget->hasFastFMAF32()) {
3041 const float c_exp = numbers::log2ef;
3042 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3043 const float c_exp10 = 0x1.a934f0p+1f;
3044 const float cc_exp10 = 0x1.2f346ep-24f;
3045
3046 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3047 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3048
3049 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3050 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
3051 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3052 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3053 } else {
3054 const float ch_exp = 0x1.714000p+0f;
3055 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3056
3057 const float ch_exp10 = 0x1.a92000p+1f;
3058 const float cl_exp10 = 0x1.4f0978p-11f;
3059
3060 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3061 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3062
3063 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3064 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3065 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3066 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3067 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3068
3069 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3070
3071 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3072 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3073 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3074 }
3075
3076 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3077
3078 // It is unsafe to contract this fsub into the PH multiply.
3079 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3080
3081 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3082 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3083 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3084
3085 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3086
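  // The underflow thresholds are approximately ln(2^-149) = -0x1.9d1da0p+6 and
  // log10(2^-149) = -0x1.66d3e8p+5, i.e. the log of the smallest f32 denormal;
  // below the matching threshold the result flushes to zero.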
3087 SDValue UnderflowCheckConst =
3088 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
3089
3090 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3091 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3092 SDValue Underflow =
3093 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3094
3095 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3096 const auto &Options = getTargetMachine().Options;
3097
3098 if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
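    // The overflow thresholds are approximately ln(FLT_MAX) = 0x1.62e430p+6
    // and log10(FLT_MAX) = 0x1.344136p+5; above them the result is +infinity.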
3099 SDValue OverflowCheckConst =
3100 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
3101 SDValue Overflow =
3102 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
3103    SDValue Inf =
3104        DAG.getConstantFP(APFloat::getInf(APFloat::IEEEsingle()), SL, VT);
3105    R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3106 }
3107
3108 return R;
3109}
3110
3111static bool isCtlzOpc(unsigned Opc) {
3112 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3113}
3114
3115static bool isCttzOpc(unsigned Opc) {
3116 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3117}
3118
3119SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
3120                                               SelectionDAG &DAG) const {
3121 auto SL = SDLoc(Op);
3122 auto Opc = Op.getOpcode();
3123 auto Arg = Op.getOperand(0u);
3124 auto ResultVT = Op.getValueType();
3125
3126 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3127 return {};
3128
3129 assert(isCtlzOpc(Opc));
3130 assert(ResultVT == Arg.getValueType());
3131
3132 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3133 SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
3134 SDValue NewOp;
3135
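  // For ctlz_zero_undef, shift the narrow value into the top bits of an i32
  // and count there; for plain ctlz, count on the zero-extended value and
  // subtract the extra leading zeros introduced by the extension.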
3136 if (Opc == ISD::CTLZ_ZERO_UNDEF) {
3137 NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
3138 NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
3139 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3140 } else {
3141 NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3142 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3143 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
3144 }
3145
3146 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3147}
3148
3149SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
3150  SDLoc SL(Op);
3151 SDValue Src = Op.getOperand(0);
3152
3153 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3154 bool Ctlz = isCtlzOpc(Op.getOpcode());
3155 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3156
3157 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3158 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3159 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3160
3161 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3162 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3163 // (cttz hi:lo) -> (umin (ffbl src), 32)
3164 // (ctlz_zero_undef src) -> (ffbh src)
3165 // (cttz_zero_undef src) -> (ffbl src)
3166
3167    // The 64-bit scalar version produces a 32-bit result
3168 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3169 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3170 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3171 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3172 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3173 if (!ZeroUndef) {
3174 const SDValue ConstVal = DAG.getConstant(
3175 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3176 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3177 }
3178 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3179 }
3180
3181 SDValue Lo, Hi;
3182 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3183
3184 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3185 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3186
3187 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3188 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3189 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3190 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
3191
3192 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3193 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3194 if (Ctlz)
3195 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3196 else
3197 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3198
3199 SDValue NewOpr;
3200 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3201 if (!ZeroUndef) {
3202 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3203 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3204 }
3205
3206 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3207}
3208
3209SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
3210                                               bool Signed) const {
3211  // The regular method of converting a 64-bit integer to float roughly consists of
3212 // 2 steps: normalization and rounding. In fact, after normalization, the
3213 // conversion from a 64-bit integer to a float is essentially the same as the
3214 // one from a 32-bit integer. The only difference is that it has more
3215 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3216 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3217 // converted into the correct float number. The basic steps for the unsigned
3218 // conversion are illustrated in the following pseudo code:
3219 //
3220 // f32 uitofp(i64 u) {
3221 // i32 hi, lo = split(u);
3222 // // Only count the leading zeros in hi as we have native support of the
3223 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3224 // // reduced to a 32-bit one automatically.
3225 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3226 // u <<= shamt;
3227 // hi, lo = split(u);
3228 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3229 // // convert it as a 32-bit integer and scale the result back.
3230 // return uitofp(hi) * 2^(32 - shamt);
3231 // }
3232 //
3233 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3234 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
3235  // converted instead, followed by negation based on its sign bit.
3236
3237 SDLoc SL(Op);
3238 SDValue Src = Op.getOperand(0);
3239
3240 SDValue Lo, Hi;
3241 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3242 SDValue Sign;
3243 SDValue ShAmt;
3244 if (Signed && Subtarget->isGCN()) {
3245 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3246 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3247 // account. That is, the maximal shift is
3248 // - 32 if Lo and Hi have opposite signs;
3249 // - 33 if Lo and Hi have the same sign.
3250 //
3251 // Or, MaxShAmt = 33 + OppositeSign, where
3252 //
3253 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3254 // - -1 if Lo and Hi have opposite signs; and
3255 // - 0 otherwise.
3256 //
3257 // All in all, ShAmt is calculated as
3258 //
3259 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3260 //
3261 // or
3262 //
3263 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3264 //
3265 // to reduce the critical path.
3266 SDValue OppositeSign = DAG.getNode(
3267 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3268 DAG.getConstant(31, SL, MVT::i32));
3269 SDValue MaxShAmt =
3270 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3271 OppositeSign);
3272 // Count the leading sign bits.
3273 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3274 // Different from unsigned conversion, the shift should be one bit less to
3275 // preserve the sign bit.
3276 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3277 DAG.getConstant(1, SL, MVT::i32));
3278 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3279 } else {
3280 if (Signed) {
3281 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3282 // absolute value first.
3283 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3284 DAG.getConstant(63, SL, MVT::i64));
3285 SDValue Abs =
3286 DAG.getNode(ISD::XOR, SL, MVT::i64,
3287 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3288 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3289 }
3290 // Count the leading zeros.
3291 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3292 // The shift amount for signed integers is [0, 32].
3293 }
3294 // Normalize the given 64-bit integer.
3295 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3296 // Split it again.
3297 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3298 // Calculate the adjust bit for rounding.
3299 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3300 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3301 DAG.getConstant(1, SL, MVT::i32), Lo);
3302 // Get the 32-bit normalized integer.
3303 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3304 // Convert the normalized 32-bit integer into f32.
3305 unsigned Opc =
3306 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3307 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3308
3309 // Finally, need to scale back the converted floating number as the original
3310 // 64-bit integer is converted as a 32-bit one.
3311 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3312 ShAmt);
3313 // On GCN, use LDEXP directly.
3314 if (Subtarget->isGCN())
3315 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3316
3317 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3318 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3319 // exponent is enough to avoid overflowing into the sign bit.
3320 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3321 DAG.getConstant(23, SL, MVT::i32));
3322 SDValue IVal =
3323 DAG.getNode(ISD::ADD, SL, MVT::i32,
3324 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3325 if (Signed) {
3326 // Set the sign bit.
3327 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3328 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3329 DAG.getConstant(31, SL, MVT::i32));
3330 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3331 }
3332 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3333}
3334
3335SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3336                                               bool Signed) const {
3337 SDLoc SL(Op);
3338 SDValue Src = Op.getOperand(0);
3339
3340 SDValue Lo, Hi;
3341 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3342
3343  SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3344                              SL, MVT::f64, Hi);
3345
3346 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3347
3348 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3349 DAG.getConstant(32, SL, MVT::i32));
3350 // TODO: Should this propagate fast-math-flags?
3351 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3352}
3353
3354SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3355                                               SelectionDAG &DAG) const {
3356 // TODO: Factor out code common with LowerSINT_TO_FP.
3357 EVT DestVT = Op.getValueType();
3358 SDValue Src = Op.getOperand(0);
3359 EVT SrcVT = Src.getValueType();
3360
3361 if (SrcVT == MVT::i16) {
3362 if (DestVT == MVT::f16)
3363 return Op;
3364 SDLoc DL(Op);
3365
3366 // Promote src to i32
3367 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3368 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3369 }
3370
3371 if (DestVT == MVT::bf16) {
3372 SDLoc SL(Op);
3373 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3374 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3375 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3376 }
3377
3378 if (SrcVT != MVT::i64)
3379 return Op;
3380
3381 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3382 SDLoc DL(Op);
3383
3384 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3385 SDValue FPRoundFlag =
3386 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3387 SDValue FPRound =
3388 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3389
3390 return FPRound;
3391 }
3392
3393 if (DestVT == MVT::f32)
3394 return LowerINT_TO_FP32(Op, DAG, false);
3395
3396 assert(DestVT == MVT::f64);
3397 return LowerINT_TO_FP64(Op, DAG, false);
3398}
3399
3400SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3401                                               SelectionDAG &DAG) const {
3402 EVT DestVT = Op.getValueType();
3403
3404 SDValue Src = Op.getOperand(0);
3405 EVT SrcVT = Src.getValueType();
3406
3407 if (SrcVT == MVT::i16) {
3408 if (DestVT == MVT::f16)
3409 return Op;
3410
3411 SDLoc DL(Op);
3412 // Promote src to i32
3413 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3414 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3415 }
3416
3417 if (DestVT == MVT::bf16) {
3418 SDLoc SL(Op);
3419 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3420 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3421 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3422 }
3423
3424 if (SrcVT != MVT::i64)
3425 return Op;
3426
3427 // TODO: Factor out code common with LowerUINT_TO_FP.
3428
3429 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3430 SDLoc DL(Op);
3431 SDValue Src = Op.getOperand(0);
3432
3433 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3434 SDValue FPRoundFlag =
3435 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3436 SDValue FPRound =
3437 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3438
3439 return FPRound;
3440 }
3441
3442 if (DestVT == MVT::f32)
3443 return LowerINT_TO_FP32(Op, DAG, true);
3444
3445 assert(DestVT == MVT::f64);
3446 return LowerINT_TO_FP64(Op, DAG, true);
3447}
3448
3449 SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3450 bool Signed) const {
3451 SDLoc SL(Op);
3452
3453 SDValue Src = Op.getOperand(0);
3454 EVT SrcVT = Src.getValueType();
3455
3456 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3457
3458 // The basic idea of converting a floating point number into a pair of 32-bit
3459 // integers is illustrated as follows:
3460 //
3461 // tf := trunc(val);
3462 // hif := floor(tf * 2^-32);
3463 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3464 // hi := fptoi(hif);
3465 // lo := fptoi(lof);
3466 //
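// For instance (illustrative value only), with val = 3 * 2^32 + 7:
// tf = val, hif = floor(tf * 2^-32) = 3, lof = tf - 3 * 2^32 = 7,
// so hi = 3 and lo = 7 recombine to the original 64-bit value.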
3467 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3468 SDValue Sign;
3469 if (Signed && SrcVT == MVT::f32) {
3470 // However, a 32-bit floating point number has only a 23-bit mantissa,
3471 // which is not enough to hold all the significant bits of `lof` if val is
3472 // negative. To avoid the loss of precision, we need to take the absolute
3473 // value after truncating and flip the result back based on the original
3474 // signedness.
3475 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3476 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3477 DAG.getConstant(31, SL, MVT::i32));
3478 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3479 }
3480
3481 SDValue K0, K1;
3482 if (SrcVT == MVT::f64) {
3483 K0 = DAG.getConstantFP(
3484 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3485 SrcVT);
3486 K1 = DAG.getConstantFP(
3487 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3488 SrcVT);
3489 } else {
3490 K0 = DAG.getConstantFP(
3491 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3492 K1 = DAG.getConstantFP(
3493 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3494 }
3495 // TODO: Should this propagate fast-math-flags?
3496 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3497
3498 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3499
3500 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3501
3502 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3503 : ISD::FP_TO_UINT,
3504 SL, MVT::i32, FloorMul);
3505 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3506
3507 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3508 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3509
3510 if (Signed && SrcVT == MVT::f32) {
3511 assert(Sign);
3512 // Flip the result based on the signedness, which is either all 0s or 1s.
3513 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3514 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3515 // r := xor(r, sign) - sign;
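// When Sign is all 1s this computes (~r) + 1, i.e. the two's complement
// negation of r; when Sign is all 0s it leaves r unchanged.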
3516 Result =
3517 DAG.getNode(ISD::SUB, SL, MVT::i64,
3518 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3519 }
3520
3521 return Result;
3522}
3523
3524 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3525 SDLoc DL(Op);
3526 SDValue N0 = Op.getOperand(0);
3527
3528 // Convert to target node to get known bits
3529 if (N0.getValueType() == MVT::f32)
3530 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3531
3532 if (getTargetMachine().Options.UnsafeFPMath) {
3533 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3534 return SDValue();
3535 }
3536
3537 assert(N0.getSimpleValueType() == MVT::f64);
3538
3539 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3540 const unsigned ExpMask = 0x7ff;
3541 const unsigned ExpBiasf64 = 1023;
3542 const unsigned ExpBiasf16 = 15;
3543 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3544 SDValue One = DAG.getConstant(1, DL, MVT::i32);
3545 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
3546 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3547 DAG.getConstant(32, DL, MVT::i64));
3548 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3549 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3550 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3551 DAG.getConstant(20, DL, MVT::i64));
3552 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3553 DAG.getConstant(ExpMask, DL, MVT::i32));
3554 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3555 // add the f16 bias (15) to get the biased exponent for the f16 format.
3556 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3557 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
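// For example (illustrative value only): 1.0 has the f64 biased exponent
// 1023, so E becomes 1023 - 1023 + 15 = 15, the f16 biased exponent of 1.0.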
3558
3559 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3560 DAG.getConstant(8, DL, MVT::i32));
3561 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3562 DAG.getConstant(0xffe, DL, MVT::i32));
3563
3564 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3565 DAG.getConstant(0x1ff, DL, MVT::i32));
3566 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3567
3568 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3569 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3570
3571 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3572 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3573 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3574 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
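// 0x7c00 is the all-ones f16 exponent field (Inf/NaN); OR-ing in 0x0200 sets
// the top mantissa bit so that a nonzero f64 mantissa maps to a quiet NaN
// rather than to infinity.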
3575
3576 // N = M | (E << 12);
3577 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3578 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3579 DAG.getConstant(12, DL, MVT::i32)));
3580
3581 // B = clamp(1-E, 0, 13);
3582 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3583 One, E);
3584 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3585 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3586 DAG.getConstant(13, DL, MVT::i32));
3587
3588 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3589 DAG.getConstant(0x1000, DL, MVT::i32));
3590
3591 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3592 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3593 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3594 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3595
3596 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3597 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3598 DAG.getConstant(0x7, DL, MVT::i32));
3599 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3600 DAG.getConstant(2, DL, MVT::i32));
3601 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3602 One, Zero, ISD::SETEQ);
3603 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3604 One, Zero, ISD::SETGT);
3605 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3606 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3607
3608 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3609 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3610 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3611 I, V, ISD::SETEQ);
3612
3613 // Extract the sign bit.
3614 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3615 DAG.getConstant(16, DL, MVT::i32));
3616 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3617 DAG.getConstant(0x8000, DL, MVT::i32));
3618
3619 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3620 return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
3621}
3622
3623 SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
3624 SelectionDAG &DAG) const {
3625 SDValue Src = Op.getOperand(0);
3626 unsigned OpOpcode = Op.getOpcode();
3627 EVT SrcVT = Src.getValueType();
3628 EVT DestVT = Op.getValueType();
3629
3630 // Will be selected natively
3631 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3632 return Op;
3633
3634 if (SrcVT == MVT::bf16) {
3635 SDLoc DL(Op);
3636 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3637 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
3638 }
3639
3640 // Promote i16 to i32
3641 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3642 SDLoc DL(Op);
3643
3644 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3645 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3646 }
3647
3648 if (DestVT != MVT::i64)
3649 return Op;
3650
3651 if (SrcVT == MVT::f16 ||
3652 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3653 SDLoc DL(Op);
3654
3655 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3656 unsigned Ext =
3657 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3658 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3659 }
3660
3661 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3662 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3663
3664 return SDValue();
3665}
3666
3667 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3668 SelectionDAG &DAG) const {
3669 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3670 MVT VT = Op.getSimpleValueType();
3671 MVT ScalarVT = VT.getScalarType();
3672
3673 assert(VT.isVector());
3674
3675 SDValue Src = Op.getOperand(0);
3676 SDLoc DL(Op);
3677
3678 // TODO: Don't scalarize on Evergreen?
3679 unsigned NElts = VT.getVectorNumElements();
3680 SmallVector<SDValue, 8> Args;
3681 DAG.ExtractVectorElements(Src, Args, 0, NElts);
3682
3683 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3684 for (unsigned I = 0; I < NElts; ++I)
3685 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
3686
3687 return DAG.getBuildVector(VT, DL, Args);
3688}
3689
3690//===----------------------------------------------------------------------===//
3691// Custom DAG optimizations
3692//===----------------------------------------------------------------------===//
3693
3694static bool isU24(SDValue Op, SelectionDAG &DAG) {
3695 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3696}
3697
3698static bool isI24(SDValue Op, SelectionDAG &DAG) {
3699 EVT VT = Op.getValueType();
3700 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3701 // as unsigned 24-bit values.
3702 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3703 }
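// For example, a 32-bit value masked with (and x, 0xffff) has its upper 16
// bits known zero, so both isU24 and isI24 hold and a multiply of two such
// values can be selected as a 24-bit multiply.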
3704
3705 static SDValue simplifyMul24(SDNode *Node24,
3706 TargetLowering::DAGCombinerInfo &DCI) {
3707 SelectionDAG &DAG = DCI.DAG;
3708 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3709 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3710
3711 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
3712 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
3713 unsigned NewOpcode = Node24->getOpcode();
3714 if (IsIntrin) {
3715 unsigned IID = Node24->getConstantOperandVal(0);
3716 switch (IID) {
3717 case Intrinsic::amdgcn_mul_i24:
3718 NewOpcode = AMDGPUISD::MUL_I24;
3719 break;
3720 case Intrinsic::amdgcn_mul_u24:
3721 NewOpcode = AMDGPUISD::MUL_U24;
3722 break;
3723 case Intrinsic::amdgcn_mulhi_i24:
3724 NewOpcode = AMDGPUISD::MULHI_I24;
3725 break;
3726 case Intrinsic::amdgcn_mulhi_u24:
3727 NewOpcode = AMDGPUISD::MULHI_U24;
3728 break;
3729 default:
3730 llvm_unreachable("Expected 24-bit mul intrinsic");
3731 }
3732 }
3733
3734 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
3735
3736 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3737 // the operands to have other uses, but will only perform simplifications that
3738 // involve bypassing some nodes for this user.
3739 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
3740 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
3741 if (DemandedLHS || DemandedRHS)
3742 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
3743 DemandedLHS ? DemandedLHS : LHS,
3744 DemandedRHS ? DemandedRHS : RHS);
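// For example, if LHS is (and x, 0xffffff), SimplifyMultipleUseDemandedBits
// can return x directly for this user, even while the AND keeps its other
// uses, because only the low 24 bits are demanded here.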
3745
3746 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3747 // operands if this node is the only user.
3748 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
3749 return SDValue(Node24, 0);
3750 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
3751 return SDValue(Node24, 0);
3752
3753 return SDValue();
3754}
3755
3756template <typename IntTy>
3757 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3758 uint32_t Width, const SDLoc &DL) {
3759 if (Width + Offset < 32) {
3760 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3761 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3762 return DAG.getConstant(Result, DL, MVT::i32);
3763 }
3764
3765 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3766}
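// For example (illustrative values only): Offset = 8, Width = 8 and
// Src0 = 0x00345678 give Shl = Src0 << 16 = 0x56780000, and the final shift
// right by 24 yields 0x56, i.e. bits [15:8] of Src0.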
3767
3768static bool hasVolatileUser(SDNode *Val) {
3769 for (SDNode *U : Val->uses()) {
3770 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
3771 if (M->isVolatile())
3772 return true;
3773 }
3774 }
3775
3776 return false;
3777}
3778
3779 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3780 // i32 vectors are the canonical memory type.
3781 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3782 return false;
3783
3784 if (!VT.isByteSized())
3785 return false;
3786
3787 unsigned Size = VT.getStoreSize();
3788
3789 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3790 return false;
3791
3792 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3793 return false;
3794
3795 return true;
3796}
3797
3798 // Replace a load of an illegal type with a load of an equivalent friendlier
3799 // type, bitcast back to the original type.
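// For example, an illegal v8i8 load can be rewritten as a v2i32 load whose
// result is bitcast back to v8i8 (see getEquivalentMemType above).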
3800 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3801 DAGCombinerInfo &DCI) const {
3802 if (!DCI.isBeforeLegalize())
3803 return SDValue();
3804
3805 LoadSDNode *LN = cast<LoadSDNode>(N);
3806 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
3807 return SDValue();
3808
3809 SDLoc SL(N);
3810 SelectionDAG &DAG = DCI.DAG;
3811 EVT VT = LN->getMemoryVT();
3812
3813 unsigned Size = VT.getStoreSize();
3814 Align Alignment = LN->getAlign();
3815 if (Alignment < Size && isTypeLegal(VT)) {
3816 unsigned IsFast;
3817 unsigned AS = LN->getAddressSpace();
3818
3819 // Expand unaligned loads earlier than legalization. Due to visitation order
3820 // problems during legalization, the emitted instructions to pack and unpack
3821 // the bytes again are not eliminated in the case of an unaligned copy.
3822 if (!allowsMisalignedMemoryAccesses(
3823 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3824 if (VT.isVector())
3825 return SplitVectorLoad(SDValue(LN, 0), DAG);
3826
3827 SDValue Ops[2];
3828 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3829
3830 return DAG.getMergeValues(Ops, SDLoc(N));
3831 }
3832
3833 if (!IsFast)
3834 return SDValue();
3835 }
3836
3837 if (!shouldCombineMemoryType(VT))
3838 return SDValue();
3839
3840 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3841
3842 SDValue NewLoad
3843 = DAG.getLoad(NewVT, SL, LN->getChain(),
3844 LN->getBasePtr(), LN->getMemOperand());
3845
3846 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3847 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3848 return SDValue(N, 0);
3849}
3850
3851// Replace store of an illegal type with a store of a bitcast to a friendlier
3852// type.
3853 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3854 DAGCombinerInfo &DCI) const {
3855 if (!DCI.isBeforeLegalize())
3856 return SDValue();
3857
3858 StoreSDNode *SN = cast<StoreSDNode>(N);
3859 if (!SN->isSimple() || !ISD::isNormalStore(SN))
3860 return SDValue();
3861
3862 EVT VT = SN->getMemoryVT();
3863 unsigned Size = VT.getStoreSize();
3864
3865 SDLoc SL(N);
3866 SelectionDAG &DAG = DCI.DAG;
3867 Align Alignment = SN->getAlign();