LLVM 20.0.0git
AMDGPUISelLowering.cpp
1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
24#include "llvm/IR/IntrinsicsAMDGPU.h"
29
30using namespace llvm;
31
32#include "AMDGPUGenCallingConv.inc"
33
35 "amdgpu-bypass-slow-div",
36 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
37 cl::init(true));
38
39// Find a larger type to do a load / store of a vector with.
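// For example, v2i8 (16 bits) becomes i16 and v4i16 (64 bits) becomes v2i32;
// a size that is neither <= 32 bits nor a multiple of 32 (e.g. v3i16) is
// returned unchanged.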
41 unsigned StoreSize = VT.getStoreSizeInBits();
42 if (StoreSize <= 32)
43 return EVT::getIntegerVT(Ctx, StoreSize);
44
45 if (StoreSize % 32 == 0)
46 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
47
48 return VT;
49}
50
53}
54
56 // In order for this to be a signed 24-bit value, bit 23 must
57 // be a sign bit.
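  // For example, an i32 value sign-extended from i16 has at most 16
  // significant bits here, so a <= 24 check on this result (as the 24-bit
  // multiply combines use) accepts it.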
58 return DAG.ComputeMaxSignificantBits(Op);
59}
60
62 const AMDGPUSubtarget &STI)
63 : TargetLowering(TM), Subtarget(&STI) {
64 // Always lower memset, memcpy, and memmove intrinsics to load/store
65 // instructions, rather than generating calls to memset, memcpy, or memmove.
69
70 // Enable ganging up loads and stores in the memcpy DAG lowering.
72
73 // Lower floating point store/load to integer store/load to reduce the number
74 // of patterns in tablegen.
76 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
77
79 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
80
82 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
83
85 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
86
88 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
89
91 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
92
94 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
95
97 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
98
100 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
101
102 setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
103 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
104
105 setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
106 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
107
108 setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
109 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
110
111 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
112 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
113
114 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
115 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
116
118 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
119
121 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
122
124 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
125
127 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
128
130 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
131
133 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
134
136 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
137
139 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
140
142 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
143
145 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
146
147 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
148 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
149
150 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
151 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
152
154 AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
155
156 // TODO: Would be better to consume as directly legal
158 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
159
161 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
162
164 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
165
167 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);
168
170 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);
171
173 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);
174
176 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);
177
179 AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);
180
181 // There are no 64-bit extloads. These should be done as a 32-bit extload and
182 // an extension to 64-bit.
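 // For example, (zextload i64 from i8) is expanded into a 32-bit zextload
 // whose result is then zero-extended to i64.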
183 for (MVT VT : MVT::integer_valuetypes())
185 Expand);
186
187 for (MVT VT : MVT::integer_valuetypes()) {
188 if (VT == MVT::i64)
189 continue;
190
191 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
192 setLoadExtAction(Op, VT, MVT::i1, Promote);
193 setLoadExtAction(Op, VT, MVT::i8, Legal);
194 setLoadExtAction(Op, VT, MVT::i16, Legal);
195 setLoadExtAction(Op, VT, MVT::i32, Expand);
196 }
197 }
198
200 for (auto MemVT :
201 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
203 Expand);
204
205 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
206 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
207 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
208 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
209 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
210 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
211 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
212 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
213 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
214 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
215 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
216 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
217 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
218 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);
219
220 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
221 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
222 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
223 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
224 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
225 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
226
227 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
228 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
229 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
230 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
231 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
232 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
233 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
234 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
235 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
236 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
237 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
238 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);
239
241 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
242
244 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
245
247 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
248
250 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
251
253 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
254
256 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
257
259 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
260
262 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
263
265 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
266
268 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
269
271 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
272
274 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
275
277 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
278
280 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
281
283 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
284
286 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
287
289 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
290
292 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
293
295 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
296
298 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
299
301 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
302
304 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
305
307 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
308
310 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
311
313 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
314
316 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
317
319 AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);
320
321 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
322 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
323 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
324 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
325
326 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
327 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
328 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
329 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
330
331 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
332 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
333 setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
334 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
335 setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
336 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
337 setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
338 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
339 setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
340 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
341 setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
342 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
343 setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
344 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
345
346 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
347 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
348 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
349
350 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
351 setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
352 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
353
354 setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);
355
356 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
357 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
358 setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
359 setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
360 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
361 setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
362 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
363
364 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
365 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
366 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
367 setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
368 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
369
370 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
371 setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
372 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
373
374 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
375 setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
376 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
377 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
379 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
381 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
382
383 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
384 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
385
387
388 // For R600, this is totally unsupported; just custom lower to produce an
389 // error.
391
392 // Library functions. These default to Expand, but we have instructions
393 // for them.
396 MVT::f32, Legal);
397
399 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
401 {MVT::f16, MVT::f32, MVT::f64}, Expand);
402
405 Custom);
406
407 setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
408
409 setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
410
411 setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
412 Expand);
413
414 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
415
416 if (Subtarget->has16BitInsts())
417 setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
418 else {
419 setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
421 }
422
424 Custom);
425
426 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
427 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
428 // default unless marked custom/legal.
431 {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16, MVT::v2f32, MVT::v3f32,
432 MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
433 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64, MVT::v16f64},
434 Custom);
435
436 // Expand to fneg + fadd.
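 // i.e. (fsub x, y) is lowered as (fadd x, (fneg y)).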
438
440 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
441 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
442 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
443 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
444 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
445 Custom);
446
449 {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
450 MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
451 MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
452 MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
453 MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
454 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
455 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
456 Custom);
457
459 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
460
461 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
462 for (MVT VT : ScalarIntVTs) {
463 // These should use [SU]DIVREM, so set them to expand
465 Expand);
466
467 // GPU does not have divrem function for signed or unsigned.
469
470 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
472
474
475 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
477 }
478
479 // The hardware supports 32-bit FSHR, but not FSHL.
481
482 // The hardware supports 32-bit ROTR, but not ROTL.
483 setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
485
487
491 MVT::i64, Custom);
493
495 Legal);
496
499 MVT::i64, Custom);
500
501 for (auto VT : {MVT::i8, MVT::i16})
503
504 static const MVT::SimpleValueType VectorIntTypes[] = {
505 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
506 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
507
508 for (MVT VT : VectorIntTypes) {
509 // Expand the following operations for the current type by default.
521 ISD::SETCC},
522 VT, Expand);
523 }
524
525 static const MVT::SimpleValueType FloatVectorTypes[] = {
526 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
527 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
528
529 for (MVT VT : FloatVectorTypes) {
542 VT, Expand);
543 }
544
545 // This causes the use of an unrolled select operation rather than expansion
546 // with bit operations. This is in general better, but the alternative using
547 // BFI instructions may be better if the select sources are SGPRs.
549 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
550
552 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
553
555 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
556
558 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
559
561 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
562
564 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
565
567 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
568
570 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
571
573 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
574
576 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
577
579 setJumpIsExpensive(true);
580
581 // FIXME: This is only partially true. If we have to do vector compares, any
582 // SGPR pair can be a condition register. If we have a uniform condition, we
583 // are better off doing SALU operations, where there is only one SCC. For now,
584 // we don't have a way of knowing during instruction selection if a condition
585 // will be uniform and we always use vector compares. Assume we are using
586 // vector compares until that is fixed.
588
591
593
594 // We want to find all load dependencies for long chains of stores to enable
595 // merging into very wide vectors. The problem is with vectors with > 4
596 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
597 // vectors are a legal type, even though we have to split the loads
598 // usually. When we can more precisely specify load legality per address
599 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
600 // smarter so that they can figure out what to do in 2 iterations without all
601 // N > 4 stores on the same chain.
603
604 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
605 // about these during lowering.
606 MaxStoresPerMemcpy = 0xffffffff;
607 MaxStoresPerMemmove = 0xffffffff;
608 MaxStoresPerMemset = 0xffffffff;
609
610 // The expansion for 64-bit division is enormous.
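 // addBypassSlowDiv(64, 32) lets CodeGenPrepare emit a runtime bypass: when
 // both i64 operands happen to fit in 32 bits, the much cheaper 32-bit divide
 // is used instead.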
612 addBypassSlowDiv(64, 32);
613
624
628}
629
631 if (getTargetMachine().Options.NoSignedZerosFPMath)
632 return true;
633
634 const auto Flags = Op.getNode()->getFlags();
635 if (Flags.hasNoSignedZeros())
636 return true;
637
638 return false;
639}
640
641//===----------------------------------------------------------------------===//
642// Target Information
643//===----------------------------------------------------------------------===//
644
646static bool fnegFoldsIntoOpcode(unsigned Opc) {
647 switch (Opc) {
648 case ISD::FADD:
649 case ISD::FSUB:
650 case ISD::FMUL:
651 case ISD::FMA:
652 case ISD::FMAD:
653 case ISD::FMINNUM:
654 case ISD::FMAXNUM:
657 case ISD::FMINIMUM:
658 case ISD::FMAXIMUM:
659 case ISD::SELECT:
660 case ISD::FSIN:
661 case ISD::FTRUNC:
662 case ISD::FRINT:
663 case ISD::FNEARBYINT:
664 case ISD::FROUNDEVEN:
666 case AMDGPUISD::RCP:
673 case AMDGPUISD::FMED3:
674 // TODO: handle llvm.amdgcn.fma.legacy
675 return true;
676 case ISD::BITCAST:
677 llvm_unreachable("bitcast is special cased");
678 default:
679 return false;
680 }
681}
682
683static bool fnegFoldsIntoOp(const SDNode *N) {
684 unsigned Opc = N->getOpcode();
685 if (Opc == ISD::BITCAST) {
686 // TODO: Is there a benefit to checking the conditions performFNegCombine
687 // does? We don't for the other cases.
688 SDValue BCSrc = N->getOperand(0);
689 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
690 return BCSrc.getNumOperands() == 2 &&
691 BCSrc.getOperand(1).getValueSizeInBits() == 32;
692 }
693
694 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
695 }
696
697 return fnegFoldsIntoOpcode(Opc);
698}
699
700/// Returns true if the operation will definitely need to use a 64-bit
701/// encoding, and thus will use a VOP3 encoding regardless of the source
702/// modifiers.
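/// For example, FMA/FMAD always take three operands and most f64 VALU
/// operations only have VOP3 forms, so a source modifier on them never costs
/// additional encoding size.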
704static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
705 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
706 VT == MVT::f64;
707}
708
709/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for
710/// the value type used by ISD::SELECT.
712static bool selectSupportsSourceMods(const SDNode *N) {
713 // TODO: Only applies if select will be vector
714 return N->getValueType(0) == MVT::f32;
715}
716
717// Most FP instructions support source modifiers, but this could be refined
718// slightly.
720static bool hasSourceMods(const SDNode *N) {
721 if (isa<MemSDNode>(N))
722 return false;
723
724 switch (N->getOpcode()) {
725 case ISD::CopyToReg:
726 case ISD::FDIV:
727 case ISD::FREM:
728 case ISD::INLINEASM:
732
733 // TODO: Should really be looking at the users of the bitcast. These are
734 // problematic because bitcasts are used to legalize all stores to integer
735 // types.
736 case ISD::BITCAST:
737 return false;
739 switch (N->getConstantOperandVal(0)) {
740 case Intrinsic::amdgcn_interp_p1:
741 case Intrinsic::amdgcn_interp_p2:
742 case Intrinsic::amdgcn_interp_mov:
743 case Intrinsic::amdgcn_interp_p1_f16:
744 case Intrinsic::amdgcn_interp_p2_f16:
745 return false;
746 default:
747 return true;
748 }
749 }
750 case ISD::SELECT:
752 default:
753 return true;
754 }
755}
756
758 unsigned CostThreshold) {
759 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
760 // it is truly free to use a source modifier in all cases. If there are
761 // multiple users, and folding the modifier into each of them would force a
762 // VOP3 encoding, there will be a code size increase. Try to avoid increasing
763 // code size unless we know it will save on the instruction count.
764 unsigned NumMayIncreaseSize = 0;
765 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
766
767 assert(!N->use_empty());
768
769 // XXX - Should this limit number of uses to check?
770 for (const SDNode *U : N->uses()) {
771 if (!hasSourceMods(U))
772 return false;
773
774 if (!opMustUseVOP3Encoding(U, VT)) {
775 if (++NumMayIncreaseSize > CostThreshold)
776 return false;
777 }
778 }
779
780 return true;
781}
782
784 ISD::NodeType ExtendKind) const {
785 assert(!VT.isVector() && "only scalar expected");
786
787 // Round to the next multiple of 32 bits.
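  // For example, an i40 return value widens to i64 and an i20 to i32.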
788 unsigned Size = VT.getSizeInBits();
789 if (Size <= 32)
790 return MVT::i32;
791 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
792}
793
795 return MVT::i32;
796}
797
799 return true;
800}
801
802// The backend supports 32- and 64-bit floating-point immediates.
803// FIXME: Why are we reporting vectors of FP immediates as legal?
805 bool ForCodeSize) const {
806 EVT ScalarVT = VT.getScalarType();
807 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
808 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
809}
810
811// We don't want to shrink f64 / f32 constants.
813 EVT ScalarVT = VT.getScalarType();
814 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
815}
816
818 ISD::LoadExtType ExtTy,
819 EVT NewVT) const {
820 // TODO: This may be worth removing. Check regression tests for diffs.
822 return false;
823
824 unsigned NewSize = NewVT.getStoreSizeInBits();
825
826 // If we are reducing to a 32-bit load or a smaller multi-dword load,
827 // this is always better.
828 if (NewSize >= 32)
829 return true;
830
831 EVT OldVT = N->getValueType(0);
832 unsigned OldSize = OldVT.getStoreSizeInBits();
833
834 MemSDNode *MN = cast<MemSDNode>(N);
835 unsigned AS = MN->getAddressSpace();
836 // Do not shrink an aligned scalar load to sub-dword.
837 // Scalar engine cannot do sub-dword loads.
838 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
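  // For example, narrowing a naturally aligned SMEM i64 load to an i16 extload
  // is rejected here, while narrowing it to i32 was already accepted above.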
839 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
842 (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
843 MN->isInvariant())) &&
845 return false;
846
847 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
848 // extloads, so doing one requires using a buffer_load. In cases where we
849 // still couldn't use a scalar load, using the wider load shouldn't really
850 // hurt anything.
851
852 // If the old size already had to be an extload, there's no harm in continuing
853 // to reduce the width.
854 return (OldSize < 32);
855}
856
858 const SelectionDAG &DAG,
859 const MachineMemOperand &MMO) const {
860
861 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
862
863 if (LoadTy.getScalarType() == MVT::i32)
864 return false;
865
866 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
867 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
868
869 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
870 return false;
871
872 unsigned Fast = 0;
874 CastTy, MMO, &Fast) &&
875 Fast;
876}
877
878// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
879// profitable with the expansion for 64-bit since it's generally good to
880// speculate things.
882 return true;
883}
884
886 return true;
887}
888
890 switch (N->getOpcode()) {
891 case ISD::EntryToken:
892 case ISD::TokenFactor:
893 return true;
895 unsigned IntrID = N->getConstantOperandVal(0);
897 }
898 case ISD::LOAD:
899 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
901 return true;
902 return false;
903 case AMDGPUISD::SETCC: // ballot-style instruction
904 return true;
905 }
906 return false;
907}
908
910 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
911 NegatibleCost &Cost, unsigned Depth) const {
912
913 switch (Op.getOpcode()) {
914 case ISD::FMA:
915 case ISD::FMAD: {
916 // Negating a fma is not free if it has users without source mods.
917 if (!allUsesHaveSourceMods(Op.getNode()))
918 return SDValue();
919 break;
920 }
921 case AMDGPUISD::RCP: {
922 SDValue Src = Op.getOperand(0);
923 EVT VT = Op.getValueType();
924 SDLoc SL(Op);
925
926 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
927 ForCodeSize, Cost, Depth + 1);
928 if (NegSrc)
929 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
930 return SDValue();
931 }
932 default:
933 break;
934 }
935
936 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
937 ForCodeSize, Cost, Depth);
938}
939
940//===---------------------------------------------------------------------===//
941// Target Properties
942//===---------------------------------------------------------------------===//
943
946
947 // Packed operations do not have a fabs modifier.
948 return VT == MVT::f32 || VT == MVT::f64 ||
949 (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16));
950}
951
954 // Report this based on the end legalized type.
955 VT = VT.getScalarType();
956 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
957}
958
960 unsigned NumElem,
961 unsigned AS) const {
962 return true;
963}
964
966 // There are few operations which truly have vector input operands. Any vector
967 // operation is going to involve operations on each component, and a
968 // build_vector will be a copy per element, so it always makes sense to use a
969 // build_vector input in place of the extracted element to avoid a copy into a
970 // super register.
971 //
972 // We should probably only do this if all users are extracts only, but this
973 // should be the common case.
974 return true;
975}
976
978 // Truncate is just accessing a subregister.
979
980 unsigned SrcSize = Source.getSizeInBits();
981 unsigned DestSize = Dest.getSizeInBits();
982
983 return DestSize < SrcSize && DestSize % 32 == 0;
984}
985
987 // Truncate is just accessing a subregister.
988
989 unsigned SrcSize = Source->getScalarSizeInBits();
990 unsigned DestSize = Dest->getScalarSizeInBits();
991
992 if (DestSize == 16 && Subtarget->has16BitInsts())
993 return SrcSize >= 32;
994
995 return DestSize < SrcSize && DestSize % 32 == 0;
996}
997
999 unsigned SrcSize = Src->getScalarSizeInBits();
1000 unsigned DestSize = Dest->getScalarSizeInBits();
1001
1002 if (SrcSize == 16 && Subtarget->has16BitInsts())
1003 return DestSize >= 32;
1004
1005 return SrcSize == 32 && DestSize == 64;
1006}
1007
1009 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
1010 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
1011 // this will enable reducing 64-bit operations to 32-bit, which is always
1012 // good.
1013
1014 if (Src == MVT::i16)
1015 return Dest == MVT::i32 || Dest == MVT::i64;
1016
1017 return Src == MVT::i32 && Dest == MVT::i64;
1018}
1019
1021 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
1022 // limited number of native 64-bit operations. Shrinking an operation to fit
1023 // in a single 32-bit register should always be helpful. As currently used,
1024 // this is much less general than the name suggests, and is only used in
1025 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
1026 // not profitable, and may actually be harmful.
1027 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
1028}
1029
1031 const SDNode* N, CombineLevel Level) const {
1032 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
1033 N->getOpcode() == ISD::SRL) &&
1034 "Expected shift op");
1035 // Always commute pre-type legalization and right shifts.
1036 // We're looking for shl(or(x,y),z) patterns.
1038 N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
1039 return true;
1040
1041 // If the only user is an i32 right-shift, then don't destroy a BFE pattern.
1042 if (N->getValueType(0) == MVT::i32 && N->use_size() == 1 &&
1043 (N->use_begin()->getOpcode() == ISD::SRA ||
1044 N->use_begin()->getOpcode() == ISD::SRL))
1045 return false;
1046
1047 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
1048 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
1049 if (LHS.getOpcode() != ISD::SHL)
1050 return false;
1051 auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
1052 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
1053 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
1054 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
1055 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
1056 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
1057 };
1058 SDValue LHS = N->getOperand(0).getOperand(0);
1059 SDValue RHS = N->getOperand(0).getOperand(1);
1060 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
1061}
1062
1063//===---------------------------------------------------------------------===//
1064// TargetLowering Callbacks
1065//===---------------------------------------------------------------------===//
1066
1068 bool IsVarArg) {
1069 switch (CC) {
1077 return CC_AMDGPU;
1080 return CC_AMDGPU_CS_CHAIN;
1081 case CallingConv::C:
1082 case CallingConv::Fast:
1083 case CallingConv::Cold:
1084 return CC_AMDGPU_Func;
1086 return CC_SI_Gfx;
1089 default:
1090 report_fatal_error("Unsupported calling convention for call");
1091 }
1092}
1093
1095 bool IsVarArg) {
1096 switch (CC) {
1099 llvm_unreachable("kernels should not be handled here");
1109 return RetCC_SI_Shader;
1111 return RetCC_SI_Gfx;
1112 case CallingConv::C:
1113 case CallingConv::Fast:
1114 case CallingConv::Cold:
1115 return RetCC_AMDGPU_Func;
1116 default:
1117 report_fatal_error("Unsupported calling convention.");
1118 }
1119}
1120
1121/// The SelectionDAGBuilder will automatically promote function arguments
1122/// with illegal types. However, this does not work for the AMDGPU targets
1123/// since the function arguments are stored in memory as these illegal types.
1124/// In order to handle this properly we need to get the original type sizes
1125/// from the LLVM IR Function and fix up the ISD::InputArg values before
1126/// passing them to AnalyzeFormalArguments().
1127
1128/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1129/// input values across multiple registers. Each item in the Ins array
1130/// represents a single value that will be stored in registers. Ins[x].VT is
1131/// the value type of the value that will be stored in the register, so
1132/// whatever SDNode we lower the argument to needs to be this type.
1133///
1134/// In order to correctly lower the arguments we need to know the size of each
1135/// argument. Since Ins[x].VT gives us the size of the register that will
1136/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1137/// for the original function argument so that we can deduce the correct memory
1138/// type to use for Ins[x]. In most cases the correct memory type will be
1139/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1140/// we have a kernel argument of type v8i8, this argument will be split into
1141/// 8 parts and each part will be represented by its own item in the Ins array.
1142/// For each part, Ins[x].ArgVT will be v8i8, which is the full type of
1143/// the argument before it was split. From this, we deduce that the memory type
1144/// for each individual part is i8. We pass the memory type as LocVT to the
1145/// calling convention analysis function and the register type (Ins[x].VT) as
1146/// the ValVT.
1148 CCState &State,
1149 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1150 const MachineFunction &MF = State.getMachineFunction();
1151 const Function &Fn = MF.getFunction();
1152 LLVMContext &Ctx = Fn.getParent()->getContext();
1153 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
1154 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
1156
1157 Align MaxAlign = Align(1);
1158 uint64_t ExplicitArgOffset = 0;
1159 const DataLayout &DL = Fn.getDataLayout();
1160
1161 unsigned InIndex = 0;
1162
1163 for (const Argument &Arg : Fn.args()) {
1164 const bool IsByRef = Arg.hasByRefAttr();
1165 Type *BaseArgTy = Arg.getType();
1166 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1167 Align Alignment = DL.getValueOrABITypeAlignment(
1168 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1169 MaxAlign = std::max(Alignment, MaxAlign);
1170 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1171
1172 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1173 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1174
1175 // We're basically throwing away everything passed into us and starting over
1176 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1177 // to us as computed in Ins.
1178 //
1179 // We also need to figure out what type legalization is trying to do to get
1180 // the correct memory offsets.
1181
1182 SmallVector<EVT, 16> ValueVTs;
1184 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
1185
1186 for (unsigned Value = 0, NumValues = ValueVTs.size();
1187 Value != NumValues; ++Value) {
1188 uint64_t BasePartOffset = Offsets[Value];
1189
1190 EVT ArgVT = ValueVTs[Value];
1191 EVT MemVT = ArgVT;
1192 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1193 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1194
1195 if (NumRegs == 1) {
1196 // This argument is not split, so the IR type is the memory type.
1197 if (ArgVT.isExtended()) {
1198 // We have an extended type, like i24, so we should just use the
1199 // register type.
1200 MemVT = RegisterVT;
1201 } else {
1202 MemVT = ArgVT;
1203 }
1204 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1205 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1206 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1207 // We have a vector value which has been split into a vector with
1208 // the same scalar type, but fewer elements. This should handle
1209 // all the floating-point vector types.
1210 MemVT = RegisterVT;
1211 } else if (ArgVT.isVector() &&
1212 ArgVT.getVectorNumElements() == NumRegs) {
1213 // This arg has been split so that each element is stored in a separate
1214 // register.
1215 MemVT = ArgVT.getScalarType();
1216 } else if (ArgVT.isExtended()) {
1217 // We have an extended type, like i65.
1218 MemVT = RegisterVT;
1219 } else {
1220 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1221 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1222 if (RegisterVT.isInteger()) {
1223 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1224 } else if (RegisterVT.isVector()) {
1225 assert(!RegisterVT.getScalarType().isFloatingPoint());
1226 unsigned NumElements = RegisterVT.getVectorNumElements();
1227 assert(MemoryBits % NumElements == 0);
1228 // This vector type has been split into another vector type with
1229 // a different element size.
1230 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1231 MemoryBits / NumElements);
1232 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1233 } else {
1234 llvm_unreachable("cannot deduce memory type.");
1235 }
1236 }
1237
1238 // Convert one element vectors to scalar.
1239 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1240 MemVT = MemVT.getScalarType();
1241
1242 // Round up vec3/vec5 arguments.
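      // For example, a v3i32 memory type becomes v4i32 and v5i16 becomes v8i16.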
1243 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1244 MemVT = MemVT.getPow2VectorType(State.getContext());
1245 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1246 MemVT = MemVT.getRoundIntegerType(State.getContext());
1247 }
1248
1249 unsigned PartOffset = 0;
1250 for (unsigned i = 0; i != NumRegs; ++i) {
1251 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1252 BasePartOffset + PartOffset,
1253 MemVT.getSimpleVT(),
1255 PartOffset += MemVT.getStoreSize();
1256 }
1257 }
1258 }
1259}
1260
1262 SDValue Chain, CallingConv::ID CallConv,
1263 bool isVarArg,
1265 const SmallVectorImpl<SDValue> &OutVals,
1266 const SDLoc &DL, SelectionDAG &DAG) const {
1267 // FIXME: Fails for r600 tests
1268 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1269 // "wave terminate should not have return values");
1270 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1271}
1272
1273//===---------------------------------------------------------------------===//
1274// Target specific lowering
1275//===---------------------------------------------------------------------===//
1276
1277/// Selects the correct CCAssignFn for a given CallingConvention value.
1279 bool IsVarArg) {
1280 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1281}
1282
1284 bool IsVarArg) {
1286}
1287
1289 SelectionDAG &DAG,
1290 MachineFrameInfo &MFI,
1291 int ClobberedFI) const {
1292 SmallVector<SDValue, 8> ArgChains;
1293 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1294 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1295
1296 // Include the original chain at the beginning of the list. When this is
1297 // used by target LowerCall hooks, this helps legalize find the
1298 // CALLSEQ_BEGIN node.
1299 ArgChains.push_back(Chain);
1300
1301 // Add a chain value for each stack argument load that overlaps the clobbered object.
1302 for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
1303 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1304 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1305 if (FI->getIndex() < 0) {
1306 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1307 int64_t InLastByte = InFirstByte;
1308 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1309
1310 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1311 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1312 ArgChains.push_back(SDValue(L, 1));
1313 }
1314 }
1315 }
1316 }
1317
1318 // Build a tokenfactor for all the chains.
1319 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1320}
1321
1324 StringRef Reason) const {
1325 SDValue Callee = CLI.Callee;
1326 SelectionDAG &DAG = CLI.DAG;
1327
1328 const Function &Fn = DAG.getMachineFunction().getFunction();
1329
1330 StringRef FuncName("<unknown>");
1331
1332 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1333 FuncName = G->getSymbol();
1334 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1335 FuncName = G->getGlobal()->getName();
1336
1338 Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1339 DAG.getContext()->diagnose(NoCalls);
1340
1341 if (!CLI.IsTailCall) {
1342 for (ISD::InputArg &Arg : CLI.Ins)
1343 InVals.push_back(DAG.getUNDEF(Arg.VT));
1344 }
1345
1346 return DAG.getEntryNode();
1347}
1348
1350 SmallVectorImpl<SDValue> &InVals) const {
1351 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1352}
1353
1355 SelectionDAG &DAG) const {
1356 const Function &Fn = DAG.getMachineFunction().getFunction();
1357
1358 DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1359 SDLoc(Op).getDebugLoc());
1360 DAG.getContext()->diagnose(NoDynamicAlloca);
1361 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1362 return DAG.getMergeValues(Ops, SDLoc());
1363}
1364
1366 SelectionDAG &DAG) const {
1367 switch (Op.getOpcode()) {
1368 default:
1369 Op->print(errs(), &DAG);
1370 llvm_unreachable("Custom lowering code for this "
1371 "instruction is not implemented yet!");
1372 break;
1374 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1376 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1377 case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1378 case ISD::FREM: return LowerFREM(Op, DAG);
1379 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1380 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1381 case ISD::FRINT: return LowerFRINT(Op, DAG);
1382 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1383 case ISD::FROUNDEVEN:
1384 return LowerFROUNDEVEN(Op, DAG);
1385 case ISD::FROUND: return LowerFROUND(Op, DAG);
1386 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1387 case ISD::FLOG2:
1388 return LowerFLOG2(Op, DAG);
1389 case ISD::FLOG:
1390 case ISD::FLOG10:
1391 return LowerFLOGCommon(Op, DAG);
1392 case ISD::FEXP:
1393 case ISD::FEXP10:
1394 return lowerFEXP(Op, DAG);
1395 case ISD::FEXP2:
1396 return lowerFEXP2(Op, DAG);
1397 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1398 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1399 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1400 case ISD::FP_TO_SINT:
1401 case ISD::FP_TO_UINT:
1402 return LowerFP_TO_INT(Op, DAG);
1403 case ISD::CTTZ:
1405 case ISD::CTLZ:
1407 return LowerCTLZ_CTTZ(Op, DAG);
1409 }
1410 return Op;
1411}
1412
1415 SelectionDAG &DAG) const {
1416 switch (N->getOpcode()) {
1418 // Different parts of legalization seem to interpret which type of
1419 // sign_extend_inreg is the one to check for custom lowering. The extended
1420 // from type is what really matters, but some places check for custom
1421 // lowering of the result type. This results in trying to use
1422 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1423 // nothing here and let the illegal result integer be handled normally.
1424 return;
1425 case ISD::FLOG2:
1426 if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
1427 Results.push_back(Lowered);
1428 return;
1429 case ISD::FLOG:
1430 case ISD::FLOG10:
1431 if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
1432 Results.push_back(Lowered);
1433 return;
1434 case ISD::FEXP2:
1435 if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
1436 Results.push_back(Lowered);
1437 return;
1438 case ISD::FEXP:
1439 case ISD::FEXP10:
1440 if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
1441 Results.push_back(Lowered);
1442 return;
1443 case ISD::CTLZ:
1445 if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
1446 Results.push_back(Lowered);
1447 return;
1448 default:
1449 return;
1450 }
1451}
1452
1454 SDValue Op,
1455 SelectionDAG &DAG) const {
1456
1457 const DataLayout &DL = DAG.getDataLayout();
1458 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1459 const GlobalValue *GV = G->getGlobal();
1460
1461 if (!MFI->isModuleEntryFunction()) {
1462 if (std::optional<uint32_t> Address =
1464 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1465 }
1466 }
1467
1468 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1469 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1470 if (!MFI->isModuleEntryFunction() &&
1471 GV->getName() != "llvm.amdgcn.module.lds") {
1472 SDLoc DL(Op);
1473 const Function &Fn = DAG.getMachineFunction().getFunction();
1474 DiagnosticInfoUnsupported BadLDSDecl(
1475 Fn, "local memory global used by non-kernel function",
1476 DL.getDebugLoc(), DS_Warning);
1477 DAG.getContext()->diagnose(BadLDSDecl);
1478
1479 // We currently don't have a way to correctly allocate LDS objects that
1480 // aren't directly associated with a kernel. We do force inlining of
1481 // functions that use local objects. However, if these dead functions are
1482 // not eliminated, we don't want a compile time error. Just emit a warning
1483 // and a trap, since there should be no callable path here.
1484 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1485 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1486 Trap, DAG.getRoot());
1487 DAG.setRoot(OutputChain);
1488 return DAG.getUNDEF(Op.getValueType());
1489 }
1490
1491 // XXX: What does the value of G->getOffset() mean?
1492 assert(G->getOffset() == 0 &&
1493 "Do not know what to do with an non-zero offset");
1494
1495 // TODO: We could emit code to handle the initialization somewhere.
1496 // We ignore the initializer for now and legalize it to allow selection.
1497 // The initializer will anyway get errored out during assembly emission.
1498 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1499 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1500 }
1501 return SDValue();
1502}
1503
1505 SelectionDAG &DAG) const {
1507 SDLoc SL(Op);
1508
1509 EVT VT = Op.getValueType();
1510 if (VT.getVectorElementType().getSizeInBits() < 32) {
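    // Sub-32-bit element vectors are handled by bitcasting each operand to i32
    // pieces; e.g. concatenating two v2f16 operands builds a v2i32 vector that
    // is bitcast back to v4f16.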
1511 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1512 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1513 unsigned NewNumElt = OpBitSize / 32;
1514 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1516 MVT::i32, NewNumElt);
1517 for (const SDUse &U : Op->ops()) {
1518 SDValue In = U.get();
1519 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
1520 if (NewNumElt > 1)
1521 DAG.ExtractVectorElements(NewIn, Args);
1522 else
1523 Args.push_back(NewIn);
1524 }
1525
1526 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1527 NewNumElt * Op.getNumOperands());
1528 SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1529 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1530 }
1531 }
1532
1533 for (const SDUse &U : Op->ops())
1534 DAG.ExtractVectorElements(U.get(), Args);
1535
1536 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1537}
1538
1540 SelectionDAG &DAG) const {
1541 SDLoc SL(Op);
1543 unsigned Start = Op.getConstantOperandVal(1);
1544 EVT VT = Op.getValueType();
1545 EVT SrcVT = Op.getOperand(0).getValueType();
1546
1547 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1548 unsigned NumElt = VT.getVectorNumElements();
1549 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1550 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1551
1552 // Extract 32-bit registers at a time.
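    // For example, a v2f16 slice starting at element 2 of a v8f16 source
    // becomes an extract of element 1 from the source's v4i32 bitcast.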
1553 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1554 EVT NewVT = NumElt == 2
1555 ? MVT::i32
1556 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1557 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1558
1559 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
1560 if (NumElt == 2)
1561 Tmp = Args[0];
1562 else
1563 Tmp = DAG.getBuildVector(NewVT, SL, Args);
1564
1565 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1566 }
1567
1568 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1570
1571 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1572}
1573
1574// TODO: Handle fabs too
1576 if (Val.getOpcode() == ISD::FNEG)
1577 return Val.getOperand(0);
1578
1579 return Val;
1580}
1581
1583 if (Val.getOpcode() == ISD::FNEG)
1584 Val = Val.getOperand(0);
1585 if (Val.getOpcode() == ISD::FABS)
1586 Val = Val.getOperand(0);
1587 if (Val.getOpcode() == ISD::FCOPYSIGN)
1588 Val = Val.getOperand(0);
1589 return Val;
1590}
1591
1593 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1594 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1595 SelectionDAG &DAG = DCI.DAG;
1596 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1597 switch (CCOpcode) {
1598 case ISD::SETOEQ:
1599 case ISD::SETONE:
1600 case ISD::SETUNE:
1601 case ISD::SETNE:
1602 case ISD::SETUEQ:
1603 case ISD::SETEQ:
1604 case ISD::SETFALSE:
1605 case ISD::SETFALSE2:
1606 case ISD::SETTRUE:
1607 case ISD::SETTRUE2:
1608 case ISD::SETUO:
1609 case ISD::SETO:
1610 break;
1611 case ISD::SETULE:
1612 case ISD::SETULT: {
1613 if (LHS == True)
1614 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1615 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1616 }
1617 case ISD::SETOLE:
1618 case ISD::SETOLT:
1619 case ISD::SETLE:
1620 case ISD::SETLT: {
1621 // Ordered. Assume ordered for undefined.
1622
1623 // Only do this after legalization to avoid interfering with other combines
1624 // which might occur.
1626 !DCI.isCalledByLegalizer())
1627 return SDValue();
1628
1629 // We need to permute the operands to get the correct NaN behavior. The
1630 // selected operand is the second one based on the failing compare with NaN,
1631 // so permute it based on the compare type the hardware uses.
1632 if (LHS == True)
1633 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1634 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1635 }
1636 case ISD::SETUGE:
1637 case ISD::SETUGT: {
1638 if (LHS == True)
1639 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1640 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1641 }
1642 case ISD::SETGT:
1643 case ISD::SETGE:
1644 case ISD::SETOGE:
1645 case ISD::SETOGT: {
1647 !DCI.isCalledByLegalizer())
1648 return SDValue();
1649
1650 if (LHS == True)
1651 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1652 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1653 }
1654 case ISD::SETCC_INVALID:
1655 llvm_unreachable("Invalid setcc condcode!");
1656 }
1657 return SDValue();
1658}
1659
1660/// Generate Min/Max node
1662 SDValue LHS, SDValue RHS,
1663 SDValue True, SDValue False,
1664 SDValue CC,
1665 DAGCombinerInfo &DCI) const {
1666 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1667 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1668
1669 SelectionDAG &DAG = DCI.DAG;
1670
1671 // If we can't directly match this, try to see if we can fold an fneg to
1672 // match.
1673
1674 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
1675 ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False);
1676 SDValue NegTrue = peekFNeg(True);
1677
1678 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1679 // fmin/fmax.
1680 //
1681 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1682 // -> fneg (fmin_legacy lhs, K)
1683 //
1684 // TODO: Use getNegatedExpression
1685 if (LHS == NegTrue && CFalse && CRHS) {
1686 APFloat NegRHS = neg(CRHS->getValueAPF());
1687 if (NegRHS == CFalse->getValueAPF()) {
1688 SDValue Combined =
1689 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1690 if (Combined)
1691 return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1692 return SDValue();
1693 }
1694 }
1695
1696 return SDValue();
1697}
1698
1699std::pair<SDValue, SDValue>
1701 SDLoc SL(Op);
1702
1703 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1704
1705 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1706 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1707
1708 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1709 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1710
1711 return std::pair(Lo, Hi);
1712}
1713
1715 SDLoc SL(Op);
1716
1717 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1718 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1719 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1720}
1721
1723 SDLoc SL(Op);
1724
1725 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1726 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1727 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1728}
1729
1730// Split a vector type into two parts. The first part has a power-of-two number of elements.
1731// The second part is whatever is left over, and is a scalar if it would
1732// otherwise be a 1-vector.
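// For example, v3f32 splits into (v2f32, f32) and v8i16 into (v4i16, v4i16).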
1733std::pair<EVT, EVT>
1735 EVT LoVT, HiVT;
1736 EVT EltVT = VT.getVectorElementType();
1737 unsigned NumElts = VT.getVectorNumElements();
1738 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1739 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1740 HiVT = NumElts - LoNumElts == 1
1741 ? EltVT
1742 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1743 return std::pair(LoVT, HiVT);
1744}
1745
1746// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1747// scalar.
1748std::pair<SDValue, SDValue>
1750 const EVT &LoVT, const EVT &HiVT,
1751 SelectionDAG &DAG) const {
1753 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1754 N.getValueType().getVectorNumElements() &&
1755 "More vector elements requested than available!");
1757 DAG.getVectorIdxConstant(0, DL));
1758 SDValue Hi = DAG.getNode(
1760 HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
1761 return std::pair(Lo, Hi);
1762}
1763
1765 SelectionDAG &DAG) const {
1766 LoadSDNode *Load = cast<LoadSDNode>(Op);
1767 EVT VT = Op.getValueType();
1768 SDLoc SL(Op);
1769
1770
1771 // If this is a 2 element vector, we really want to scalarize and not create
1772 // weird 1 element vectors.
1773 if (VT.getVectorNumElements() == 2) {
1774 SDValue Ops[2];
1775 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1776 return DAG.getMergeValues(Ops, SL);
1777 }
1778
1779 SDValue BasePtr = Load->getBasePtr();
1780 EVT MemVT = Load->getMemoryVT();
1781
1782 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1783
1784 EVT LoVT, HiVT;
1785 EVT LoMemVT, HiMemVT;
1786 SDValue Lo, Hi;
1787
1788 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1789 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1790 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1791
1792 unsigned Size = LoMemVT.getStoreSize();
1793 Align BaseAlign = Load->getAlign();
1794 Align HiAlign = commonAlignment(BaseAlign, Size);
1795
1796 SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1797 Load->getChain(), BasePtr, SrcValue, LoMemVT,
1798 BaseAlign, Load->getMemOperand()->getFlags());
1799 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
1800 SDValue HiLoad =
1801 DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1802 HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1803 HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1804
1805 SDValue Join;
1806 if (LoVT == HiVT) {
1807 // This is the case where the vector length is a power of two, so it was split evenly.
1808 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1809 } else {
1810 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1811 DAG.getVectorIdxConstant(0, SL));
1812 Join = DAG.getNode(
1814 VT, Join, HiLoad,
1816 }
1817
1818 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1819 LoLoad.getValue(1), HiLoad.getValue(1))};
1820
1821 return DAG.getMergeValues(Ops, SL);
1822}
1823
1825 SelectionDAG &DAG) const {
1826 LoadSDNode *Load = cast<LoadSDNode>(Op);
1827 EVT VT = Op.getValueType();
1828 SDValue BasePtr = Load->getBasePtr();
1829 EVT MemVT = Load->getMemoryVT();
1830 SDLoc SL(Op);
1831 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1832 Align BaseAlign = Load->getAlign();
1833 unsigned NumElements = MemVT.getVectorNumElements();
1834
1835 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1836 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1837 if (NumElements != 3 ||
1838 (BaseAlign < Align(8) &&
1839 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1840 return SplitVectorLoad(Op, DAG);
1841
1842 assert(NumElements == 3);
1843
1844 EVT WideVT =
1846 EVT WideMemVT =
1848 SDValue WideLoad = DAG.getExtLoad(
1849 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1850 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1851 return DAG.getMergeValues(
1852 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1853 DAG.getVectorIdxConstant(0, SL)),
1854 WideLoad.getValue(1)},
1855 SL);
1856}
1857
1859 SelectionDAG &DAG) const {
1860 StoreSDNode *Store = cast<StoreSDNode>(Op);
1861 SDValue Val = Store->getValue();
1862 EVT VT = Val.getValueType();
1863
1864 // If this is a 2 element vector, we really want to scalarize and not create
1865 // weird 1 element vectors.
1866 if (VT.getVectorNumElements() == 2)
1867 return scalarizeVectorStore(Store, DAG);
1868
1869 EVT MemVT = Store->getMemoryVT();
1870 SDValue Chain = Store->getChain();
1871 SDValue BasePtr = Store->getBasePtr();
1872 SDLoc SL(Op);
1873
1874 EVT LoVT, HiVT;
1875 EVT LoMemVT, HiMemVT;
1876 SDValue Lo, Hi;
1877
1878 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1879 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1880 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1881
1882 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1883
1884 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1885 Align BaseAlign = Store->getAlign();
1886 unsigned Size = LoMemVT.getStoreSize();
1887 Align HiAlign = commonAlignment(BaseAlign, Size);
1888
1889 SDValue LoStore =
1890 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1891 Store->getMemOperand()->getFlags());
1892 SDValue HiStore =
1893 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1894 HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1895
1896 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1897}
1898
1899// This is a shortcut for integer division because we have fast i32<->f32
1900// conversions, and fast f32 reciprocal instructions. The fractional part of a
1901// float is enough to accurately represent up to a 24-bit signed integer.
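// Worked example (unsigned): 1000003 / 97. fa = 1000003.0f, fb = 97.0f, and
// fq = trunc(fa * rcp(fb)) comes out as 10309.0f. The recomputed residue
// fr = |mad(-fq, fb, fa)| = 30.0f is below fb, so no +1 correction is
// applied and the pair (Div, Rem) = (10309, 30) is returned.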
1902SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1903 bool Sign) const {
1904 SDLoc DL(Op);
1905 EVT VT = Op.getValueType();
1906 SDValue LHS = Op.getOperand(0);
1907 SDValue RHS = Op.getOperand(1);
1908 MVT IntVT = MVT::i32;
1909 MVT FltVT = MVT::f32;
1910
1911 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1912 if (LHSSignBits < 9)
1913 return SDValue();
1914
1915 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1916 if (RHSSignBits < 9)
1917 return SDValue();
1918
1919 unsigned BitSize = VT.getSizeInBits();
1920 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1921 unsigned DivBits = BitSize - SignBits;
1922 if (Sign)
1923 ++DivBits;
1924
1925 unsigned ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1926 unsigned ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1927
1928 SDValue jq = DAG.getConstant(1, DL, IntVT);
1929
1930 if (Sign) {
1931 // char|short jq = ia ^ ib;
1932 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1933
1934 // jq = jq >> (bitsize - 2)
1935 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1936 DAG.getConstant(BitSize - 2, DL, VT));
1937
1938 // jq = jq | 0x1
1939 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1940 }
1941
1942 // int ia = (int)LHS;
1943 SDValue ia = LHS;
1944
1945 // int ib = (int)RHS;
1946 SDValue ib = RHS;
1947
1948 // float fa = (float)ia;
1949 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1950
1951 // float fb = (float)ib;
1952 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1953
1954 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1955 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1956
1957 // fq = trunc(fq);
1958 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1959
1960 // float fqneg = -fq;
1961 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1962
1963 MachineFunction &MF = DAG.getMachineFunction();
1964
1965 bool UseFmadFtz = false;
1966 if (Subtarget->isGCN()) {
1967 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1968 UseFmadFtz =
1969 MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
1970 }
1971
1972 // float fr = mad(fqneg, fb, fa);
1973 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
1974 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
1975 : (unsigned)ISD::FMAD;
1976 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1977
1978 // int iq = (int)fq;
1979 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1980
1981 // fr = fabs(fr);
1982 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1983
1984 // fb = fabs(fb);
1985 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1986
1987 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1988
1989 // int cv = fr >= fb;
1990 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1991
1992 // jq = (cv ? jq : 0);
1993 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1994
1995 // dst = iq + jq;
1996 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1997
1998 // Rem needs compensation; it's easier to recompute it
1999 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
2000 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
2001
2002 // Truncate to number of bits this divide really is.
2003 if (Sign) {
2004 SDValue InRegSize
2005 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
2006 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
2007 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
2008 } else {
2009 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
2010 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
2011 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
2012 }
2013
2014 return DAG.getMergeValues({ Div, Rem }, DL);
2015}
2016
2017void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
2018 SelectionDAG &DAG,
2019 SmallVectorImpl<SDValue> &Results) const {
2020 SDLoc DL(Op);
2021 EVT VT = Op.getValueType();
2022
2023 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2024
2025 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2026
2027 SDValue One = DAG.getConstant(1, DL, HalfVT);
2028 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
2029
2030 //HiLo split
2031 SDValue LHS_Lo, LHS_Hi;
2032 SDValue LHS = Op.getOperand(0);
2033 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
2034
2035 SDValue RHS_Lo, RHS_Hi;
2036 SDValue RHS = Op.getOperand(1);
2037 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2038
2039 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2040 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
2041
2042 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2043 LHS_Lo, RHS_Lo);
2044
2045 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2046 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2047
2048 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2049 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2050 return;
2051 }
2052
2053 if (isTypeLegal(MVT::i64)) {
2054 // The algorithm here is based on ideas from "Software Integer Division",
2055 // Tom Rodeheffer, August 2008.
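 // Sketch of the fast path below: build a float estimate of RHS, take its
 // reciprocal, and expand it into a 64-bit fixed-point value Rcp64 ~= 2^64 / RHS.
 // Two Newton-Raphson rounds (Add1, Add2) sharpen that estimate,
 // Mulhi3 = mulhu(LHS, Add2) gives the quotient estimate, and the C3/C6
 // compare-select chains correct the quotient and remainder by at most 2.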
2056
2057 MachineFunction &MF = DAG.getMachineFunction();
2058 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2059
2060 // Compute denominator reciprocal.
2061 unsigned FMAD =
2062 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2063 : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
2064 ? (unsigned)ISD::FMAD
2065 : (unsigned)AMDGPUISD::FMAD_FTZ;
2066
2067 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2068 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2069 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2070 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2071 Cvt_Lo);
2072 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2073 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2074 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2075 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2076 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2077 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2078 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2079 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2080 Mul1);
2081 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2082 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2083 SDValue Rcp64 = DAG.getBitcast(VT,
2084 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2085
2086 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2087 SDValue One64 = DAG.getConstant(1, DL, VT);
2088 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2089 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2090
2091 // First round of UNR (Unsigned integer Newton-Raphson).
2092 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2093 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2094 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2095 SDValue Mulhi1_Lo, Mulhi1_Hi;
2096 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2097 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2098 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2099 Mulhi1_Lo, Zero1);
2100 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2101 Mulhi1_Hi, Add1_Lo.getValue(1));
2102 SDValue Add1 = DAG.getBitcast(VT,
2103 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2104
2105 // Second round of UNR.
2106 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2107 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2108 SDValue Mulhi2_Lo, Mulhi2_Hi;
2109 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2110 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2111 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2112 Mulhi2_Lo, Zero1);
2113 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2114 Mulhi2_Hi, Add2_Lo.getValue(1));
2115 SDValue Add2 = DAG.getBitcast(VT,
2116 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2117
2118 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2119
2120 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2121
2122 SDValue Mul3_Lo, Mul3_Hi;
2123 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2124 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2125 Mul3_Lo, Zero1);
2126 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2127 Mul3_Hi, Sub1_Lo.getValue(1));
2128 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2129 SDValue Sub1 = DAG.getBitcast(VT,
2130 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2131
2132 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2133 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2134 ISD::SETUGE);
2135 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2136 ISD::SETUGE);
2137 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2138
2139 // TODO: Here and below portions of the code can be enclosed into if/endif.
2140 // Currently control flow is unconditional and we have 4 selects after
2141 // potential endif to substitute PHIs.
2142
2143 // if C3 != 0 ...
2144 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2145 RHS_Lo, Zero1);
2146 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2147 RHS_Hi, Sub1_Lo.getValue(1));
2148 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2149 Zero, Sub2_Lo.getValue(1));
2150 SDValue Sub2 = DAG.getBitcast(VT,
2151 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2152
2153 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2154
2155 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2156 ISD::SETUGE);
2157 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2158 ISD::SETUGE);
2159 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2160
2161 // if (C6 != 0)
2162 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2163
2164 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2165 RHS_Lo, Zero1);
2166 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2167 RHS_Hi, Sub2_Lo.getValue(1));
2168 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2169 Zero, Sub3_Lo.getValue(1));
2170 SDValue Sub3 = DAG.getBitcast(VT,
2171 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2172
2173 // endif C6
2174 // endif C3
2175
2176 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2177 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2178
2179 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2180 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2181
2182 Results.push_back(Div);
2183 Results.push_back(Rem);
2184
2185 return;
2186 }
2187
2188 // r600 expansion.
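 // The expansion below first resolves the high word speculatively: when
 // RHS_Hi == 0, DIV_Hi = udiv(LHS_Hi, RHS_Lo) and the running remainder
 // starts at urem(LHS_Hi, RHS_Lo); otherwise DIV_Hi is 0 and the remainder
 // starts at LHS_Hi. The loop then performs restoring long division over the
 // 32 low bits, shifting one bit of LHS_Lo into REM per iteration and setting
 // a bit of DIV_Lo whenever REM >= RHS. E.g. 7 / 3 ends with DIV_Lo = 2 and
 // REM = 1.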
2189 // Get Speculative values
2190 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2191 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2192
2193 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2194 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2195 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2196
2197 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2198 SDValue DIV_Lo = Zero;
2199
2200 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2201
2202 for (unsigned i = 0; i < halfBitWidth; ++i) {
2203 const unsigned bitPos = halfBitWidth - i - 1;
2204 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2205 // Get value of high bit
2206 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2207 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2208 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2209
2210 // Shift
2211 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2212 // Add LHS high bit
2213 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2214
2215 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2216 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2217
2218 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2219
2220 // Update REM
2221 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2222 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2223 }
2224
2225 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2226 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2227 Results.push_back(DIV);
2228 Results.push_back(REM);
2229}
2230
2231SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2232 SelectionDAG &DAG) const {
2233 SDLoc DL(Op);
2234 EVT VT = Op.getValueType();
2235
2236 if (VT == MVT::i64) {
2237 SmallVector<SDValue, 2> Results;
2238 LowerUDIVREM64(Op, DAG, Results);
2239 return DAG.getMergeValues(Results, DL);
2240 }
2241
2242 if (VT == MVT::i32) {
2243 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2244 return Res;
2245 }
2246
2247 SDValue X = Op.getOperand(0);
2248 SDValue Y = Op.getOperand(1);
2249
2250 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2251 // algorithm used here.
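 // In short: Z starts as a fixed-point reciprocal estimate of Y
 // (AMDGPUISD::URECIP), one Newton-Raphson round refines it, and
 // Q = mulhu(X, Z) underestimates the quotient by at most 2, which the two
 // conditional add/subtract refinement steps below then correct.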
2252
2253 // Initial estimate of inv(y).
2254 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2255
2256 // One round of UNR.
2257 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2258 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2259 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2260 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2261
2262 // Quotient/remainder estimate.
2263 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2264 SDValue R =
2265 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2266
2267 // First quotient/remainder refinement.
2268 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2269 SDValue One = DAG.getConstant(1, DL, VT);
2270 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2271 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2272 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2273 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2274 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2275
2276 // Second quotient/remainder refinement.
2277 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2278 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2279 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2280 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2281 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2282
2283 return DAG.getMergeValues({Q, R}, DL);
2284}
2285
2286SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2287 SelectionDAG &DAG) const {
2288 SDLoc DL(Op);
2289 EVT VT = Op.getValueType();
2290
2291 SDValue LHS = Op.getOperand(0);
2292 SDValue RHS = Op.getOperand(1);
2293
2294 SDValue Zero = DAG.getConstant(0, DL, VT);
2295 SDValue NegOne = DAG.getConstant(-1, DL, VT);
2296
2297 if (VT == MVT::i32) {
2298 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2299 return Res;
2300 }
2301
2302 if (VT == MVT::i64 &&
2303 DAG.ComputeNumSignBits(LHS) > 32 &&
2304 DAG.ComputeNumSignBits(RHS) > 32) {
2305 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2306
2307 //HiLo split
2308 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2309 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2310 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2311 LHS_Lo, RHS_Lo);
2312 SDValue Res[2] = {
2313 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2314 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2315 };
2316 return DAG.getMergeValues(Res, DL);
2317 }
2318
2319 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2320 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2321 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2322 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2323
2324 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2325 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2326
2327 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2328 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2329
2330 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2331 SDValue Rem = Div.getValue(1);
2332
2333 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2334 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2335
2336 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2337 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2338
2339 SDValue Res[2] = {
2340 Div,
2341 Rem
2342 };
2343 return DAG.getMergeValues(Res, DL);
2344}
2345
2346// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
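// For example, frem(5.5, 2.0): fdiv gives 2.75, ftrunc gives 2.0, and
// fma(-2.0, 2.0, 5.5) returns the remainder 1.5.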
2347SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2348 SDLoc SL(Op);
2349 EVT VT = Op.getValueType();
2350 auto Flags = Op->getFlags();
2351 SDValue X = Op.getOperand(0);
2352 SDValue Y = Op.getOperand(1);
2353
2354 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2355 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2356 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2357 // TODO: For f32 use FMAD instead if !hasFastFMA32?
2358 return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2359}
2360
2361SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2362 SDLoc SL(Op);
2363 SDValue Src = Op.getOperand(0);
2364
2365 // result = trunc(src)
2366 // if (src > 0.0 && src != result)
2367 // result += 1.0
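 // E.g. src = 2.3: trunc = 2.0, the condition holds and the result is 3.0;
 // src = -2.3: trunc = -2.0, the condition fails and -2.0 is returned.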
2368
2369 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2370
2371 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2372 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2373
2374 EVT SetCCVT =
2375 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2376
2377 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2378 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2379 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2380
2381 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2382 // TODO: Should this propagate fast-math-flags?
2383 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2384}
2385
2386static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2387 SelectionDAG &DAG) {
2388 const unsigned FractBits = 52;
2389 const unsigned ExpBits = 11;
2390
2391 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2392 Hi,
2393 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2394 DAG.getConstant(ExpBits, SL, MVT::i32));
2395 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2396 DAG.getConstant(1023, SL, MVT::i32));
2397
2398 return Exp;
2399}
2400
2401SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2402 SDLoc SL(Op);
2403 SDValue Src = Op.getOperand(0);
2404
2405 assert(Op.getValueType() == MVT::f64);
2406
2407 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2408
2409 // Extract the upper half, since this is where we will find the sign and
2410 // exponent.
2411 SDValue Hi = getHiHalf64(Src, DAG);
2412
2413 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2414
2415 const unsigned FractBits = 52;
2416
2417 // Extract the sign bit.
2418 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2419 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2420
2421 // Extend back to 64-bits.
2422 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2423 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2424
2425 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2426 const SDValue FractMask
2427 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2428
2429 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2430 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2431 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2432
2433 EVT SetCCVT =
2434 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2435
2436 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2437
2438 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2439 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2440
2441 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2442 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2443
2444 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2445}
2446
2447SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2448 SelectionDAG &DAG) const {
2449 SDLoc SL(Op);
2450 SDValue Src = Op.getOperand(0);
2451
2452 assert(Op.getValueType() == MVT::f64);
2453
2454 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2455 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2456 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2457
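 // Adding and then subtracting copysign(2^52, Src) forces the FPU to round
 // Src to an integer in the default round-to-nearest-even mode, since f64
 // carries a 52-bit fraction. Inputs with |Src| > 0x1.fffffffffffffp+51 are
 // already integral and are passed through unchanged by the select below.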
2458 // TODO: Should this propagate fast-math-flags?
2459
2460 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2461 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2462
2463 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2464
2465 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2466 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2467
2468 EVT SetCCVT =
2469 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2470 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2471
2472 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2473}
2474
2475SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
2476 SelectionDAG &DAG) const {
2477 // FNEARBYINT and FRINT are the same, except in their handling of FP
2478 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2479 // rint, so just treat them as equivalent.
2480 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2481 Op.getOperand(0));
2482}
2483
2484SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2485 auto VT = Op.getValueType();
2486 auto Arg = Op.getOperand(0u);
2487 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2488}
2489
2490// XXX - May require not supporting f32 denormals?
2491
2492// Don't handle v2f16. The extra instructions to scalarize and repack around the
2493// compare and vselect end up producing worse code than scalarizing the whole
2494// operation.
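// This implements "round half away from zero": T = trunc(X), and 1.0 with
// the sign of X is added when |X - T| >= 0.5. E.g. X = 2.5 rounds to 3.0,
// X = -2.5 to -3.0, while X = 2.4 stays at 2.0.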
2495SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2496 SDLoc SL(Op);
2497 SDValue X = Op.getOperand(0);
2498 EVT VT = Op.getValueType();
2499
2500 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2501
2502 // TODO: Should this propagate fast-math-flags?
2503
2504 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2505
2506 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2507
2508 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2509 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2510
2511 EVT SetCCVT =
2512 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2513
2514 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2515 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2516 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2517
2518 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2519 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2520}
2521
2522SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2523 SDLoc SL(Op);
2524 SDValue Src = Op.getOperand(0);
2525
2526 // result = trunc(src);
2527 // if (src < 0.0 && src != result)
2528 // result += -1.0.
2529
2530 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2531
2532 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2533 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2534
2535 EVT SetCCVT =
2536 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2537
2538 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2539 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2540 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2541
2542 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2543 // TODO: Should this propagate fast-math-flags?
2544 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2545}
2546
2547/// Return true if it's known that \p Src can never be an f32 denormal value.
2548static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2549 switch (Src.getOpcode()) {
2550 case ISD::FP_EXTEND:
2551 return Src.getOperand(0).getValueType() == MVT::f16;
2552 case ISD::FP16_TO_FP:
2553 case ISD::FFREXP:
2554 return true;
2555 case ISD::INTRINSIC_WO_CHAIN: {
2556 unsigned IntrinsicID = Src.getConstantOperandVal(0);
2557 switch (IntrinsicID) {
2558 case Intrinsic::amdgcn_frexp_mant:
2559 return true;
2560 default:
2561 return false;
2562 }
2563 }
2564 default:
2565 return false;
2566 }
2567
2568 llvm_unreachable("covered opcode switch");
2569}
2570
2571static bool allowApproxFunc(const SelectionDAG &DAG,
2572 SDNodeFlags Flags) {
2573 if (Flags.hasApproximateFuncs())
2574 return true;
2575 auto &Options = DAG.getTarget().Options;
2576 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
2577}
2578
2579static bool needsDenormHandlingF32(const SelectionDAG &DAG,
2580 SDValue Src,
2581 SDNodeFlags Flags) {
2582 return !valueIsKnownNeverF32Denorm(Src) &&
2583 DAG.getMachineFunction()
2584 .getDenormalMode(APFloat::IEEEsingle())
2585 .Input != DenormalMode::PreserveSign;
2586}
2587
2588SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2589 SDValue Src,
2590 SDNodeFlags Flags) const {
2591 SDLoc SL(Src);
2592 EVT VT = Src.getValueType();
2593 const fltSemantics &Semantics = VT.getFltSemantics();
2594 SDValue SmallestNormal =
2595 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2596
2597 // Want to scale denormals up, but negatives and 0 work just as well on the
2598 // scaled path.
2599 SDValue IsLtSmallestNormal = DAG.getSetCC(
2600 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2601 SmallestNormal, ISD::SETOLT);
2602
2603 return IsLtSmallestNormal;
2604}
2605
2606SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
2607 SDNodeFlags Flags) const {
2608 SDLoc SL(Src);
2609 EVT VT = Src.getValueType();
2610 const fltSemantics &Semantics = VT.getFltSemantics();
2611 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2612
2613 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2614 SDValue IsFinite = DAG.getSetCC(
2615 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2616 Inf, ISD::SETOLT);
2617 return IsFinite;
2618}
2619
2620/// If denormal handling is required return the scaled input to FLOG2, and the
2621/// check for denormal range. Otherwise, return null values.
2622std::pair<SDValue, SDValue>
2623AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc SL,
2624 SDValue Src, SDNodeFlags Flags) const {
2625 if (!needsDenormHandlingF32(DAG, Src, Flags))
2626 return {};
2627
2628 MVT VT = MVT::f32;
2629 const fltSemantics &Semantics = APFloat::IEEEsingle();
2630 SDValue SmallestNormal =
2631 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2632
2633 SDValue IsLtSmallestNormal = DAG.getSetCC(
2634 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2635 SmallestNormal, ISD::SETOLT);
2636
2637 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2638 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2639 SDValue ScaleFactor =
2640 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2641
2642 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2643 return {ScaledInput, IsLtSmallestNormal};
2644}
2645
2646SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
2647 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2648 // If we have to handle denormals, scale up the input and adjust the result.
2649
2650 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2651 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
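 // This works because log2(x * 2^32) = log2(x) + 32: the scaled input is in
 // the normal range, and subtracting 32.0 afterwards recovers log2 of the
 // original denormal value.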
2652
2653 SDLoc SL(Op);
2654 EVT VT = Op.getValueType();
2655 SDValue Src = Op.getOperand(0);
2656 SDNodeFlags Flags = Op->getFlags();
2657
2658 if (VT == MVT::f16) {
2659 // Nothing in half is a denormal when promoted to f32.
2660 assert(!Subtarget->has16BitInsts());
2661 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2662 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2663 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2664 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2665 }
2666
2667 auto [ScaledInput, IsLtSmallestNormal] =
2668 getScaledLogInput(DAG, SL, Src, Flags);
2669 if (!ScaledInput)
2670 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2671
2672 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2673
2674 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2675 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2676 SDValue ResultOffset =
2677 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2678 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2679}
2680
2681static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2682 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2683 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2684 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2685}
2686
2687SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
2688 SelectionDAG &DAG) const {
2689 SDValue X = Op.getOperand(0);
2690 EVT VT = Op.getValueType();
2691 SDNodeFlags Flags = Op->getFlags();
2692 SDLoc DL(Op);
2693
2694 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2695 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2696
2697 const auto &Options = getTargetMachine().Options;
2698 if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
2699 Options.ApproxFuncFPMath || Options.UnsafeFPMath) {
2700
2701 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2702 // Log and multiply in f32 is good enough for f16.
2703 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2704 }
2705
2706 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2707 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2708 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2709 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2710 }
2711
2712 return Lowered;
2713 }
2714
2715 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
2716 if (ScaledInput)
2717 X = ScaledInput;
2718
2719 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2720
2721 SDValue R;
2722 if (Subtarget->hasFastFMAF32()) {
2723 // c+cc are ln(2)/ln(10) to more than 49 bits
2724 const float c_log10 = 0x1.344134p-2f;
2725 const float cc_log10 = 0x1.09f79ep-26f;
2726
2727 // c + cc is ln(2) to more than 49 bits
2728 const float c_log = 0x1.62e42ep-1f;
2729 const float cc_log = 0x1.efa39ep-25f;
2730
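 // The FMA chain below evaluates Y * (c + cc) in roughly double-single
 // precision: FMA0 = fma(Y, C, -R) recovers the rounding error of the
 // leading multiply exactly, FMA1 folds in the low-order constant, and the
 // final add recombines the two halves.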
2731 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2732 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2733
2734 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2735 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2736 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2737 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2738 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2739 } else {
2740 // ch+ct is ln(2)/ln(10) to more than 36 bits
2741 const float ch_log10 = 0x1.344000p-2f;
2742 const float ct_log10 = 0x1.3509f6p-18f;
2743
2744 // ch + ct is ln(2) to more than 36 bits
2745 const float ch_log = 0x1.62e000p-1f;
2746 const float ct_log = 0x1.0bfbe8p-15f;
2747
2748 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2749 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2750
2751 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2752 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2753 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2754 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2755 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2756
2757 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2758 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2759 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
2760 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2761 }
2762
2763 const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
2764 (Flags.hasNoInfs() || Options.NoInfsFPMath);
2765
2766 // TODO: Check if known finite from source value.
2767 if (!IsFiniteOnly) {
2768 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2769 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2770 }
2771
2772 if (IsScaled) {
2773 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2774 SDValue ShiftK =
2775 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2776 SDValue Shift =
2777 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2778 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2779 }
2780
2781 return R;
2782}
2783
2784SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const {
2785 return LowerFLOGCommon(Op, DAG);
2786}
2787
2788// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a
2789// promote f16 operation.
2790SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
2791 SelectionDAG &DAG, bool IsLog10,
2792 SDNodeFlags Flags) const {
2793 EVT VT = Src.getValueType();
2794 unsigned LogOp =
2795 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2796
2797 double Log2BaseInverted =
2798 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
2799
2800 if (VT == MVT::f32) {
2801 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2802 if (ScaledInput) {
2803 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2804 SDValue ScaledResultOffset =
2805 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2806
2807 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2808
2809 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2810 ScaledResultOffset, Zero, Flags);
2811
2812 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2813
2814 if (Subtarget->hasFastFMAF32())
2815 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2816 Flags);
2817 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
2818 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2819 }
2820 }
2821
2822 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2823 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2824
2825 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2826 Flags);
2827}
2828
2829SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
2830 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2831 // If we have to handle denormals, scale up the input and adjust the result.
2832
2833 SDLoc SL(Op);
2834 EVT VT = Op.getValueType();
2835 SDValue Src = Op.getOperand(0);
2836 SDNodeFlags Flags = Op->getFlags();
2837
2838 if (VT == MVT::f16) {
2839 // Nothing in half is a denormal when promoted to f32.
2840 assert(!Subtarget->has16BitInsts());
2841 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2842 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
2843 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2844 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2845 }
2846
2847 assert(VT == MVT::f32);
2848
2849 if (!needsDenormHandlingF32(DAG, Src, Flags))
2850 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
2851
2852 // bool needs_scaling = x < -0x1.f80000p+6f;
2853 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
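 // Rationale: exp2(x + 64) * 2^-64 == exp2(x). Adding 64 lifts arguments
 // below -0x1.f80000p+6f (-126.0, where exp2 would drop into the denormal
 // range) back into the range v_exp_f32 handles accurately, and the final
 // multiply by 2^-64 undoes the shift.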
2854
2855 // -nextafter(128.0, -1)
2856 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
2857
2858 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2859
2860 SDValue NeedsScaling =
2861 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
2862
2863 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2864 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2865
2866 SDValue AddOffset =
2867 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
2868
2869 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
2870 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
2871
2872 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
2873 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2874 SDValue ResultScale =
2875 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
2876
2877 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
2878}
2879
2880SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
2881 SelectionDAG &DAG,
2882 SDNodeFlags Flags) const {
2883 EVT VT = X.getValueType();
2884 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
2885
2886 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2887 // exp2(M_LOG2E_F * f);
2888 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags);
2889 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2890 : (unsigned)ISD::FEXP2,
2891 SL, VT, Mul, Flags);
2892 }
2893
2894 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2895
2896 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
2897 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2898
2899 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2900
2901 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2902
2903 SDValue AdjustedX =
2904 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2905
2906 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
2907
2908 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
2909
2910 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
2911 SDValue AdjustedResult =
2912 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
2913
2914 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
2915 Flags);
2916}
2917
2918/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
2919/// handled correctly.
2920SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
2921 SelectionDAG &DAG,
2922 SDNodeFlags Flags) const {
2923 const EVT VT = X.getValueType();
2924 const unsigned Exp2Op = VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2;
2925
2926 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2927 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
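 // K0 + K1 together approximate log2(10) ~= 3.321928, with K0 exactly
 // representable on a short mantissa, so
 // exp2(x*K0) * exp2(x*K1) == exp2(x*(K0+K1)) ~= 10^x.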
2928 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
2929 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
2930
2931 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
2932 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
2933 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
2934 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
2935 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
2936 }
2937
2938 // bool s = x < -0x1.2f7030p+5f;
2939 // x += s ? 0x1.0p+5f : 0.0f;
2940 // exp10 = exp2(x * 0x1.a92000p+1f) *
2941 // exp2(x * 0x1.4f0978p-11f) *
2942 // (s ? 0x1.9f623ep-107f : 1.0f);
2943
2944 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2945
2946 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
2947 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2948
2949 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
2950 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2951 SDValue AdjustedX =
2952 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2953
2954 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
2955 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
2956
2957 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
2958 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
2959 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
2960 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
2961
2962 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
2963
2964 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
2965 SDValue AdjustedResult =
2966 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
2967
2968 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
2969 Flags);
2970}
2971
2972SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2973 EVT VT = Op.getValueType();
2974 SDLoc SL(Op);
2975 SDValue X = Op.getOperand(0);
2976 SDNodeFlags Flags = Op->getFlags();
2977 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
2978
2979 if (VT.getScalarType() == MVT::f16) {
2980 // v_exp_f16 (fmul x, log2e)
2981 if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
2982 return lowerFEXPUnsafe(X, SL, DAG, Flags);
2983
2984 if (VT.isVector())
2985 return SDValue();
2986
2987 // exp(f16 x) ->
2988 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
2989
2990 // Nothing in half is a denormal when promoted to f32.
2991 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
2992 SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags);
2993 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
2994 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2995 }
2996
2997 assert(VT == MVT::f32);
2998
2999 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3000 // library behavior. Also, is known-not-daz source sufficient?
3001 if (allowApproxFunc(DAG, Flags)) {
3002 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
3003 : lowerFEXPUnsafe(X, SL, DAG, Flags);
3004 }
3005
3006 // Algorithm:
3007 //
3008 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3009 //
3010 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3011 // n = 64*m + j, 0 <= j < 64
3012 //
3013 // e^x = 2^((64*m + j + f)/64)
3014 // = (2^m) * (2^(j/64)) * 2^(f/64)
3015 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3016 //
3017 // f = x*(64/ln(2)) - n
3018 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3019 //
3020 // e^x = (2^m) * (2^(j/64)) * e^r
3021 //
3022 // (2^(j/64)) is precomputed
3023 //
3024 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3025 // e^r = 1 + q
3026 //
3027 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3028 //
3029 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3030 SDNodeFlags FlagsNoContract = Flags;
3031 FlagsNoContract.setAllowContract(false);
3032
3033 SDValue PH, PL;
3034 if (Subtarget->hasFastFMAF32()) {
3035 const float c_exp = numbers::log2ef;
3036 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3037 const float c_exp10 = 0x1.a934f0p+1f;
3038 const float cc_exp10 = 0x1.2f346ep-24f;
3039
3040 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3041 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3042
3043 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3044 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
3045 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3046 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3047 } else {
3048 const float ch_exp = 0x1.714000p+0f;
3049 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3050
3051 const float ch_exp10 = 0x1.a92000p+1f;
3052 const float cl_exp10 = 0x1.4f0978p-11f;
3053
3054 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3055 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3056
3057 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3058 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3059 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3060 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3061 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3062
3063 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3064
3065 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3066 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3067 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3068 }
3069
3070 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3071
3072 // It is unsafe to contract this fsub into the PH multiply.
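 // Folding the subtraction into the multiply that produced PH would skip the
 // intermediate rounding of PH, re-introducing the error term that PL
 // already carries and spoiling the extended-precision argument reduction.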
3073 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3074
3075 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3076 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3077 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3078
3079 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3080
3081 SDValue UnderflowCheckConst =
3082 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
3083
3084 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3085 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3086 SDValue Underflow =
3087 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3088
3089 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3090 const auto &Options = getTargetMachine().Options;
3091
3092 if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
3093 SDValue OverflowCheckConst =
3094 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
3095 SDValue Overflow =
3096 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
3097 SDValue Inf =
3098 DAG.getConstantFP(APFloat::getInf(APFloat::IEEEsingle()), SL, VT);
3099 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3100 }
3101
3102 return R;
3103}
3104
3105static bool isCtlzOpc(unsigned Opc) {
3106 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3107}
3108
3109static bool isCttzOpc(unsigned Opc) {
3110 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3111}
3112
3113SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
3114 SelectionDAG &DAG) const {
3115 auto SL = SDLoc(Op);
3116 auto Opc = Op.getOpcode();
3117 auto Arg = Op.getOperand(0u);
3118 auto ResultVT = Op.getValueType();
3119
3120 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3121 return {};
3122
3123 assert(isCtlzOpc(Opc));
3124 assert(ResultVT == Arg.getValueType());
3125
3126 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3127 SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
3128 SDValue NewOp;
3129
3130 if (Opc == ISD::CTLZ_ZERO_UNDEF) {
3131 NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
3132 NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
3133 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3134 } else {
3135 NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3136 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3137 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
3138 }
3139
3140 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3141}
3142
3143SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
3144 SDLoc SL(Op);
3145 SDValue Src = Op.getOperand(0);
3146
3147 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3148 bool Ctlz = isCtlzOpc(Op.getOpcode());
3149 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3150
3151 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3152 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3153 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3154
3155 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3156 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3157 // (cttz hi:lo) -> (umin (ffbl src), 32)
3158 // (ctlz_zero_undef src) -> (ffbh src)
3159 // (cttz_zero_undef src) -> (ffbl src)
3160
3161 // The 64-bit scalar version produces a 32-bit result:
3162 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3163 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3164 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3165 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3166 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3167 if (!ZeroUndef) {
3168 const SDValue ConstVal = DAG.getConstant(
3169 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3170 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3171 }
3172 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3173 }
3174
3175 SDValue Lo, Hi;
3176 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3177
3178 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3179 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3180
3181 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3182 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3183 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3184 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
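 // E.g. ctlz of hi = 0, lo = 0x80000000: ffbh(hi) is all ones (the hardware
 // returns -1 when no bit is set), uaddsat(ffbh(lo), 32) = 32, and the two
 // umins yield the correct count of 32.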
3185
3186 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3187 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3188 if (Ctlz)
3189 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3190 else
3191 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3192
3193 SDValue NewOpr;
3194 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3195 if (!ZeroUndef) {
3196 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3197 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3198 }
3199
3200 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3201}
3202
3203SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
3204 bool Signed) const {
3205 // The regular method of converting a 64-bit integer to a float roughly consists of
3206 // 2 steps: normalization and rounding. In fact, after normalization, the
3207 // conversion from a 64-bit integer to a float is essentially the same as the
3208 // one from a 32-bit integer. The only difference is that it has more
3209 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3210 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3211 // converted into the correct float number. The basic steps for the unsigned
3212 // conversion are illustrated in the following pseudo code:
3213 //
3214 // f32 uitofp(i64 u) {
3215 // i32 hi, lo = split(u);
3216 // // Only count the leading zeros in hi as we have native support of the
3217 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3218 // // reduced to a 32-bit one automatically.
3219 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3220 // u <<= shamt;
3221 // hi, lo = split(u);
3222 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3223 // // convert it as a 32-bit integer and scale the result back.
3224 // return uitofp(hi) * 2^(32 - shamt);
3225 // }
3226 //
3227 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3228 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
3229 // converted instead, followed by negation based on its sign bit.
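 // Worked example: u = 2^40 + 1, so hi = 0x100 and clz(hi) = 23. Shifting u
 // left by 23 gives hi = 0x80000000, lo = 0x00800000; lo != 0 sets the
 // sticky bit, uitofp(hi | 1) rounds once as a 32-bit conversion, and
 // scaling by 2^(32 - 23) produces the correctly rounded f32 for 2^40 + 1.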
3230
3231 SDLoc SL(Op);
3232 SDValue Src = Op.getOperand(0);
3233
3234 SDValue Lo, Hi;
3235 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3236 SDValue Sign;
3237 SDValue ShAmt;
3238 if (Signed && Subtarget->isGCN()) {
3239 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3240 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3241 // account. That is, the maximal shift is
3242 // - 32 if Lo and Hi have opposite signs;
3243 // - 33 if Lo and Hi have the same sign.
3244 //
3245 // Or, MaxShAmt = 33 + OppositeSign, where
3246 //
3247 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3248 // - -1 if Lo and Hi have opposite signs; and
3249 // - 0 otherwise.
3250 //
3251 // All in all, ShAmt is calculated as
3252 //
3253 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3254 //
3255 // or
3256 //
3257 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3258 //
3259 // to reduce the critical path.
3260 SDValue OppositeSign = DAG.getNode(
3261 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3262 DAG.getConstant(31, SL, MVT::i32));
3263 SDValue MaxShAmt =
3264 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3265 OppositeSign);
3266 // Count the leading sign bits.
3267 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3268 // Different from unsigned conversion, the shift should be one bit less to
3269 // preserve the sign bit.
3270 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3271 DAG.getConstant(1, SL, MVT::i32));
3272 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3273 } else {
3274 if (Signed) {
3275 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3276 // absolute value first.
3277 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3278 DAG.getConstant(63, SL, MVT::i64));
3279 SDValue Abs =
3280 DAG.getNode(ISD::XOR, SL, MVT::i64,
3281 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3282 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3283 }
3284 // Count the leading zeros.
3285 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3286 // The shift amount for signed integers is [0, 32].
3287 }
3288 // Normalize the given 64-bit integer.
3289 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3290 // Split it again.
3291 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3292 // Calculate the adjust bit for rounding.
3293 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3294 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3295 DAG.getConstant(1, SL, MVT::i32), Lo);
3296 // Get the 32-bit normalized integer.
3297 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3298 // Convert the normalized 32-bit integer into f32.
3299 unsigned Opc =
3300 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3301 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3302
3303 // Finally, need to scale back the converted floating number as the original
3304 // 64-bit integer is converted as a 32-bit one.
3305 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3306 ShAmt);
3307 // On GCN, use LDEXP directly.
3308 if (Subtarget->isGCN())
3309 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3310
3311 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3312 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3313 // exponent is enough to avoid overflowing into the sign bit.
3314 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3315 DAG.getConstant(23, SL, MVT::i32));
3316 SDValue IVal =
3317 DAG.getNode(ISD::ADD, SL, MVT::i32,
3318 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3319 if (Signed) {
3320 // Set the sign bit.
3321 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3322 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3323 DAG.getConstant(31, SL, MVT::i32));
3324 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3325 }
3326 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3327}
3328
3329SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3330 bool Signed) const {
3331 SDLoc SL(Op);
3332 SDValue Src = Op.getOperand(0);
3333
3334 SDValue Lo, Hi;
3335 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3336
3337 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3338 SL, MVT::f64, Hi);
3339
3340 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3341
3342 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3343 DAG.getConstant(32, SL, MVT::i32));
3344 // TODO: Should this propagate fast-math-flags?
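 // Both conversions and the ldexp are exact, so the single rounding in this
 // FADD yields the correctly rounded f64 of the full 64-bit value:
 // result = (f64)Hi * 2^32 + (f64)Lo.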
3345 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3346}
3347
3348SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3349 SelectionDAG &DAG) const {
3350 // TODO: Factor out code common with LowerSINT_TO_FP.
3351 EVT DestVT = Op.getValueType();
3352 SDValue Src = Op.getOperand(0);
3353 EVT SrcVT = Src.getValueType();
3354
3355 if (SrcVT == MVT::i16) {
3356 if (DestVT == MVT::f16)
3357 return Op;
3358 SDLoc DL(Op);
3359
3360 // Promote src to i32
3361 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3362 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3363 }
3364
3365 if (DestVT == MVT::bf16) {
3366 SDLoc SL(Op);
3367 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3368 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3369 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3370 }
3371
3372 if (SrcVT != MVT::i64)
3373 return Op;
3374
3375 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3376 SDLoc DL(Op);
3377
3378 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3379 SDValue FPRoundFlag =
3380 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3381 SDValue FPRound =
3382 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3383
3384 return FPRound;
3385 }
3386
3387 if (DestVT == MVT::f32)
3388 return LowerINT_TO_FP32(Op, DAG, false);
3389
3390 assert(DestVT == MVT::f64);
3391 return LowerINT_TO_FP64(Op, DAG, false);
3392}
3393
3394SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3395 SelectionDAG &DAG) const {
3396 EVT DestVT = Op.getValueType();
3397
3398 SDValue Src = Op.getOperand(0);
3399 EVT SrcVT = Src.getValueType();
3400
3401 if (SrcVT == MVT::i16) {
3402 if (DestVT == MVT::f16)
3403 return Op;
3404
3405 SDLoc DL(Op);
3406 // Promote src to i32
3407 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3408 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3409 }
3410
3411 if (DestVT == MVT::bf16) {
3412 SDLoc SL(Op);
3413 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3414 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3415 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3416 }
3417
3418 if (SrcVT != MVT::i64)
3419 return Op;
3420
3421 // TODO: Factor out code common with LowerUINT_TO_FP.
3422
3423 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3424 SDLoc DL(Op);
3425 SDValue Src = Op.getOperand(0);
3426
3427 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3428 SDValue FPRoundFlag =
3429 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3430 SDValue FPRound =
3431 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3432
3433 return FPRound;
3434 }
3435
3436 if (DestVT == MVT::f32)
3437 return LowerINT_TO_FP32(Op, DAG, true);
3438
3439 assert(DestVT == MVT::f64);
3440 return LowerINT_TO_FP64(Op, DAG, true);
3441}
3442
3443SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3444 bool Signed) const {
3445 SDLoc SL(Op);
3446
3447 SDValue Src = Op.getOperand(0);
3448 EVT SrcVT = Src.getValueType();
3449
3450 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3451
3452 // The basic idea of converting a floating point number into a pair of 32-bit
3453 // integers is illustrated as follows:
3454 //
3455 // tf := trunc(val);
3456 // hif := floor(tf * 2^-32);
3457 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3458 // hi := fptoi(hif);
3459 // lo := fptoi(lof);
3460 //
3461 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3462 SDValue Sign;
3463 if (Signed && SrcVT == MVT::f32) {
3464 // However, a 32-bit floating point number has only a 23-bit mantissa, which
3465 // is not enough to hold all the significant bits of `lof` if val is
3466 // negative. To avoid losing precision, we need to take the absolute value
3467 // after truncating and flip the result back based on the original
3468 // signedness.
3469 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3470 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3471 DAG.getConstant(31, SL, MVT::i32));
3472 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3473 }
3474
3475 SDValue K0, K1;
3476 if (SrcVT == MVT::f64) {
3477 K0 = DAG.getConstantFP(
3478 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3479 SrcVT);
3480 K1 = DAG.getConstantFP(
3481 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3482 SrcVT);
3483 } else {
3484 K0 = DAG.getConstantFP(
3485 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3486 K1 = DAG.getConstantFP(
3487 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3488 }
3489 // TODO: Should this propagate fast-math-flags?
3490 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3491
3492 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3493
3494 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3495
3496 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3497 : ISD::FP_TO_UINT,
3498 SL, MVT::i32, FloorMul);
3499 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3500
3501 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3502 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3503
3504 if (Signed && SrcVT == MVT::f32) {
3505 assert(Sign);
3506 // Flip the result using the sign mask, which is either all 0s or all 1s.
3507 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3508 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3509 // r := xor(r, sign) - sign;
3510 Result =
3511 DAG.getNode(ISD::SUB, SL, MVT::i64,
3512 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3513 }
3514
3515 return Result;
3516}
3517
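// Illustrative sketch (not part of this file): a scalar model of the two-part
// fp-to-64-bit-integer expansion in LowerFP_TO_INT64 above, using the same
// 2^-32 and -2^32 constants, shown for an in-range, non-negative f64 input.
// The helper name is hypothetical.
#include <cmath>
#include <cstdint>

[[maybe_unused]] static uint64_t referenceFPToUI64(double Val) {
  double Tf = std::trunc(Val);               // tf  := trunc(val)
  double Hif = std::floor(Tf * 0x1.0p-32);   // hif := floor(tf * 2^-32)
  double Lof = std::fma(Hif, -0x1.0p32, Tf); // lof := tf - hif * 2^32 (the FMA above)
  uint64_t Hi = static_cast<uint64_t>(Hif);
  uint64_t Lo = static_cast<uint64_t>(Lof);
  return (Hi << 32) | Lo;
}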
3518SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3519 SDLoc DL(Op);
3520 SDValue N0 = Op.getOperand(0);
3521
3522 // Convert to target node to get known bits
3523 if (N0.getValueType() == MVT::f32)
3524 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3525
3526 if (getTargetMachine().Options.UnsafeFPMath) {
3527 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3528 return SDValue();
3529 }
3530
3531 assert(N0.getSimpleValueType() == MVT::f64);
3532
3533 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3534 const unsigned ExpMask = 0x7ff;
3535 const unsigned ExpBiasf64 = 1023;
3536 const unsigned ExpBiasf16 = 15;
3537 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3538 SDValue One = DAG.getConstant(1, DL, MVT::i32);
3539 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
3540 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3541 DAG.getConstant(32, DL, MVT::i64));
3542 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3543 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3544 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3545 DAG.getConstant(20, DL, MVT::i64));
3546 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3547 DAG.getConstant(ExpMask, DL, MVT::i32));
3548 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3549 // add the f16 bias (15) to get the biased exponent for the f16 format.
3550 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3551 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
3552
3553 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3554 DAG.getConstant(8, DL, MVT::i32));
3555 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3556 DAG.getConstant(0xffe, DL, MVT::i32));
3557
3558 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3559 DAG.getConstant(0x1ff, DL, MVT::i32));
3560 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3561
3562 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3563 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3564
3565 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3566 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3567 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3568 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
3569
3570 // N = M | (E << 12);
3571 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3572 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3573 DAG.getConstant(12, DL, MVT::i32)));
3574
3575 // B = clamp(1-E, 0, 13);
3576 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3577 One, E);
3578 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3579 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3580 DAG.getConstant(13, DL, MVT::i32));
3581
3582 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3583 DAG.getConstant(0x1000, DL, MVT::i32));
3584
3585 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3586 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3587 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3588 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3589
3590 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3591 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3592 DAG.getConstant(0x7, DL, MVT::i32));
3593 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3594 DAG.getConstant(2, DL, MVT::i32));
3595 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3596 One, Zero, ISD::SETEQ);
3597 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3598 One, Zero, ISD::SETGT);
3599 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3600 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3601
3602 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3603 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3604 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3605 I, V, ISD::SETEQ);
3606
3607 // Extract the sign bit.
3608 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3609 DAG.getConstant(16, DL, MVT::i32));
3610 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3611 DAG.getConstant(0x8000, DL, MVT::i32));
3612
3613 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3614 return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
3615}
3616
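// Illustrative sketch (not part of this file): the exponent re-bias performed
// in the f64 -> f16 path above, extracted as a standalone helper. For a normal
// f64 input, the candidate biased f16 exponent is E_f64 - 1023 + 15; results
// below 1 take the denormal path and results above 30 overflow to 0x7c00.
// The helper name is hypothetical.
#include <cstdint>
#include <cstring>

[[maybe_unused]] static int32_t rebiasedF16Exponent(double D) {
  uint64_t Bits;
  std::memcpy(&Bits, &D, sizeof(Bits));
  int32_t E64 = static_cast<int32_t>((Bits >> 52) & 0x7ff);
  return E64 - 1023 + 15; // e.g. 1.0 has E64 = 1023, giving a biased f16 exponent of 15
}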
3617SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
3618 SelectionDAG &DAG) const {
3619 SDValue Src = Op.getOperand(0);
3620 unsigned OpOpcode = Op.getOpcode();
3621 EVT SrcVT = Src.getValueType();
3622 EVT DestVT = Op.getValueType();
3623
3624 // Will be selected natively
3625 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3626 return Op;
3627
3628 if (SrcVT == MVT::bf16) {
3629 SDLoc DL(Op);
3630 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3631 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
3632 }
3633
3634 // Promote i16 to i32
3635 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3636 SDLoc DL(Op);
3637
3638 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3639 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3640 }
3641
3642 if (DestVT != MVT::i64)
3643 return Op;
3644
3645 if (SrcVT == MVT::f16 ||
3646 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3647 SDLoc DL(Op);
3648
3649 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3650 unsigned Ext =
3651 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3652 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3653 }
3654
3655 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3656 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3657
3658 return SDValue();
3659}
3660
3661SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3662 SelectionDAG &DAG) const {
3663 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3664 MVT VT = Op.getSimpleValueType();
3665 MVT ScalarVT = VT.getScalarType();
3666
3667 assert(VT.isVector());
3668
3669 SDValue Src = Op.getOperand(0);
3670 SDLoc DL(Op);
3671
3672 // TODO: Don't scalarize on Evergreen?
3673 unsigned NElts = VT.getVectorNumElements();
3674 SmallVector<SDValue, 8> Args;
3675 DAG.ExtractVectorElements(Src, Args, 0, NElts);
3676
3677 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3678 for (unsigned I = 0; I < NElts; ++I)
3679 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
3680
3681 return DAG.getBuildVector(VT, DL, Args);
3682}
3683
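// Illustrative sketch (not part of this file): the per-element operation that
// the scalarized SIGN_EXTEND_INREG above applies, shown for an i8 value
// carried in an i32 lane. The helper name is hypothetical.
#include <cstdint>

[[maybe_unused]] static int32_t signExtendInRegI8(int32_t X) {
  // Keep the low 8 bits and replicate bit 7 into the upper bits,
  // e.g. signExtendInRegI8(0xFF) == -1.
  return static_cast<int32_t>(static_cast<int8_t>(X & 0xff));
}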
3684//===----------------------------------------------------------------------===//
3685// Custom DAG optimizations
3686//===----------------------------------------------------------------------===//
3687
3688static bool isU24(SDValue Op, SelectionDAG &DAG) {
3689 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3690}
3691
3692static bool isI24(SDValue Op, SelectionDAG &DAG) {
3693 EVT VT = Op.getValueType();
3694 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3695 // as unsigned 24-bit values.
3696 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3697}
3698
3699SDValue AMDGPUTargetLowering::simplifyMul24(SDNode *Node24,
3700 TargetLowering::DAGCombinerInfo &DCI) const {
3701 SelectionDAG &DAG = DCI.DAG;
3702 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3703 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3704
3705 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
3706 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
3707 unsigned NewOpcode = Node24->getOpcode();
3708 if (IsIntrin) {
3709 unsigned IID = Node24->getConstantOperandVal(0);
3710 switch (IID) {
3711 case Intrinsic::amdgcn_mul_i24:
3712 NewOpcode = AMDGPUISD::MUL_I24;
3713 break;
3714 case Intrinsic::amdgcn_mul_u24:
3715 NewOpcode = AMDGPUISD::MUL_U24;
3716 break;
3717 case Intrinsic::amdgcn_mulhi_i24:
3718 NewOpcode = AMDGPUISD::MULHI_I24;
3719 break;
3720 case Intrinsic::amdgcn_mulhi_u24:
3721 NewOpcode = AMDGPUISD::MULHI_U24;
3722 break;
3723 default:
3724 llvm_unreachable("Expected 24-bit mul intrinsic");
3725 }
3726 }
3727
3728 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
3729
3730 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3731 // the operands to have other uses, but will only perform simplifications that
3732 // involve bypassing some nodes for this user.
3733 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
3734 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
3735 if (DemandedLHS || DemandedRHS)
3736 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
3737 DemandedLHS ? DemandedLHS : LHS,
3738 DemandedRHS ? DemandedRHS : RHS);
3739
3740 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3741 // operands if this node is the only user.
3742 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
3743 return SDValue(Node24, 0);
3744 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
3745 return SDValue(Node24, 0);
3746
3747 return SDValue();
3748}
3749
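// Illustrative sketch (not part of this file): why simplifyMul24 above demands
// only the low 24 bits of each operand. A 24-bit multiply such as MUL_U24
// reads just those bits, so masking them off is a no-op; a scalar equivalent
// is shown below. The helper name is hypothetical.
#include <cstdint>

[[maybe_unused]] static uint32_t referenceMulU24(uint32_t A, uint32_t B) {
  // Low 32 bits of the 48-bit product of the low 24 bits of each operand.
  return (A & 0xffffff) * (B & 0xffffff);
}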
3750template <typename IntTy>
3751static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3752 uint32_t Width, const SDLoc &DL) {
3753 if (Width + Offset < 32) {
3754 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3755 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3756 return DAG.getConstant(Result, DL, MVT::i32);
3757 }
3758
3759 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3760}
3761
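// Illustrative sketch (not part of this file): a worked instance of the signed
// fold performed by constantFoldBFE above. Extracting Width = 5 bits at
// Offset = 7 from Src0 = 0x00000F80 first shifts the field to the top
// (0x00000F80 << 20 == 0xF8000000) and then shifts back arithmetically
// (>> 27), replicating the sign bit and producing -1. The helper name is
// hypothetical.
#include <cstdint>

[[maybe_unused]] static int32_t bfeI32Example() {
  uint32_t Shl = UINT32_C(0x00000F80) << (32 - 7 - 5);
  return static_cast<int32_t>(Shl) >> (32 - 5); // == -1
}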
3762static bool hasVolatileUser(SDNode *Val) {
3763 for (SDNode *U : Val->uses()) {
3764 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
3765 if (M->isVolatile())
3766 return true;
3767 }
3768 }
3769
3770 return false;
3771}
3772
3774 // i32 vectors are the canonical memory type.
3775 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3776 return false;
3777
3778 if (!VT.isByteSized())
3779 return false;
3780
3781 unsigned Size = VT.getStoreSize();
3782
3783 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3784 return false;
3785
3786 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3787 return false;
3788
3789 return true;
3790}
3791
3792// Replace a load of an illegal type with a load of a friendlier type plus a
3793// bitcast back to the original type.
3794SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3795 DAGCombinerInfo &DCI) const {
3796 if (!DCI.isBeforeLegalize())
3797 return SDValue();
3798
3799 LoadSDNode *LN = cast<LoadSDNode>(N);
3800 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
3801 return SDValue();
3802
3803 SDLoc SL(N);
3804 SelectionDAG &DAG = DCI.DAG;
3805 EVT VT = LN->getMemoryVT();
3806
3807 unsigned Size = VT.getStoreSize();
3808 Align Alignment = LN->getAlign();
3809 if (Alignment < Size && isTypeLegal(VT)) {
3810 unsigned IsFast;
3811 unsigned AS = LN->getAddressSpace();
3812
3813 // Expand unaligned loads earlier than legalization. Due to visitation order
3814 // problems during legalization, the emitted instructions to pack and unpack
3815 // the bytes again are not eliminated in the case of an unaligned copy.
3816 if (!allowsMisalignedMemoryAccesses(
3817 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3818 if (VT.isVector())
3819 return SplitVectorLoad(SDValue(LN, 0), DAG);
3820
3821 SDValue Ops[2];
3822 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3823
3824 return DAG.getMergeValues(Ops, SDLoc(N));
3825 }
3826
3827 if (!IsFast)
3828 return SDValue();
3829 }
3830
3831 if (!shouldCombineMemoryType(VT))
3832 return SDValue();
3833
3834 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3835
3836 SDValue NewLoad
3837 = DAG.getLoad(NewVT, SL, LN->getChain(),
3838 LN->getBasePtr(), LN->getMemOperand());
3839
3840 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3841 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3842 return SDValue(N, 0);
3843}
3844
3845// Replace store of an illegal type with a store of a bitcast to a friendlier
3846// type.
3847SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3848 DAGCombinerInfo &DCI) const {
3849 if (!DCI.isBeforeLegalize())
3850 return SDValue();
3851
3852 StoreSDNode *SN = cast<StoreSDNode>(N);
3853 if (!SN->isSimple() || !ISD::isNormalStore(SN))
3854 return SDValue();
3855
3856 EVT VT = SN->getMemoryVT();
3857 unsigned Size = VT.getStoreSize();
3858
3859 SDLoc SL(N);
3860 SelectionDAG &DAG = DCI.DAG;
3861 Align Alignment = SN->getAlign();
3862 if (Alignment < Size && isTypeLegal(VT)) {
3863 unsigned IsFast;
3864 unsigned AS = SN->getAddressSpace();
3865
3866 // Expand unaligned stores earlier than legalization. Due to visitation
3867 // order problems during legalization, the emitted instructions to pack and
3868 // unpack the bytes again are not eliminated in the case of an unaligned
3869 // copy.
3870 if (!allowsMisalignedMemoryAccesses(
3871 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3872 if (VT.isVector())
3873 return SplitVectorStore(SDValue(SN, 0), DAG);
3874
3875 return