//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUMachineFunction.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#include "AMDGPUGenCallingConv.inc"

35 "amdgpu-bypass-slow-div",
36 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
37 cl::init(true));
38
39// Find a larger type to do a load / store of a vector with.
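// For example, a v2i8 store (16 bits) is done as an i16 access, a v2f32 store
// (64 bits) as v2i32, while a v3i16 store (48 bits) is returned unchanged
// since 48 is not a multiple of 32.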
EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
  unsigned StoreSize = VT.getStoreSizeInBits();
  if (StoreSize <= 32)
    return EVT::getIntegerVT(Ctx, StoreSize);

  if (StoreSize % 32 == 0)
    return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);

  return VT;
}

unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
  return DAG.computeKnownBits(Op).countMaxActiveBits();
}

unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
  // In order for this to be a signed 24-bit value, bit 23 must be a sign bit.
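  // For example, 0xFFFFFF80 (-128 in i32) has 25 copies of the sign bit, so
  // ComputeMaxSignificantBits reports 8 and the value comfortably fits in a
  // signed 24-bit operand.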
  return DAG.ComputeMaxSignificantBits(Op);
}

AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                                           const AMDGPUSubtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy or memmove.
  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = ~0U;
  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = ~0U;
  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = ~0U;

  // Enable ganging up loads and stores in the memcpy DAG lowering.
  MaxGluedStoresPerMemcpy = 16;

  // Lower floating point store/load to integer store/load to reduce the number
  // of patterns in tablegen.
  setOperationAction(ISD::LOAD, MVT::f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);

  setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v9f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);

  setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);

  setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);

  setOperationAction(ISD::LOAD, MVT::i128, Promote);
  AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);

  // TODO: Would be better to consume as directly legal
  setOperationAction(ISD::ATOMIC_LOAD, MVT::f32, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);

  setOperationAction(ISD::ATOMIC_LOAD, MVT::f64, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);

  setOperationAction(ISD::ATOMIC_LOAD, MVT::f16, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);

  setOperationAction(ISD::ATOMIC_LOAD, MVT::bf16, Promote);
  AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);

  setOperationAction(ISD::ATOMIC_STORE, MVT::f32, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);

  setOperationAction(ISD::ATOMIC_STORE, MVT::f64, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);

  setOperationAction(ISD::ATOMIC_STORE, MVT::f16, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);

  setOperationAction(ISD::ATOMIC_STORE, MVT::bf16, Promote);
  AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);

  // There are no 64-bit extloads. These should be done as a 32-bit extload and
  // an extension to 64-bit.
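  // e.g. a zextload from i16 to i64 is expanded into a 32-bit zextload from
  // i16 followed by a zero-extension of the result to i64.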
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::i64, VT,
                     Expand);

  for (MVT VT : MVT::integer_valuetypes()) {
    if (VT == MVT::i64)
      continue;

    for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
      setLoadExtAction(Op, VT, MVT::i1, Promote);
      setLoadExtAction(Op, VT, MVT::i8, Legal);
      setLoadExtAction(Op, VT, MVT::i16, Legal);
      setLoadExtAction(Op, VT, MVT::i32, Expand);
    }
  }

  for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
    for (auto MemVT :
         {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
      setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MemVT,
                       Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);

  setOperationAction(ISD::STORE, MVT::f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);

  setOperationAction(ISD::STORE, MVT::v2f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v3f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::STORE, MVT::v4f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v5f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::STORE, MVT::v6f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v7f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::STORE, MVT::v8f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v9f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::STORE, MVT::v10f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::STORE, MVT::v11f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::STORE, MVT::v12f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);

  setOperationAction(ISD::STORE, MVT::v16f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v32f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v2i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v2f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v3i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v3f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);

  setOperationAction(ISD::STORE, MVT::v4i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v4f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v8i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v8f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::v16i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::v16f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);

  setOperationAction(ISD::STORE, MVT::i128, Promote);
  AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);

  setTruncStoreAction(MVT::i64, MVT::i1, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);

  setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);

  setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
  setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
  setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
  setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
  setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
  setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
  setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
  setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
  setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
  setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
  setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
  setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);

  setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
  setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
  setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);

  setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);

  setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
  setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
  setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);

  setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
  setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
  setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);

  setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
  setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
  setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);

  setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
  setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
  setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);

  setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
  setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);

  setOperationAction(ISD::BRIND, MVT::Other, Expand);

  // For R600, this is totally unsupported, just custom lower to produce an
  // error.
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);

  // Library functions. These default to Expand, but we have instructions
  // for them.
  setOperationAction({ISD::FCEIL, ISD::POW, ISD::FABS, ISD::FFLOOR,
                      ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM},
                     MVT::f32, Legal);

  setOperationAction(ISD::FLOG2, MVT::f32, Custom);
  setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
  setOperationAction({ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP10},
                     {MVT::f16, MVT::f32, MVT::f64}, Expand);

  setOperationAction({ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2,
                      ISD::FEXP10},
                     MVT::f32, Custom);

  setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
                     Expand);

  setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);

  if (Subtarget->has16BitInsts())
    setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
  else {
    setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
    setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);
  }

  setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, MVT::f16,
                     Custom);

  // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
  // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
  // default unless marked custom/legal.
  setOperationAction(
      ISD::IS_FPCLASS,
      {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16, MVT::v2f32, MVT::v3f32,
       MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
       MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64, MVT::v16f64},
      Custom);

  // Expand to fneg + fadd.
  setOperationAction(ISD::FSUB, MVT::f64, Expand);

  setOperationAction(ISD::EXTRACT_SUBVECTOR,
                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
                      MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
                     Custom);

  // FIXME: Why is v8f16/v8bf16 missing?
  setOperationAction(
      ISD::CONCAT_VECTORS,
      {MVT::v2f16, MVT::v2bf16, MVT::v2i16, MVT::v4f16, MVT::v4bf16,
       MVT::v4i16, MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32,
       MVT::v4f32, MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32,
       MVT::v6i32, MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32,
       MVT::v9f32, MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32,
       MVT::v11f32, MVT::v12i32, MVT::v12f32, MVT::v16f16, MVT::v16bf16,
       MVT::v16i16, MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
       MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
       MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64,
       MVT::v32i16, MVT::v32f16, MVT::v32bf16},
      Custom);

  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    // These should use [SU]DIVREM, so set them to expand
    setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
                       Expand);

    // GPU does not have divrem function for signed or unsigned.
    setOperationAction({ISD::SDIVREM, ISD::UDIVREM}, VT, Custom);

    // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
    setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Expand);

    setOperationAction(ISD::BSWAP, VT, Expand);

    // AMDGPU uses ADDC/SUBC/ADDE/SUBE
    setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal);
  }

  // The hardware supports 32-bit FSHR, but not FSHL.
  setOperationAction(ISD::FSHR, MVT::i32, Legal);

  // The hardware supports 32-bit ROTR, but not ROTL.
  setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
  setOperationAction(ISD::ROTR, MVT::i64, Expand);

  setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand);

  setOperationAction({ISD::UINT_TO_FP, ISD::SINT_TO_FP,
                      ISD::FP_TO_SINT, ISD::FP_TO_UINT},
                     MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);

  setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32,
                     Legal);

  setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF,
                      ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
                     MVT::i64, Custom);

  for (auto VT : {MVT::i8, MVT::i16})
    setOperationAction({ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ_ZERO_UNDEF}, VT, Custom);

  static const MVT::SimpleValueType VectorIntTypes[] = {
      MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
      MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};

  for (MVT VT : VectorIntTypes) {
    // Expand the following operations for the current type by default.
    setOperationAction({ISD::ADD,        ISD::AND,     ISD::FP_TO_SINT,
                        ISD::FP_TO_UINT, ISD::MUL,     ISD::MULHU,
                        ISD::MULHS,      ISD::OR,      ISD::SHL,
                        ISD::SRA,        ISD::SRL,     ISD::ROTL,
                        ISD::ROTR,       ISD::SUB,     ISD::SINT_TO_FP,
                        ISD::UINT_TO_FP, ISD::SDIV,    ISD::UDIV,
                        ISD::SREM,       ISD::UREM,    ISD::SMUL_LOHI,
                        ISD::UMUL_LOHI,  ISD::SDIVREM, ISD::UDIVREM,
                        ISD::SELECT,     ISD::VSELECT, ISD::SELECT_CC,
                        ISD::XOR,        ISD::BSWAP,   ISD::CTPOP,
                        ISD::CTTZ,       ISD::CTLZ,    ISD::FNEG,
                        ISD::SETCC},
                       VT, Expand);
  }

  static const MVT::SimpleValueType FloatVectorTypes[] = {
      MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
      MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};

  for (MVT VT : FloatVectorTypes) {
    setOperationAction({ISD::FABS,      ISD::FMINNUM,     ISD::FMAXNUM,
                        ISD::FADD,      ISD::FCEIL,       ISD::FCOS,
                        ISD::FDIV,      ISD::FEXP2,       ISD::FEXP,
                        ISD::FEXP10,    ISD::FLOG2,       ISD::FREM,
                        ISD::FLOG,      ISD::FLOG10,      ISD::FPOW,
                        ISD::FFLOOR,    ISD::FTRUNC,      ISD::FMUL,
                        ISD::FMA,       ISD::FRINT,       ISD::FNEARBYINT,
                        ISD::FSQRT,     ISD::FSIN,        ISD::FSUB,
                        ISD::FNEG,      ISD::VSELECT,     ISD::SELECT_CC,
                        ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE,
                        ISD::SETCC,     ISD::FCANONICALIZE,
                        ISD::FROUNDEVEN},
                       VT, Expand);
  }

  // This causes using an unrolled select operation rather than expansion with
  // bit operations. This is in general better, but the alternative using BFI
  // instructions may be better if the select sources are SGPRs.
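  // e.g. a select between two v2f32 values is performed as a v2i32 select on
  // the bitcast operands and then unrolled into two 32-bit selects.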
  setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);

  setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);

  setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);

  setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);

  setOperationAction(ISD::SELECT, MVT::v9f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);

  setOperationAction(ISD::SELECT, MVT::v10f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);

  setOperationAction(ISD::SELECT, MVT::v11f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);

  setOperationAction(ISD::SELECT, MVT::v12f32, Promote);
  AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);

  setJumpIsExpensive(true);

  // FIXME: This is only partially true. If we have to do vector compares, any
  // SGPR pair can be a condition register. If we have a uniform condition, we
  // are better off doing SALU operations, where there is only one SCC. For now,
  // we don't have a way of knowing during instruction selection if a condition
  // will be uniform and we always use vector compares. Assume we are using
  // vector compares until that is fixed.
  setHasMultipleConditionRegisters(true);

  setMinCmpXchgSizeInBits(32);
  setSupportsUnalignedAtomics(false);

  PredictableSelectIsExpensive = false;

  // We want to find all load dependencies for long chains of stores to enable
  // merging into very wide vectors. The problem is with vectors with > 4
  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
  // vectors are a legal type, even though we have to split the loads
  // usually. When we can more precisely specify load legality per address
  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
  // smarter so that they can figure out what to do in 2 iterations without all
  // N > 4 stores on the same chain.
  GatherAllAliasesMaxDepth = 16;

  // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
  // about these during lowering.
  MaxStoresPerMemcpy = 0xffffffff;
  MaxStoresPerMemmove = 0xffffffff;
  MaxStoresPerMemset = 0xffffffff;

  // The expansion for 64-bit division is enormous.
  if (AMDGPUBypassSlowDiv)
    addBypassSlowDiv(64, 32);

  setTargetDAGCombine({ISD::BITCAST,    ISD::SHL,
                       ISD::SRA,        ISD::SRL,
                       ISD::TRUNCATE,   ISD::MUL,
                       ISD::SMUL_LOHI,  ISD::UMUL_LOHI,
                       ISD::MULHU,      ISD::MULHS,
                       ISD::SELECT,     ISD::SELECT_CC,
                       ISD::STORE,      ISD::FADD,
                       ISD::FSUB,       ISD::FNEG,
                       ISD::FABS,       ISD::AssertZext,
                       ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});

  setMaxAtomicSizeInBitsSupported(64);
  setMaxDivRemBitWidthSupported(64);
  setMaxLargeFPConvertBitWidthSupported(64);
}

bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
  if (getTargetMachine().Options.NoSignedZerosFPMath)
    return true;

  const auto Flags = Op.getNode()->getFlags();
  if (Flags.hasNoSignedZeros())
    return true;

  return false;
}

//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//

LLVM_READNONE
static bool fnegFoldsIntoOpcode(unsigned Opc) {
  switch (Opc) {
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case ISD::SELECT:
  case ISD::FSIN:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUNDEVEN:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMED3:
    // TODO: handle llvm.amdgcn.fma.legacy
    return true;
  case ISD::BITCAST:
    llvm_unreachable("bitcast is special cased");
  default:
    return false;
  }
}

static bool fnegFoldsIntoOp(const SDNode *N) {
  unsigned Opc = N->getOpcode();
  if (Opc == ISD::BITCAST) {
    // TODO: Is there a benefit to checking the conditions performFNegCombine
    // does? We don't for the other cases.
    SDValue BCSrc = N->getOperand(0);
    if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
      return BCSrc.getNumOperands() == 2 &&
             BCSrc.getOperand(1).getValueSizeInBits() == 32;
    }

    return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
  }

  return fnegFoldsIntoOpcode(Opc);
}

/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
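/// For example, FMA has more than two operands, and any f64 operation is
/// already 64-bit encoded, so folding a source modifier into either cannot
/// grow the code size.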
LLVM_READNONE
static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
  return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
         VT == MVT::f64;
}

/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for
/// the given type of an ISD::SELECT.
LLVM_READNONE
static bool selectSupportsSourceMods(const SDNode *N) {
  // TODO: Only applies if select will be vector
  return N->getValueType(0) == MVT::f32;
}

// Most FP instructions support source modifiers, but this could be refined
// slightly.
LLVM_READNONE
static bool hasSourceMods(const SDNode *N) {
  if (isa<MemSDNode>(N))
    return false;

  switch (N->getOpcode()) {
  case ISD::CopyToReg:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::INLINEASM:
  case ISD::INLINEASM_BR:
  case AMDGPUISD::DIV_SCALE:
    return false;

  // TODO: Should really be looking at the users of the bitcast. These are
  // problematic because bitcasts are used to legalize all stores to integer
  // types.
  case ISD::BITCAST:
    return false;
  case ISD::INTRINSIC_WO_CHAIN: {
    switch (N->getConstantOperandVal(0)) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
      return false;
    default:
      return true;
    }
  }
  case ISD::SELECT:
    return selectSupportsSourceMods(N);
  default:
    return true;
  }
}

bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
                                                 unsigned CostThreshold) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
  // it is truly free to use a source modifier in all cases. If there are
  // multiple users and each one will necessitate using VOP3, there will be
  // a code size increase. Try to avoid increasing code size unless we know it
  // will save on the instruction count.
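  // e.g. v_add_f32 is normally a 4-byte VOP2 instruction; encoding an fneg
  // source modifier on it requires the 8-byte VOP3 form, whereas a 3-operand
  // FMA is already VOP3 and takes the modifier for free.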
  unsigned NumMayIncreaseSize = 0;
  MVT VT = N->getValueType(0).getScalarType().getSimpleVT();

  assert(!N->use_empty());

  // XXX - Should this limit number of uses to check?
  for (const SDNode *U : N->uses()) {
    if (!hasSourceMods(U))
      return false;

    if (!opMustUseVOP3Encoding(U, VT)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }

  return true;
}

EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                              ISD::NodeType ExtendKind) const {
  assert(!VT.isVector() && "only scalar expected");

  // Round to the next multiple of 32-bits.
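  // e.g. an i48 extended return value becomes i64 (two 32-bit registers).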
  unsigned Size = VT.getSizeInBits();
  if (Size <= 32)
    return MVT::i32;
  return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
}

MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
  return MVT::i32;
}

bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
  return true;
}

// The backend supports 32 and 64 bit floating point immediates.
// FIXME: Why are we reporting vectors of FP immediates as legal?
bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                        bool ForCodeSize) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
}

// We don't want to shrink f64 / f32 constants.
bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
}

820
822 ISD::LoadExtType ExtTy,
823 EVT NewVT) const {
824 // TODO: This may be worth removing. Check regression tests for diffs.
826 return false;
827
828 unsigned NewSize = NewVT.getStoreSizeInBits();
829
830 // If we are reducing to a 32-bit load or a smaller multi-dword load,
831 // this is always better.
832 if (NewSize >= 32)
833 return true;
834
835 EVT OldVT = N->getValueType(0);
836 unsigned OldSize = OldVT.getStoreSizeInBits();
837
838 MemSDNode *MN = cast<MemSDNode>(N);
839 unsigned AS = MN->getAddressSpace();
840 // Do not shrink an aligned scalar load to sub-dword.
841 // Scalar engine cannot do sub-dword loads.
842 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
843 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
846 (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
847 MN->isInvariant())) &&
849 return false;
850
851 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
852 // extloads, so doing one requires using a buffer_load. In cases where we
853 // still couldn't use a scalar load, using the wider load shouldn't really
854 // hurt anything.
855
856 // If the old size already had to be an extload, there's no harm in continuing
857 // to reduce the width.
858 return (OldSize < 32);
859}
860
bool AMDGPUTargetLowering::isLoadBitCastBeneficial(
    EVT LoadTy, EVT CastTy, const SelectionDAG &DAG,
    const MachineMemOperand &MMO) const {

  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());

  if (LoadTy.getScalarType() == MVT::i32)
    return false;

  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
  unsigned CastScalarSize = CastTy.getScalarSizeInBits();

  if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
    return false;

  unsigned Fast = 0;
  return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                        CastTy, MMO, &Fast) &&
         Fast;
}

// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
// profitable with the expansion for 64-bit since it's generally good to
// speculate things.
bool AMDGPUTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
  return true;
}

bool AMDGPUTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
  return true;
}

bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
  switch (N->getOpcode()) {
  case ISD::EntryToken:
  case ISD::TokenFactor:
    return true;
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrID = N->getConstantOperandVal(0);
    return AMDGPU::isIntrinsicAlwaysUniform(IntrID);
  }
  case ISD::LOAD:
    if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
        AMDGPUAS::CONSTANT_ADDRESS_32BIT)
      return true;
    return false;
  case AMDGPUISD::SETCC: // ballot-style instruction
    return true;
  }
  return false;
}

SDValue AMDGPUTargetLowering::getNegatedExpression(
    SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
    NegatibleCost &Cost, unsigned Depth) const {

  switch (Op.getOpcode()) {
  case ISD::FMA:
  case ISD::FMAD: {
    // Negating a fma is not free if it has users without source mods.
    if (!allUsesHaveSourceMods(Op.getNode()))
      return SDValue();
    break;
  }
  case AMDGPUISD::RCP: {
    SDValue Src = Op.getOperand(0);
    EVT VT = Op.getValueType();
    SDLoc SL(Op);

    SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
                                          ForCodeSize, Cost, Depth + 1);
    if (NegSrc)
      return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
    return SDValue();
  }
  default:
    break;
  }

  return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
                                              ForCodeSize, Cost, Depth);
}

//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//

bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
  assert(VT.isFloatingPoint());

  // Packed operations do not have a fabs modifier.
  return VT == MVT::f32 || VT == MVT::f64 ||
         (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16));
}

bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  // Report this based on the end legalized type.
  VT = VT.getScalarType();
  return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
}

bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT,
                                                        unsigned NumElem,
                                                        unsigned AS) const {
  return true;
}

bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
  // There are few operations which truly have vector input operands. Any vector
  // operation is going to involve operations on each component, and a
  // build_vector will be a copy per element, so it always makes sense to use a
  // build_vector input in place of the extracted element to avoid a copy into a
  // super register.
  //
  // We should probably only do this if all users are extracts only, but this
  // should be the common case.
  return true;
}

bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source.getSizeInBits();
  unsigned DestSize = Dest.getSizeInBits();

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
  // Truncate is just accessing a subregister.

  unsigned SrcSize = Source->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (DestSize == 16 && Subtarget->has16BitInsts())
    return SrcSize >= 32;

  return DestSize < SrcSize && DestSize % 32 == 0;
}

bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
  unsigned SrcSize = Src->getScalarSizeInBits();
  unsigned DestSize = Dest->getScalarSizeInBits();

  if (SrcSize == 16 && Subtarget->has16BitInsts())
    return DestSize >= 32;

  return SrcSize == 32 && DestSize == 64;
}

bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
  // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
  // this will enable reducing 64-bit operations to 32-bit, which is always
  // good.
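  // e.g. zext i32 %x to i64 materializes the high half with a single mov of 0
  // into the second 32-bit register of the pair.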

  if (Src == MVT::i16)
    return Dest == MVT::i32 || Dest == MVT::i64;

  return Src == MVT::i32 && Dest == MVT::i64;
}

bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
  // limited number of native 64-bit operations. Shrinking an operation to fit
  // in a single 32-bit register should always be helpful. As currently used,
  // this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
  // not profitable, and may actually be harmful.
  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
}

bool AMDGPUTargetLowering::isDesirableToCommuteWithShift(
    const SDNode *N, CombineLevel Level) const {
  assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
          N->getOpcode() == ISD::SRL) &&
         "Expected shift op");
  // Always commute pre-type legalization and right shifts.
  // We're looking for shl(or(x,y),z) patterns.
  if (Level < CombineLevel::AfterLegalizeTypes ||
      N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
    return true;

  // If the only user is an i32 right-shift, then don't destroy a BFE pattern.
  if (N->getValueType(0) == MVT::i32 && N->use_size() == 1 &&
      (N->use_begin()->getOpcode() == ISD::SRA ||
       N->use_begin()->getOpcode() == ISD::SRL))
    return false;

  // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
  auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
    if (LHS.getOpcode() != ISD::SHL)
      return false;
    auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
    auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
    auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
           LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
           RHSLd->getExtensionType() == ISD::ZEXTLOAD;
  };
  SDValue LHS = N->getOperand(0).getOperand(0);
  SDValue RHS = N->getOperand(0).getOperand(1);
  return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
}

//===---------------------------------------------------------------------===//
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//

CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                  bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return CC_AMDGPU;
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
    return CC_AMDGPU_CS_CHAIN;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return CC_AMDGPU_Func;
  case CallingConv::AMDGPU_Gfx:
    return CC_SI_Gfx;
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
  default:
    report_fatal_error("Unsupported calling convention for call");
  }
}

CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                    bool IsVarArg) {
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    llvm_unreachable("kernels should not be handled here");
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return RetCC_SI_Shader;
  case CallingConv::AMDGPU_Gfx:
    return RetCC_SI_Gfx;
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return RetCC_AMDGPU_Func;
  default:
    report_fatal_error("Unsupported calling convention.");
  }
}

/// The SelectionDAGBuilder will automatically promote function arguments
/// with illegal types. However, this does not work for the AMDGPU targets
/// since the function arguments are stored in memory as these illegal types.
/// In order to handle this properly we need to get the original types sizes
/// from the LLVM IR Function and fixup the ISD::InputArg values before
/// passing them to AnalyzeFormalArguments()

/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
/// input values across multiple registers. Each item in the Ins array
/// represents a single value that will be stored in registers. Ins[x].VT is
/// the value type of the value that will be stored in the register, so
/// whatever SDNode we lower the argument to needs to be this type.
///
/// In order to correctly lower the arguments we need to know the size of each
/// argument. Since Ins[x].VT gives us the size of the register that will
/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
/// for the original function argument so that we can deduce the correct memory
/// type to use for Ins[x]. In most cases the correct memory type will be
/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
/// we have a kernel argument of type v8i8, this argument will be split into
/// 8 parts and each part will be represented by its own item in the Ins array.
/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
/// the argument before it was split. From this, we deduce that the memory type
/// for each individual part is i8. We pass the memory type as LocVT to the
/// calling convention analysis function and the register type (Ins[x].VT) as
/// the ValVT.
void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
    CCState &State, const SmallVectorImpl<ISD::InputArg> &Ins) const {
  const MachineFunction &MF = State.getMachineFunction();
  const Function &Fn = MF.getFunction();
  LLVMContext &Ctx = Fn.getParent()->getContext();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
  const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
  CallingConv::ID CC = Fn.getCallingConv();

  Align MaxAlign = Align(1);
  uint64_t ExplicitArgOffset = 0;
  const DataLayout &DL = Fn.getDataLayout();

  unsigned InIndex = 0;

  for (const Argument &Arg : Fn.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *BaseArgTy = Arg.getType();
    Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
    MaxAlign = std::max(Alignment, MaxAlign);
    uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;

    // We're basically throwing away everything passed into us and starting over
    // to get accurate in-memory offsets. The "PartOffset" is completely useless
    // to us as computed in Ins.
    //
    // We also need to figure out what type legalization is trying to do to get
    // the correct memory offsets.

    SmallVector<EVT, 16> ValueVTs;
    SmallVector<uint64_t, 16> Offsets;
    ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);

    for (unsigned Value = 0, NumValues = ValueVTs.size();
         Value != NumValues; ++Value) {
      uint64_t BasePartOffset = Offsets[Value];

      EVT ArgVT = ValueVTs[Value];
      EVT MemVT = ArgVT;
      MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
      unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);

      if (NumRegs == 1) {
        // This argument is not split, so the IR type is the memory type.
        if (ArgVT.isExtended()) {
          // We have an extended type, like i24, so we should just use the
          // register type.
          MemVT = RegisterVT;
        } else {
          MemVT = ArgVT;
        }
      } else if (ArgVT.isVector() && RegisterVT.isVector() &&
                 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
        assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
        // We have a vector value which has been split into a vector with
        // the same scalar type, but fewer elements. This should handle
        // all the floating-point vector types.
        MemVT = RegisterVT;
      } else if (ArgVT.isVector() &&
                 ArgVT.getVectorNumElements() == NumRegs) {
        // This arg has been split so that each element is stored in a separate
        // register.
        MemVT = ArgVT.getScalarType();
      } else if (ArgVT.isExtended()) {
        // We have an extended type, like i65.
        MemVT = RegisterVT;
      } else {
        unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
        assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
        if (RegisterVT.isInteger()) {
          MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
        } else if (RegisterVT.isVector()) {
          assert(!RegisterVT.getScalarType().isFloatingPoint());
          unsigned NumElements = RegisterVT.getVectorNumElements();
          assert(MemoryBits % NumElements == 0);
          // This vector type has been split into another vector type with
          // a different elements size.
          EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
                                           MemoryBits / NumElements);
          MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
        } else {
          llvm_unreachable("cannot deduce memory type.");
        }
      }

      // Convert one element vectors to scalar.
      if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
        MemVT = MemVT.getScalarType();

      // Round up vec3/vec5 argument.
      if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
        assert(MemVT.getVectorNumElements() == 3 ||
               MemVT.getVectorNumElements() == 5 ||
               (MemVT.getVectorNumElements() >= 9 &&
                MemVT.getVectorNumElements() <= 12));
        MemVT = MemVT.getPow2VectorType(State.getContext());
      } else if (!MemVT.isSimple() && !MemVT.isVector()) {
        MemVT = MemVT.getRoundIntegerType(State.getContext());
      }

      unsigned PartOffset = 0;
      for (unsigned i = 0; i != NumRegs; ++i) {
        State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
                                               BasePartOffset + PartOffset,
                                               MemVT.getSimpleVT(),
                                               CCValAssign::Full));
        PartOffset += MemVT.getStoreSize();
      }
    }
  }
}

SDValue AMDGPUTargetLowering::LowerReturn(
    SDValue Chain, CallingConv::ID CallConv,
    bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SDLoc &DL, SelectionDAG &DAG) const {
  // FIXME: Fails for r600 tests
  //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
  //       "wave terminate should not have return values");
  return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
}

//===---------------------------------------------------------------------===//
// Target specific lowering
//===---------------------------------------------------------------------===//

/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                    bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
}

CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                      bool IsVarArg) {
  return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
}

SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
                                                  SelectionDAG &DAG,
                                                  MachineFrameInfo &MFI,
                                                  int ClobberedFI) const {
  SmallVector<SDValue, 8> ArgChains;
  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;

  // Include the original chain at the beginning of the list. When this is
  // used by target LowerCall hooks, this helps legalize find the
  // CALLSEQ_BEGIN node.
  ArgChains.push_back(Chain);

  // Add a chain value for each stack argument load that overlaps the
  // clobbered object.
  for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
    if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
        if (FI->getIndex() < 0) {
          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
          int64_t InLastByte = InFirstByte;
          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;

          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
            ArgChains.push_back(SDValue(L, 1));
        }
      }
    }
  }

  // Build a tokenfactor for all the chains.
  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}

SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
                                                 SmallVectorImpl<SDValue> &InVals,
                                                 StringRef Reason) const {
  SDValue Callee = CLI.Callee;
  SelectionDAG &DAG = CLI.DAG;

  const Function &Fn = DAG.getMachineFunction().getFunction();

  StringRef FuncName("<unknown>");

  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
    FuncName = G->getSymbol();
  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    FuncName = G->getGlobal()->getName();

  DiagnosticInfoUnsupported NoCalls(
      Fn, Reason + FuncName, CLI.DL.getDebugLoc());
  DAG.getContext()->diagnose(NoCalls);

  if (!CLI.IsTailCall) {
    for (ISD::InputArg &Arg : CLI.Ins)
      InVals.push_back(DAG.getUNDEF(Arg.VT));
  }

  return DAG.getEntryNode();
}

SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
  return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
}

SDValue AMDGPUTargetLowering::lowerDYNAMIC_STACKALLOC(SDValue Op,
                                                      SelectionDAG &DAG) const {
  const Function &Fn = DAG.getMachineFunction().getFunction();

  DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
                                            SDLoc(Op).getDebugLoc());
  DAG.getContext()->diagnose(NoDynamicAlloca);
  auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
  return DAG.getMergeValues(Ops, SDLoc());
}

SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    Op->print(errs(), &DAG);
    llvm_unreachable("Custom lowering code for this "
                     "instruction is not implemented yet!");
    break;
  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
  case ISD::FREM: return LowerFREM(Op, DAG);
  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
  case ISD::FRINT: return LowerFRINT(Op, DAG);
  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
  case ISD::FROUNDEVEN:
    return LowerFROUNDEVEN(Op, DAG);
  case ISD::FROUND: return LowerFROUND(Op, DAG);
  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
  case ISD::FLOG2:
    return LowerFLOG2(Op, DAG);
  case ISD::FLOG:
  case ISD::FLOG10:
    return LowerFLOGCommon(Op, DAG);
  case ISD::FEXP:
  case ISD::FEXP10:
    return lowerFEXP(Op, DAG);
  case ISD::FEXP2:
    return lowerFEXP2(Op, DAG);
  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    return LowerFP_TO_INT(Op, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF:
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    return LowerCTLZ_CTTZ(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return lowerDYNAMIC_STACKALLOC(Op, DAG);
  }
  return Op;
}

void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::SIGN_EXTEND_INREG:
    // Different parts of legalization seem to interpret which type of
    // sign_extend_inreg is the one to check for custom lowering. The extended
    // from type is what really matters, but some places check for custom
    // lowering of the result type. This results in trying to use
    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    // nothing here and let the illegal result integer be handled normally.
    return;
  case ISD::FLOG2:
    if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FLOG:
  case ISD::FLOG10:
    if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FEXP2:
    if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::FEXP:
  case ISD::FEXP10:
    if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
      Results.push_back(Lowered);
    return;
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:
    if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
      Results.push_back(Lowered);
    return;
  default:
    return;
  }
}

SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                                 SDValue Op,
                                                 SelectionDAG &DAG) const {

  const DataLayout &DL = DAG.getDataLayout();
  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = G->getGlobal();

  if (!MFI->isModuleEntryFunction()) {
    if (std::optional<uint32_t> Address =
            AMDGPUMachineFunction::getLDSAbsoluteAddress(*GV)) {
      return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
    }
  }

  if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
      G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction() &&
        GV->getName() != "llvm.amdgcn.module.lds") {
      SDLoc DL(Op);
      const Function &Fn = DAG.getMachineFunction().getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
          Fn, "local memory global used by non-kernel function",
          DL.getDebugLoc(), DS_Warning);
      DAG.getContext()->diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
      SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                                        Trap, DAG.getRoot());
      DAG.setRoot(OutputChain);
      return DAG.getUNDEF(Op.getValueType());
    }

    // XXX: What does the value of G->getOffset() mean?
    assert(G->getOffset() == 0 &&
           "Do not know what to do with a non-zero offset");

    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will anyway get errored out during assembly emission.
    unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
    return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
  }
  return SDValue();
}

SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SmallVector<SDValue, 8> Args;
  SDLoc SL(Op);

  EVT VT = Op.getValueType();
  if (VT.getVectorElementType().getSizeInBits() < 32) {
    unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
    if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
      unsigned NewNumElt = OpBitSize / 32;
      EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
                                      : EVT::getVectorVT(*DAG.getContext(),
                                                         MVT::i32, NewNumElt);
      for (const SDUse &U : Op->ops()) {
        SDValue In = U.get();
        SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
        if (NewNumElt > 1)
          DAG.ExtractVectorElements(NewIn, Args);
        else
          Args.push_back(NewIn);
      }

      EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                   NewNumElt * Op.getNumOperands());
      SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
      return DAG.getNode(ISD::BITCAST, SL, VT, BV);
    }
  }

  for (const SDUse &U : Op->ops())
    DAG.ExtractVectorElements(U.get(), Args);

  return DAG.getBuildVector(Op.getValueType(), SL, Args);
}

SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                     SelectionDAG &DAG) const {
  SDLoc SL(Op);
  SmallVector<SDValue, 8> Args;
  unsigned Start = Op.getConstantOperandVal(1);
  EVT VT = Op.getValueType();
  EVT SrcVT = Op.getOperand(0).getValueType();

  if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
    unsigned NumElt = VT.getVectorNumElements();
    unsigned NumSrcElt = SrcVT.getVectorNumElements();
    assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");

    // Extract 32-bit registers at a time.
    EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
    EVT NewVT = NumElt == 2
                    ? MVT::i32
                    : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
    SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));

    DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
    if (NumElt == 2)
      Tmp = Args[0];
    else
      Tmp = DAG.getBuildVector(NewVT, SL, Args);

    return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
  }

  DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
                            VT.getVectorNumElements());

  return DAG.getBuildVector(Op.getValueType(), SL, Args);
}

// TODO: Handle fabs too
static SDValue peekFNeg(SDValue Val) {
  if (Val.getOpcode() == ISD::FNEG)
    return Val.getOperand(0);

  return Val;
}

static SDValue peekFPSignOps(SDValue Val) {
  if (Val.getOpcode() == ISD::FNEG)
    Val = Val.getOperand(0);
  if (Val.getOpcode() == ISD::FABS)
    Val = Val.getOperand(0);
  if (Val.getOpcode() == ISD::FCOPYSIGN)
    Val = Val.getOperand(0);
  return Val;
}

SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl(
    const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
    SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  switch (CCOpcode) {
  case ISD::SETOEQ:
  case ISD::SETONE:
  case ISD::SETUNE:
  case ISD::SETNE:
  case ISD::SETUEQ:
  case ISD::SETEQ:
  case ISD::SETFALSE:
  case ISD::SETFALSE2:
  case ISD::SETTRUE:
  case ISD::SETTRUE2:
  case ISD::SETUO:
  case ISD::SETO:
    break;
  case ISD::SETULE:
  case ISD::SETULT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETOLE:
  case ISD::SETOLT:
  case ISD::SETLE:
  case ISD::SETLT: {
    // Ordered. Assume ordered for undefined.

    // Only do this after legalization to avoid interfering with other combines
    // which might occur.
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with NaN,
    // so permute it based on the compare type the hardware uses.
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETUGE:
  case ISD::SETUGT: {
    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
  }
  case ISD::SETGT:
  case ISD::SETGE:
  case ISD::SETOGE:
  case ISD::SETOGT: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
        !DCI.isCalledByLegalizer())
      return SDValue();

    if (LHS == True)
      return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
    return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
  }
  case ISD::SETCC_INVALID:
    llvm_unreachable("Invalid setcc condcode!");
  }
  return SDValue();
}

/// Generate Min/Max node
SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
                                                   SDValue LHS, SDValue RHS,
                                                   SDValue True, SDValue False,
                                                   SDValue CC,
                                                   DAGCombinerInfo &DCI) const {
  if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
    return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);

  SelectionDAG &DAG = DCI.DAG;

  // If we can't directly match this, try to see if we can fold an fneg to
  // match.

  ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
  ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False);
  SDValue NegTrue = peekFNeg(True);

  // Undo the combine foldFreeOpFromSelect does if it helps us match the
  // fmin/fmax.
  //
  // select (fcmp olt (lhs, K)), (fneg lhs), -K
  // -> fneg (fmin_legacy lhs, K)
  //
  // TODO: Use getNegatedExpression
  if (LHS == NegTrue && CFalse && CRHS) {
    APFloat NegRHS = neg(CRHS->getValueAPF());
    if (NegRHS == CFalse->getValueAPF()) {
      SDValue Combined =
          combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
      if (Combined)
        return DAG.getNode(ISD::FNEG, DL, VT, Combined);
      return SDValue();
    }
  }

  return SDValue();
}

std::pair<SDValue, SDValue>
AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);

  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);

  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);

  return std::pair(Lo, Hi);
}

SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
}

SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
  SDLoc SL(Op);

  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
}

// Split a vector type into two parts. The first part is a power of two vector.
// The second part is whatever is left over, and is a scalar if it would
// otherwise be a 1-vector.
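// e.g. v7i32 is split into v4i32 and v3i32, and v3f32 into v2f32 and a plain
// f32 scalar.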
std::pair<EVT, EVT>
AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
  EVT LoVT, HiVT;
  EVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
  LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
  HiVT = NumElts - LoNumElts == 1
             ? EltVT
             : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
  return std::pair(LoVT, HiVT);
}

// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
// scalar.
std::pair<SDValue, SDValue>
AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
                                  const EVT &LoVT, const EVT &HiVT,
                                  SelectionDAG &DAG) const {
  assert(LoVT.getVectorNumElements() +
             (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
         N.getValueType().getVectorNumElements() &&
         "More vector elements requested than available!");
  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
                           DAG.getVectorIdxConstant(0, DL));
  SDValue Hi = DAG.getNode(
      HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
      HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
  return std::pair(Lo, Hi);
}

SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
                                              SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT VT = Op.getValueType();
  SDLoc SL(Op);

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2) {
    SDValue Ops[2];
    std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
    return DAG.getMergeValues(Ops, SL);
  }

  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();

  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
  std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
  std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);

  unsigned Size = LoMemVT.getStoreSize();
  Align BaseAlign = Load->getAlign();
  Align HiAlign = commonAlignment(BaseAlign, Size);

  SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
                                  Load->getChain(), BasePtr, SrcValue, LoMemVT,
                                  BaseAlign, Load->getMemOperand()->getFlags());
  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
  SDValue HiLoad =
      DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
                     HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
                     HiMemVT, HiAlign, Load->getMemOperand()->getFlags());

  SDValue Join;
  if (LoVT == HiVT) {
    // This is the case that the vector is power of two so was evenly split.
    Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
  } else {
    Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
                       DAG.getVectorIdxConstant(0, SL));
    Join = DAG.getNode(
        HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
        VT, Join, HiLoad,
        DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
  }

  SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                                     LoLoad.getValue(1), HiLoad.getValue(1))};

  return DAG.getMergeValues(Ops, SL);
}

SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
                                                     SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT VT = Op.getValueType();
  SDValue BasePtr = Load->getBasePtr();
  EVT MemVT = Load->getMemoryVT();
  SDLoc SL(Op);
  const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
  Align BaseAlign = Load->getAlign();
  unsigned NumElements = MemVT.getVectorNumElements();

  // Widen from vec3 to vec4 when the load is at least 8-byte aligned
  // or 16-byte fully dereferenceable. Otherwise, split the vector load.
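  // e.g. an 8-byte-aligned v3f32 load is widened to a single v4f32 load whose
  // extra lane is dropped below; an under-aligned one is split instead.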
1845 if (NumElements != 3 ||
1846 (BaseAlign < Align(8) &&
1847 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1848 return SplitVectorLoad(Op, DAG);
1849
1850 assert(NumElements == 3);
1851
1852 EVT WideVT =
1854 EVT WideMemVT =
1856 SDValue WideLoad = DAG.getExtLoad(
1857 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1858 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1859 return DAG.getMergeValues(
1860 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1861 DAG.getVectorIdxConstant(0, SL)),
1862 WideLoad.getValue(1)},
1863 SL);
1864}
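// For example, a v3i32 load that is 8-byte aligned (or provably 16-byte
// dereferenceable) becomes one v4i32 load plus an EXTRACT_SUBVECTOR of the
// first three elements; anything weaker falls back to SplitVectorLoad above.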
1865
1866SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1867 SelectionDAG &DAG) const {
1868 StoreSDNode *Store = cast<StoreSDNode>(Op);
1869 SDValue Val = Store->getValue();
1870 EVT VT = Val.getValueType();
1871
1872 // If this is a 2 element vector, we really want to scalarize and not create
1873 // weird 1 element vectors.
1874 if (VT.getVectorNumElements() == 2)
1875 return scalarizeVectorStore(Store, DAG);
1876
1877 EVT MemVT = Store->getMemoryVT();
1878 SDValue Chain = Store->getChain();
1879 SDValue BasePtr = Store->getBasePtr();
1880 SDLoc SL(Op);
1881
1882 EVT LoVT, HiVT;
1883 EVT LoMemVT, HiMemVT;
1884 SDValue Lo, Hi;
1885
1886 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1887 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1888 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1889
1890 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1891
1892 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1893 Align BaseAlign = Store->getAlign();
1894 unsigned Size = LoMemVT.getStoreSize();
1895 Align HiAlign = commonAlignment(BaseAlign, Size);
1896
1897 SDValue LoStore =
1898 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1899 Store->getMemOperand()->getFlags());
1900 SDValue HiStore =
1901 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1902 HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1903
1904 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1905}
1906
1907// This is a shortcut for integer division because we have fast i32<->f32
1908// conversions, and fast f32 reciprocal instructions. The fractional part of a
1909// float is enough to accurately represent up to a 24-bit signed integer.
1910SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1911 bool Sign) const {
1912 SDLoc DL(Op);
1913 EVT VT = Op.getValueType();
1914 SDValue LHS = Op.getOperand(0);
1915 SDValue RHS = Op.getOperand(1);
1916 MVT IntVT = MVT::i32;
1917 MVT FltVT = MVT::f32;
1918
1919 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1920 if (LHSSignBits < 9)
1921 return SDValue();
1922
1923 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1924 if (RHSSignBits < 9)
1925 return SDValue();
1926
1927 unsigned BitSize = VT.getSizeInBits();
1928 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1929 unsigned DivBits = BitSize - SignBits;
1930 if (Sign)
1931 ++DivBits;
1932
1933
1934 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1935 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1936 SDValue jq = DAG.getConstant(1, DL, IntVT);
1937
1938 if (Sign) {
1939 // char|short jq = ia ^ ib;
1940 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1941
1942 // jq = jq >> (bitsize - 2)
1943 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1944 DAG.getConstant(BitSize - 2, DL, VT));
1945
1946 // jq = jq | 0x1
1947 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1948 }
1949
1950 // int ia = (int)LHS;
1951 SDValue ia = LHS;
1952
1953 // int ib = (int)RHS;
1954 SDValue ib = RHS;
1955
1956 // float fa = (float)ia;
1957 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1958
1959 // float fb = (float)ib;
1960 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1961
1962 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1963 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1964
1965 // fq = trunc(fq);
1966 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1967
1968 // float fqneg = -fq;
1969 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1970
1971 MachineFunction &MF = DAG.getMachineFunction();
1972
1973 bool UseFmadFtz = false;
1974 if (Subtarget->isGCN()) {
1975 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1976 UseFmadFtz =
1977 MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
1978 }
1979
1980 // float fr = mad(fqneg, fb, fa);
1981 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
1982 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
1983 : (unsigned)ISD::FMAD;
1984 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1985
1986 // int iq = (int)fq;
1987 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1988
1989 // fr = fabs(fr);
1990 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1991
1992 // fb = fabs(fb);
1993 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1994
1995 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1996
1997 // int cv = fr >= fb;
1998 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1999
2000 // jq = (cv ? jq : 0);
2001 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
2002
2003 // dst = iq + jq;
2004 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
2005
2006 // Rem needs compensation; it's easier to recompute it.
2007 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
2008 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
2009
2010 // Truncate to number of bits this divide really is.
2011 if (Sign) {
2012 SDValue InRegSize
2013 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
2014 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
2015 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
2016 } else {
2017 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
2018 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
2019 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
2020 }
2021
2022 return DAG.getMergeValues({ Div, Rem }, DL);
2023}
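// A self-contained scalar model of the sequence above, assuming both inputs
// are already known to fit in 24 bits (illustrative host code only; div24 is
// a made-up name, and the hardware RCP is modeled with an exact divide):
//
//   #include <cmath>
//   int div24(int a, int b) {
//     int jq = ((a ^ b) >> 30) | 1;         // +1/-1 quotient correction
//     float fa = (float)a, fb = (float)b;
//     float fq = truncf(fa * (1.0f / fb));  // quotient estimate via rcp
//     float fr = fabsf(fmaf(-fq, fb, fa));  // |a - fq*b|
//     int iq = (int)fq;
//     return fr >= fabsf(fb) ? iq + jq : iq;
//   }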
2024
2025void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
2026 SelectionDAG &DAG,
2027 SmallVectorImpl<SDValue> &Results) const {
2028 SDLoc DL(Op);
2029 EVT VT = Op.getValueType();
2030
2031 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2032
2033 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2034
2035 SDValue One = DAG.getConstant(1, DL, HalfVT);
2036 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
2037
2038 // HiLo split
2039 SDValue LHS_Lo, LHS_Hi;
2040 SDValue LHS = Op.getOperand(0);
2041 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
2042
2043 SDValue RHS_Lo, RHS_Hi;
2044 SDValue RHS = Op.getOperand(1);
2045 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2046
2047 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2048 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
2049
2050 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2051 LHS_Lo, RHS_Lo);
2052
2053 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2054 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2055
2056 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2057 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2058 return;
2059 }
2060
2061 if (isTypeLegal(MVT::i64)) {
2062 // The algorithm here is based on ideas from "Software Integer Division",
2063 // Tom Rodeheffer, August 2008.
2064
2065 MachineFunction &MF = DAG.getMachineFunction();
2066 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2067
2068 // Compute denominator reciprocal.
2069 unsigned FMAD =
2070 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2071 : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
2072 ? (unsigned)ISD::FMAD
2073 : (unsigned)AMDGPUISD::FMAD_FTZ;
2074
2075 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2076 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2077 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2078 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2079 Cvt_Lo);
2080 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2081 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2082 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2083 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2084 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2085 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2086 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2087 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2088 Mul1);
2089 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2090 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2091 SDValue Rcp64 = DAG.getBitcast(VT,
2092 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2093
2094 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2095 SDValue One64 = DAG.getConstant(1, DL, VT);
2096 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2097 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2098
2099 // First round of UNR (Unsigned integer Newton-Raphson).
2100 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2101 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2102 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2103 SDValue Mulhi1_Lo, Mulhi1_Hi;
2104 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2105 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2106 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2107 Mulhi1_Lo, Zero1);
2108 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2109 Mulhi1_Hi, Add1_Lo.getValue(1));
2110 SDValue Add1 = DAG.getBitcast(VT,
2111 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2112
2113 // Second round of UNR.
2114 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2115 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2116 SDValue Mulhi2_Lo, Mulhi2_Hi;
2117 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2118 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2119 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2120 Mulhi2_Lo, Zero1);
2121 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2122 Mulhi2_Hi, Add2_Lo.getValue(1));
2123 SDValue Add2 = DAG.getBitcast(VT,
2124 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2125
2126 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2127
2128 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2129
2130 SDValue Mul3_Lo, Mul3_Hi;
2131 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2132 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2133 Mul3_Lo, Zero1);
2134 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2135 Mul3_Hi, Sub1_Lo.getValue(1));
2136 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2137 SDValue Sub1 = DAG.getBitcast(VT,
2138 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2139
2140 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2141 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2142 ISD::SETUGE);
2143 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2144 ISD::SETUGE);
2145 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2146
2147 // TODO: Here and below, portions of the code can be enclosed in if/endif.
2148 // Currently the control flow is unconditional and we have 4 selects after
2149 // the potential endif to substitute for PHIs.
2150
2151 // if C3 != 0 ...
2152 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2153 RHS_Lo, Zero1);
2154 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2155 RHS_Hi, Sub1_Lo.getValue(1));
2156 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2157 Zero, Sub2_Lo.getValue(1));
2158 SDValue Sub2 = DAG.getBitcast(VT,
2159 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2160
2161 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2162
2163 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2164 ISD::SETUGE);
2165 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2166 ISD::SETUGE);
2167 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2168
2169 // if (C6 != 0)
2170 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2171
2172 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2173 RHS_Lo, Zero1);
2174 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2175 RHS_Hi, Sub2_Lo.getValue(1));
2176 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2177 Zero, Sub3_Lo.getValue(1));
2178 SDValue Sub3 = DAG.getBitcast(VT,
2179 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2180
2181 // endif C6
2182 // endif C3
2183
2184 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2185 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2186
2187 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2188 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2189
2190 Results.push_back(Div);
2191 Results.push_back(Rem);
2192
2193 return;
2194 }
2195
2196 // r600 expansion.
2197 // Get speculative values.
2198 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2199 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2200
2201 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2202 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2203 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2204
2205 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2206 SDValue DIV_Lo = Zero;
2207
2208 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2209
2210 for (unsigned i = 0; i < halfBitWidth; ++i) {
2211 const unsigned bitPos = halfBitWidth - i - 1;
2212 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2213 // Get value of high bit
2214 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2215 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2216 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2217
2218 // Shift
2219 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2220 // Add LHS high bit
2221 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2222
2223 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2224 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2225
2226 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2227
2228 // Update REM
2229 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2230 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2231 }
2232
2233 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2234 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2235 Results.push_back(DIV);
2236 Results.push_back(REM);
2237}
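// The bit loop above is classic restoring division, run only over the low 32
// bits because the speculative UDIV/UREM already handled the high half. A
// scalar sketch of the full idea (illustrative only; udivrem64 is a made-up
// name):
//
//   #include <cstdint>
//   uint64_t udivrem64(uint64_t num, uint64_t den, uint64_t *rem) {
//     uint64_t q = 0, r = 0;
//     for (int i = 63; i >= 0; --i) {
//       r = (r << 1) | ((num >> i) & 1);            // bring down next bit
//       if (r >= den) { r -= den; q |= 1ull << i; } // restoring step
//     }
//     *rem = r;
//     return q;
//   }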
2238
2239SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2240 SelectionDAG &DAG) const {
2241 SDLoc DL(Op);
2242 EVT VT = Op.getValueType();
2243
2244 if (VT == MVT::i64) {
2245 SmallVector<SDValue, 2> Results;
2246 LowerUDIVREM64(Op, DAG, Results);
2247 return DAG.getMergeValues(Results, DL);
2248 }
2249
2250 if (VT == MVT::i32) {
2251 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2252 return Res;
2253 }
2254
2255 SDValue X = Op.getOperand(0);
2256 SDValue Y = Op.getOperand(1);
2257
2258 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2259 // algorithm used here.
2260
2261 // Initial estimate of inv(y).
2262 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2263
2264 // One round of UNR.
2265 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2266 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2267 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2268 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2269
2270 // Quotient/remainder estimate.
2271 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2272 SDValue R =
2273 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2274
2275 // First quotient/remainder refinement.
2276 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2277 SDValue One = DAG.getConstant(1, DL, VT);
2278 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2279 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2280 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2281 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2282 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2283
2284 // Second quotient/remainder refinement.
2285 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2286 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2287 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2288 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2289 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2290
2291 return DAG.getMergeValues({Q, R}, DL);
2292}
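// A scalar model of this expansion (illustrative only, not bit-exact: the
// hardware URECIP is approximated here with a host divide):
//
//   #include <cstdint>
//   uint32_t udivrem32(uint32_t x, uint32_t y, uint32_t *rem) {
//     uint32_t z = 0xffffffffu / y;                     // ~2^32/y estimate
//     uint32_t negyz = (0u - y) * z;                    // -y*z mod 2^32
//     z += (uint32_t)(((uint64_t)z * negyz) >> 32);     // one Newton step
//     uint32_t q = (uint32_t)(((uint64_t)x * z) >> 32); // quotient estimate
//     uint32_t r = x - q * y;
//     if (r >= y) { ++q; r -= y; }                      // refinement 1
//     if (r >= y) { ++q; r -= y; }                      // refinement 2
//     *rem = r;
//     return q;
//   }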
2293
2294SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2295 SelectionDAG &DAG) const {
2296 SDLoc DL(Op);
2297 EVT VT = Op.getValueType();
2298
2299 SDValue LHS = Op.getOperand(0);
2300 SDValue RHS = Op.getOperand(1);
2301
2302 SDValue Zero = DAG.getConstant(0, DL, VT);
2303 SDValue NegOne = DAG.getConstant(-1, DL, VT);
2304
2305 if (VT == MVT::i32) {
2306 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2307 return Res;
2308 }
2309
2310 if (VT == MVT::i64 &&
2311 DAG.ComputeNumSignBits(LHS) > 32 &&
2312 DAG.ComputeNumSignBits(RHS) > 32) {
2313 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2314
2315 // HiLo split
2316 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2317 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2318 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2319 LHS_Lo, RHS_Lo);
2320 SDValue Res[2] = {
2321 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2322 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2323 };
2324 return DAG.getMergeValues(Res, DL);
2325 }
2326
2327 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2328 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2329 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2330 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2331
2332 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2333 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2334
2335 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2336 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2337
2338 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2339 SDValue Rem = Div.getValue(1);
2340
2341 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2342 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2343
2344 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2345 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2346
2347 SDValue Res[2] = {
2348 Div,
2349 Rem
2350 };
2351 return DAG.getMergeValues(Res, DL);
2352}
2353
2354// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
2355SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2356 SDLoc SL(Op);
2357 EVT VT = Op.getValueType();
2358 auto Flags = Op->getFlags();
2359 SDValue X = Op.getOperand(0);
2360 SDValue Y = Op.getOperand(1);
2361
2362 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2363 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2364 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2365 // TODO: For f32 use FMAD instead if !hasFastFMA32?
2366 return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2367}
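// Worked instance: frem(5.5, 2.0) -> fdiv = 2.75, ftrunc = 2.0, and
// fma(-2.0, 2.0, 5.5) = 1.5, matching fmod(5.5, 2.0).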
2368
2369SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2370 SDLoc SL(Op);
2371 SDValue Src = Op.getOperand(0);
2372
2373 // result = trunc(src)
2374 // if (src > 0.0 && src != result)
2375 // result += 1.0
2376
2377 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2378
2379 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2380 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2381
2382 EVT SetCCVT =
2383 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2384
2385 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2386 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2387 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2388
2389 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2390 // TODO: Should this propagate fast-math-flags?
2391 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2392}
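// Worked instance: ceil(2.3) -> trunc = 2.0; 2.3 > 0.0 and 2.3 != 2.0, so
// 1.0 is added, giving 3.0. For ceil(-2.3) the SETOGT test fails and the
// truncated value -2.0 is already the answer.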
2393
2394static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2395 SelectionDAG &DAG) {
2396 const unsigned FractBits = 52;
2397 const unsigned ExpBits = 11;
2398
2399 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2400 Hi,
2401 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2402 DAG.getConstant(ExpBits, SL, MVT::i32));
2403 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2404 DAG.getConstant(1023, SL, MVT::i32));
2405
2406 return Exp;
2407}
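// Recall the IEEE-754 f64 layout: bit 63 sign, bits 62:52 biased exponent,
// bits 51:0 fraction. Hi holds bits 63:32, so BFE_U32 extracts 11 bits
// starting at bit 20 (= 52 - 32) and the SUB removes the 1023 bias.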
2408
2409SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2410 SDLoc SL(Op);
2411 SDValue Src = Op.getOperand(0);
2412
2413 assert(Op.getValueType() == MVT::f64);
2414
2415 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2416
2417 // Extract the upper half, since this is where we will find the sign and
2418 // exponent.
2419 SDValue Hi = getHiHalf64(Src, DAG);
2420
2421 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2422
2423 const unsigned FractBits = 52;
2424
2425 // Extract the sign bit.
2426 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2427 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2428
2429 // Extend back to 64-bits.
2430 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2431 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2432
2433 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2434 const SDValue FractMask
2435 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2436
2437 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2438 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2439 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2440
2441 EVT SetCCVT =
2442 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2443
2444 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2445
2446 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2447 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2448
2449 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2450 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2451
2452 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2453}
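// The mask trick above: for 0 <= Exp <= 51, (FractMask >> Exp) covers exactly
// the fraction bits below the ones place, so AND-ing with its inverse
// truncates toward zero. Exp < 0 means |Src| < 1.0 and only the sign bit
// survives; Exp > 51 means Src is already integral and is passed through.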
2454
2455SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2456 SelectionDAG &DAG) const {
2457 SDLoc SL(Op);
2458 SDValue Src = Op.getOperand(0);
2459
2460 assert(Op.getValueType() == MVT::f64);
2461
2462 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2463 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2464 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2465
2466 // TODO: Should this propagate fast-math-flags?
2467
2468 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2469 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2470
2471 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2472
2473 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2474 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2475
2476 EVT SetCCVT =
2477 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2478 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2479
2480 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2481}
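// The magic-number trick: adding then subtracting copysign(0x1.0p+52, Src)
// pushes the fractional bits below the ulp of 2^52, so the intermediate FADD
// rounds them away in round-to-nearest-even; e.g. 3.5 + 2^52 rounds to
// 4 + 2^52 and the FSUB leaves 4.0. Inputs with magnitude above
// 0x1.fffffffffffffp+51 are already integral and are returned unchanged.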
2482
2483SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
2484 SelectionDAG &DAG) const {
2485 // FNEARBYINT and FRINT are the same, except in their handling of FP
2486 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2487 // rint, so just treat them as equivalent.
2488 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2489 Op.getOperand(0));
2490}
2491
2492SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2493 auto VT = Op.getValueType();
2494 auto Arg = Op.getOperand(0u);
2495 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2496}
2497
2498// XXX - May require not supporting f32 denormals?
2499
2500// Don't handle v2f16. The extra instructions to scalarize and repack around the
2501// compare and vselect end up producing worse code than scalarizing the whole
2502// operation.
2503SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2504 SDLoc SL(Op);
2505 SDValue X = Op.getOperand(0);
2506 EVT VT = Op.getValueType();
2507
2508 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2509
2510 // TODO: Should this propagate fast-math-flags?
2511
2512 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2513
2514 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2515
2516 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2517 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2518
2519 EVT SetCCVT =
2520 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2521
2522 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2523 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2524 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2525
2526 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2527 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2528}
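// Worked instance: round(2.5) -> T = 2.0, |Diff| = 0.5 >= 0.5, so
// copysign(1.0, 2.5) is added, giving 3.0; round(-2.5) adds -1.0 to -2.0,
// giving -3.0 (ties away from zero, unlike FROUNDEVEN above).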
2529
2530SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2531 SDLoc SL(Op);
2532 SDValue Src = Op.getOperand(0);
2533
2534 // result = trunc(src);
2535 // if (src < 0.0 && src != result)
2536 // result += -1.0.
2537
2538 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2539
2540 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2541 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2542
2543 EVT SetCCVT =
2544 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2545
2546 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2547 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2548 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2549
2550 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2551 // TODO: Should this propagate fast-math-flags?
2552 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2553}
2554
2555/// Return true if it's known that \p Src can never be an f32 denormal value.
2556static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2557 switch (Src.getOpcode()) {
2558 case ISD::FP_EXTEND:
2559 return Src.getOperand(0).getValueType() == MVT::f16;
2560 case ISD::FP16_TO_FP:
2561 case ISD::FFREXP:
2562 return true;
2563 case ISD::INTRINSIC_WO_CHAIN: {
2564 unsigned IntrinsicID = Src.getConstantOperandVal(0);
2565 switch (IntrinsicID) {
2566 case Intrinsic::amdgcn_frexp_mant:
2567 return true;
2568 default:
2569 return false;
2570 }
2571 }
2572 default:
2573 return false;
2574 }
2575
2576 llvm_unreachable("covered opcode switch");
2577}
2578
2579static bool allowApproxFunc(const SelectionDAG &DAG,
2580 SDNodeFlags Flags) {
2581 if (Flags.hasApproximateFuncs())
2582 return true;
2583 auto &Options = DAG.getTarget().Options;
2584 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
2585}
2586
2587static bool needsDenormHandlingF32(const SelectionDAG &DAG,
2588 SDValue Src,
2589 SDNodeFlags Flags) {
2590 return !valueIsKnownNeverF32Denorm(Src) &&
2591 DAG.getMachineFunction()
2592 .getDenormalMode(APFloat::IEEEsingle())
2593 .Input != DenormalMode::PreserveSign;
2594}
2595
2596SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2597 SDValue Src,
2598 SDNodeFlags Flags) const {
2599 SDLoc SL(Src);
2600 EVT VT = Src.getValueType();
2601 const fltSemantics &Semantics = VT.getFltSemantics();
2602 SDValue SmallestNormal =
2603 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2604
2605 // Want to scale denormals up, but negatives and 0 work just as well on the
2606 // scaled path.
2607 SDValue IsLtSmallestNormal = DAG.getSetCC(
2608 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2609 SmallestNormal, ISD::SETOLT);
2610
2611 return IsLtSmallestNormal;
2612}
2613
2614SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
2615 SDNodeFlags Flags) const {
2616 SDLoc SL(Src);
2617 EVT VT = Src.getValueType();
2618 const fltSemantics &Semantics = VT.getFltSemantics();
2619 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2620
2621 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2622 SDValue IsFinite = DAG.getSetCC(
2623 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2624 Inf, ISD::SETOLT);
2625 return IsFinite;
2626}
2627
2628/// If denormal handling is required return the scaled input to FLOG2, and the
2629/// check for denormal range. Otherwise, return null values.
2630std::pair<SDValue, SDValue>
2631AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc &SL,
2632 SDValue Src, SDNodeFlags Flags) const {
2633 if (!needsDenormHandlingF32(DAG, Src, Flags))
2634 return {};
2635
2636 MVT VT = MVT::f32;
2637 const fltSemantics &Semantics = APFloat::IEEEsingle();
2638 SDValue SmallestNormal =
2639 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2640
2641 SDValue IsLtSmallestNormal = DAG.getSetCC(
2642 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2643 SmallestNormal, ISD::SETOLT);
2644
2645 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2646 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2647 SDValue ScaleFactor =
2648 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2649
2650 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2651 return {ScaledInput, IsLtSmallestNormal};
2652}
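// The identity being set up: if x is below the smallest normal f32, then
// log2(x) == log2(x * 0x1.0p+32) - 32. The 2^32 multiply is exact, so the
// callers below only need to subtract a constant offset (rescaled for
// log/log10 as needed) whenever IsLtSmallestNormal was true.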
2653
2654SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
2655 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2656 // If we have to handle denormals, scale up the input and adjust the result.
2657
2658 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2659 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
2660
2661 SDLoc SL(Op);
2662 EVT VT = Op.getValueType();
2663 SDValue Src = Op.getOperand(0);
2664 SDNodeFlags Flags = Op->getFlags();
2665
2666 if (VT == MVT::f16) {
2667 // Nothing in half is a denormal when promoted to f32.
2668 assert(!Subtarget->has16BitInsts());
2669 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2670 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2671 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2672 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2673 }
2674
2675 auto [ScaledInput, IsLtSmallestNormal] =
2676 getScaledLogInput(DAG, SL, Src, Flags);
2677 if (!ScaledInput)
2678 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2679
2680 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2681
2682 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2683 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2684 SDValue ResultOffset =
2685 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2686 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2687}
2688
2689static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2690 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2691 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2692 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2693}
2694
2695SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
2696 SelectionDAG &DAG) const {
2697 SDValue X = Op.getOperand(0);
2698 EVT VT = Op.getValueType();
2699 SDNodeFlags Flags = Op->getFlags();
2700 SDLoc DL(Op);
2701
2702 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2703 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2704
2705 const auto &Options = getTargetMachine().Options;
2706 if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
2707 Options.ApproxFuncFPMath || Options.UnsafeFPMath) {
2708
2709 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2710 // Log and multiply in f32 is good enough for f16.
2711 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2712 }
2713
2714 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2715 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2716 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2717 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2718 }
2719
2720 return Lowered;
2721 }
2722
2723 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
2724 if (ScaledInput)
2725 X = ScaledInput;
2726
2727 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2728
2729 SDValue R;
2730 if (Subtarget->hasFastFMAF32()) {
2731 // c+cc are ln(2)/ln(10) to more than 49 bits
2732 const float c_log10 = 0x1.344134p-2f;
2733 const float cc_log10 = 0x1.09f79ep-26f;
2734
2735 // c + cc is ln(2) to more than 49 bits
2736 const float c_log = 0x1.62e42ep-1f;
2737 const float cc_log = 0x1.efa39ep-25f;
2738
2739 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2740 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2741
2742 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2743 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2744 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2745 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2746 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2747 } else {
2748 // ch+ct is ln(2)/ln(10) to more than 36 bits
2749 const float ch_log10 = 0x1.344000p-2f;
2750 const float ct_log10 = 0x1.3509f6p-18f;
2751
2752 // ch + ct is ln(2) to more than 36 bits
2753 const float ch_log = 0x1.62e000p-1f;
2754 const float ct_log = 0x1.0bfbe8p-15f;
2755
2756 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2757 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2758
2759 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2760 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2761 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2762 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2763 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2764
2765 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2766 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2767 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
2768 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2769 }
2770
2771 const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
2772 (Flags.hasNoInfs() || Options.NoInfsFPMath);
2773
2774 // TODO: Check if known finite from source value.
2775 if (!IsFiniteOnly) {
2776 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2777 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2778 }
2779
2780 if (IsScaled) {
2781 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2782 SDValue ShiftK =
2783 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2784 SDValue Shift =
2785 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2786 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2787 }
2788
2789 return R;
2790}
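// The two-constant scheme above evaluates y*ln(2) (or y*ln(2)/ln(10)) in
// better-than-f32 precision: with C + CC equal to the constant to ~49 bits,
// R = y*C and fma(y, C, -R) recovers the rounding error of that product,
// after which the y*CC tail is folded in. The non-FMA path instead splits y
// itself into high/low pieces (YH/YT) with the 0xfffff000 mask.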
2791
2792SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const {
2793 return LowerFLOGCommon(Op, DAG);
2794}
2795
2796// Do f32 fast-math expansion for flog2 or flog10. This is accurate enough for
2797// a promoted f16 operation.
2798SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
2799 SelectionDAG &DAG, bool IsLog10,
2800 SDNodeFlags Flags) const {
2801 EVT VT = Src.getValueType();
2802 unsigned LogOp =
2803 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2804
2805 double Log2BaseInverted =
2806 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
2807
2808 if (VT == MVT::f32) {
2809 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2810 if (ScaledInput) {
2811 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2812 SDValue ScaledResultOffset =
2813 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2814
2815 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2816
2817 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2818 ScaledResultOffset, Zero, Flags);
2819
2820 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2821
2822 if (Subtarget->hasFastFMAF32())
2823 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2824 Flags);
2825 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
2826 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2827 }
2828 }
2829
2830 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2831 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2832
2833 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2834 Flags);
2835}
2836
2837SDValue AMDGPUTargetLowering::LowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
2838 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2839 // If we have to handle denormals, scale up the input and adjust the result.
2840
2841 SDLoc SL(Op);
2842 EVT VT = Op.getValueType();
2843 SDValue Src = Op.getOperand(0);
2844 SDNodeFlags Flags = Op->getFlags();
2845
2846 if (VT == MVT::f16) {
2847 // Nothing in half is a denormal when promoted to f32.
2848 assert(!Subtarget->has16BitInsts());
2849 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2850 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
2851 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2852 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2853 }
2854
2855 assert(VT == MVT::f32);
2856
2857 if (!needsDenormHandlingF32(DAG, Src, Flags))
2858 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
2859
2860 // bool needs_scaling = x < -0x1.f80000p+6f;
2861 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
2862
2863 // -nextafter(128.0, -1)
2864 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
2865
2866 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2867
2868 SDValue NeedsScaling =
2869 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
2870
2871 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2872 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2873
2874 SDValue AddOffset =
2875 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
2876
2877 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
2878 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
2879
2880 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
2881 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2882 SDValue ResultScale =
2883 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
2884
2885 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
2886}
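// Scaling identity used above: exp2(x) == exp2(x + 64) * 0x1.0p-64. Inputs
// below the range-check constant would produce denormal (possibly flushed)
// results, so they are first shifted into the well-behaved range and the
// result is rescaled afterwards.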
2887
2888SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
2889 SelectionDAG &DAG,
2890 SDNodeFlags Flags) const {
2891 EVT VT = X.getValueType();
2892 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
2893
2894 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2895 // exp2(M_LOG2E_F * f);
2896 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags);
2897 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2898 : (unsigned)ISD::FEXP2,
2899 SL, VT, Mul, Flags);
2900 }
2901
2902 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2903
2904 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
2905 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2906
2907 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2908
2909 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2910
2911 SDValue AdjustedX =
2912 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2913
2914 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
2915
2916 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
2917
2918 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
2919 SDValue AdjustedResult =
2920 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
2921
2922 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
2923 Flags);
2924}
2925
2926/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
2927/// handled correctly.
2928SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
2929 SelectionDAG &DAG,
2930 SDNodeFlags Flags) const {
2931 const EVT VT = X.getValueType();
2932 const unsigned Exp2Op = VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2;
2933
2934 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2935 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
2936 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
2937 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
2938
2939 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
2940 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
2941 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
2942 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
2943 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
2944 }
2945
2946 // bool s = x < -0x1.2f7030p+5f;
2947 // x += s ? 0x1.0p+5f : 0.0f;
2948 // exp10 = exp2(x * 0x1.a92000p+1f) *
2949 // exp2(x * 0x1.4f0978p-11f) *
2950 // (s ? 0x1.9f623ep-107f : 1.0f);
2951
2952 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2953
2954 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
2955 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2956
2957 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
2958 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2959 SDValue AdjustedX =
2960 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2961
2962 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
2963 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
2964
2965 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
2966 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
2967 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
2968 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
2969
2970 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
2971
2972 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
2973 SDValue AdjustedResult =
2974 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
2975
2976 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
2977 Flags);
2978}
2979
2980SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2981 EVT VT = Op.getValueType();
2982 SDLoc SL(Op);
2983 SDValue X = Op.getOperand(0);
2984 SDNodeFlags Flags = Op->getFlags();
2985 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
2986
2987 if (VT.getScalarType() == MVT::f16) {
2988 // v_exp_f16 (fmul x, log2e)
2989 if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
2990 return lowerFEXPUnsafe(X, SL, DAG, Flags);
2991
2992 if (VT.isVector())
2993 return SDValue();
2994
2995 // exp(f16 x) ->
2996 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
2997
2998 // Nothing in half is a denormal when promoted to f32.
2999 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
3000 SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags);
3001 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
3002 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
3003 }
3004
3005 assert(VT == MVT::f32);
3006
3007 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3008 // library behavior. Also, is known-not-daz source sufficient?
3009 if (allowApproxFunc(DAG, Flags)) {
3010 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
3011 : lowerFEXPUnsafe(X, SL, DAG, Flags);
3012 }
3013
3014 // Algorithm:
3015 //
3016 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3017 //
3018 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3019 // n = 64*m + j, 0 <= j < 64
3020 //
3021 // e^x = 2^((64*m + j + f)/64)
3022 // = (2^m) * (2^(j/64)) * 2^(f/64)
3023 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3024 //
3025 // f = x*(64/ln(2)) - n
3026 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3027 //
3028 // e^x = (2^m) * (2^(j/64)) * e^r
3029 //
3030 // (2^(j/64)) is precomputed
3031 //
3032 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3033 // e^r = 1 + q
3034 //
3035 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3036 //
3037 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
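// Worked instance of the code below for x = 1.0 in base e: PH ~= 1.4427
// (x * log2(e)), E = roundeven(PH) = 1.0, A = (PH - E) + PL ~= 0.4427,
// v_exp_f32(A) ~= 1.3591, and ldexp(1.3591, 1) ~= 2.71828 = e^1.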
3038 SDNodeFlags FlagsNoContract = Flags;
3039 FlagsNoContract.setAllowContract(false);
3040
3041 SDValue PH, PL;
3042 if (Subtarget->hasFastFMAF32()) {
3043 const float c_exp = numbers::log2ef;
3044 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3045 const float c_exp10 = 0x1.a934f0p+1f;
3046 const float cc_exp10 = 0x1.2f346ep-24f;
3047
3048 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3049 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3050
3051 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3052 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
3053 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3054 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3055 } else {
3056 const float ch_exp = 0x1.714000p+0f;
3057 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3058
3059 const float ch_exp10 = 0x1.a92000p+1f;
3060 const float cl_exp10 = 0x1.4f0978p-11f;
3061
3062 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3063 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3064
3065 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3066 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3067 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3068 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3069 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3070
3071 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3072
3073 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3074 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3075 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3076 }
3077
3078 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3079
3080 // It is unsafe to contract this fsub into the PH multiply.
3081 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3082
3083 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3084 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3085 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3086
3087 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3088
3089 SDValue UnderflowCheckConst =
3090 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
3091
3092 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3093 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3094 SDValue Underflow =
3095 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3096
3097 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3098 const auto &Options = getTargetMachine().Options;
3099
3100 if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
3101 SDValue OverflowCheckConst =
3102 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
3103 SDValue Overflow =
3104 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
3105 SDValue Inf =
3106 DAG.getConstantFP(APFloat::getInf(APFloat::IEEEsingle()), SL, VT);
3107 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3108 }
3109
3110 return R;
3111}
3112
3113static bool isCtlzOpc(unsigned Opc) {
3114 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3115}
3116
3117static bool isCttzOpc(unsigned Opc) {
3118 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3119}
3120
3121SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
3122 SelectionDAG &DAG) const {
3123 auto SL = SDLoc(Op);
3124 auto Opc = Op.getOpcode();
3125 auto Arg = Op.getOperand(0u);
3126 auto ResultVT = Op.getValueType();
3127
3128 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3129 return {};
3130
3131 assert(isCtlzOpc(Opc));
3132 assert(ResultVT == Arg.getValueType());
3133
3134 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3135 SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
3136 SDValue NewOp;
3137
3138 if (Opc == ISD::CTLZ_ZERO_UNDEF) {
3139 NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
3140 NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
3141 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3142 } else {
3143 NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3144 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3145 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
3146 }
3147
3148 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3149}
3150
3151SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
3152 SDLoc SL(Op);
3153 SDValue Src = Op.getOperand(0);
3154
3155 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3156 bool Ctlz = isCtlzOpc(Op.getOpcode());
3157 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3158
3159 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3160 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3161 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3162
3163 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3164 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3165 // (cttz hi:lo) -> (umin (ffbl src), 32)
3166 // (ctlz_zero_undef src) -> (ffbh src)
3167 // (cttz_zero_undef src) -> (ffbl src)
3168
3169 // The 64-bit scalar version produces a 32-bit result:
3170 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3171 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3172 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3173 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3174 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3175 if (!ZeroUndef) {
3176 const SDValue ConstVal = DAG.getConstant(
3177 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3178 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3179 }
3180 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3181 }
3182
3183 SDValue Lo, Hi;
3184 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3185
3186 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3187 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3188
3189 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3190 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3191 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3192 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
3193
3194 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3195 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3196 if (Ctlz)
3197 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3198 else
3199 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3200
3201 SDValue NewOpr;
3202 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3203 if (!ZeroUndef) {
3204 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3205 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3206 }
3207
3208 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3209}
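// Worked instance of the split path: ctlz(i64 1) gives ffbh(Lo) = 31,
// uaddsat(31, 32) = 63; Hi == 0 so ffbh(Hi) = 0xffffffff; umin(63,
// 0xffffffff) = 63 and the final clamp against 64 leaves 63 as expected.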
3210
3211SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
3212 bool Signed) const {
3213 // The regular method of converting a 64-bit integer to a float roughly
3214 // consists of two steps: normalization and rounding. After normalization, the
3215 // conversion from a 64-bit integer to a float is essentially the same as the
3216 // one from a 32-bit integer. The only difference is that it has more
3217 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3218 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3219 // converted into the correct float number. The basic steps for the unsigned
3220 // conversion are illustrated in the following pseudo code:
3221 //
3222 // f32 uitofp(i64 u) {
3223 // i32 hi, lo = split(u);
3224 // // Only count the leading zeros in hi as we have native support of the
3225 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3226 // // reduced to a 32-bit one automatically.
3227 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3228 // u <<= shamt;
3229 // hi, lo = split(u);
3230 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3231 // // convert it as a 32-bit integer and scale the result back.
3232 // return uitofp(hi) * 2^(32 - shamt);
3233 // }
3234 //
3235 // The signed conversion follows the same principle but uses 'ffbh_i32' to
3236 // count sign bits instead. If 'ffbh_i32' is not available, the absolute
3237 // value is converted and the result negated based on the original sign bit.
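// A runnable scalar model of the unsigned path in the pseudo code above
// (illustrative only; uitofp64 is a made-up name):
//
//   #include <cmath>
//   #include <cstdint>
//   float uitofp64(uint64_t u) {
//     uint32_t hi = (uint32_t)(u >> 32);
//     int shamt = hi ? __builtin_clz(hi) : 32; // clz of hi, 32 if hi == 0
//     u <<= shamt;
//     hi = (uint32_t)(u >> 32);
//     hi |= (uint32_t)u ? 1u : 0u;             // sticky rounding bit from lo
//     return ldexpf((float)hi, 32 - shamt);    // scale back by 2^(32-shamt)
//   }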
3238
3239 SDLoc SL(Op);
3240 SDValue Src = Op.getOperand(0);
3241
3242 SDValue Lo, Hi;
3243 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3244 SDValue Sign;
3245 SDValue ShAmt;
3246 if (Signed && Subtarget->isGCN()) {
3247 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3248 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3249 // account. That is, the maximal shift is
3250 // - 32 if Lo and Hi have opposite signs;
3251 // - 33 if Lo and Hi have the same sign.
3252 //
3253 // Or, MaxShAmt = 33 + OppositeSign, where
3254 //
3255 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3256 // - -1 if Lo and Hi have opposite signs; and
3257 // - 0 otherwise.
3258 //
3259 // All in all, ShAmt is calculated as
3260 //
3261 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3262 //
3263 // or
3264 //
3265 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3266 //
3267 // to reduce the critical path.
3268 SDValue OppositeSign = DAG.getNode(
3269 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3270 DAG.getConstant(31, SL, MVT::i32));
3271 SDValue MaxShAmt =
3272 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3273 OppositeSign);
3274 // Count the leading sign bits.
3275 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3276 // Different from unsigned conversion, the shift should be one bit less to
3277 // preserve the sign bit.
3278 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3279 DAG.getConstant(1, SL, MVT::i32));
3280 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3281 } else {
3282 if (Signed) {
3283 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3284 // absolute value first.
3285 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3286 DAG.getConstant(63, SL, MVT::i64));
3287 SDValue Abs =
3288 DAG.getNode(ISD::XOR, SL, MVT::i64,
3289 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3290 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3291 }
3292 // Count the leading zeros.
3293 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3294 // The shift amount for signed integers is [0, 32].
3295 }
3296 // Normalize the given 64-bit integer.
3297 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3298 // Split it again.
3299 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3300 // Calculate the adjust bit for rounding.
3301 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3302 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3303 DAG.getConstant(1, SL, MVT::i32), Lo);
3304 // Get the 32-bit normalized integer.
3305 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3306 // Convert the normalized 32-bit integer into f32.
3307 unsigned Opc =
3308 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3309 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3310
3311 // Finally, need to scale back the converted floating number as the original
3312 // 64-bit integer is converted as a 32-bit one.
3313 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3314 ShAmt);
3315 // On GCN, use LDEXP directly.
3316 if (Subtarget->isGCN())
3317 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3318
3319 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3320 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3321 // exponent is enough to avoid overflowing into the sign bit.
3322 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3323 DAG.getConstant(23, SL, MVT::i32));
3324 SDValue IVal =
3325 DAG.getNode(ISD::ADD, SL, MVT::i32,
3326 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3327 if (Signed) {
3328 // Set the sign bit.
3329 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3330 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3331 DAG.getConstant(31, SL, MVT::i32));
3332 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3333 }
3334 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3335}
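// The exponent-add emulation of ldexp used above is exact while the result
// stays normal: adding ShAmt << 23 bumps the biased exponent field by ShAmt,
// e.g. bits(1.5f) + (3 << 23) == 0x41400000 == bits(12.0f).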
3336
3337SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3338 bool Signed) const {
3339 SDLoc SL(Op);
3340 SDValue Src = Op.getOperand(0);
3341
3342 SDValue Lo, Hi;
3343 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3344
3345 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3346 SL, MVT::f64, Hi);
3347
3348 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3349
3350 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3351 DAG.getConstant(32, SL, MVT::i32));
3352 // TODO: Should this propagate fast-math-flags?
3353 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3354}
3355
3356SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3357 SelectionDAG &DAG) const {
3358 // TODO: Factor out code common with LowerSINT_TO_FP.
3359 EVT DestVT = Op.getValueType();
3360 SDValue Src = Op.getOperand(0);
3361 EVT SrcVT = Src.getValueType();
3362
3363 if (SrcVT == MVT::i16) {
3364 if (DestVT == MVT::f16)
3365 return Op;
3366 SDLoc DL(Op);
3367
3368 // Promote src to i32
3369 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3370 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3371 }
3372
3373 if (DestVT == MVT::bf16) {
3374 SDLoc SL(Op);
3375 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3376 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3377 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3378 }
3379
3380 if (SrcVT != MVT::i64)
3381 return Op;
3382
3383 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3384 SDLoc DL(Op);
3385
3386 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3387 SDValue FPRoundFlag =
3388 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3389 SDValue FPRound =
3390 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3391
3392 return FPRound;
3393 }
3394
3395 if (DestVT == MVT::f32)
3396 return LowerINT_TO_FP32(Op, DAG, false);
3397
3398 assert(DestVT == MVT::f64);
3399 return LowerINT_TO_FP64(Op, DAG, false);
3400}
3401
3402SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3403 SelectionDAG &DAG) const {
3404 EVT DestVT = Op.getValueType();
3405
3406 SDValue Src = Op.getOperand(0);
3407 EVT SrcVT = Src.getValueType();
3408
3409 if (SrcVT == MVT::i16) {
3410 if (DestVT == MVT::f16)
3411 return Op;
3412
3413 SDLoc DL(Op);
3414 // Promote src to i32
3415 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3416 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3417 }
3418
3419 if (DestVT == MVT::bf16) {
3420 SDLoc SL(Op);
3421 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3422 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3423 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3424 }
3425
3426 if (SrcVT != MVT::i64)
3427 return Op;
3428
3429 // TODO: Factor out code common with LowerUINT_TO_FP.
3430
3431 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3432 SDLoc DL(Op);
3433 SDValue Src = Op.getOperand(0);
3434
3435 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3436 SDValue FPRoundFlag =
3437 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3438 SDValue FPRound =
3439 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3440
3441 return FPRound;
3442 }
3443
3444 if (DestVT == MVT::f32)
3445 return LowerINT_TO_FP32(Op, DAG, true);
3446
3447 assert(DestVT == MVT::f64);
3448 return LowerINT_TO_FP64(Op, DAG, true);
3449}
3450
3451 SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3452 bool Signed) const {
3453 SDLoc SL(Op);
3454
3455 SDValue Src = Op.getOperand(0);
3456 EVT SrcVT = Src.getValueType();
3457
3458 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3459
3460 // The basic idea of converting a floating point number into a pair of 32-bit
3461 // integers is illustrated as follows:
3462 //
3463 // tf := trunc(val);
3464 // hif := floor(tf * 2^-32);
3465 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3466 // hi := fptoi(hif);
3467 // lo := fptoi(lof);
3468 //
3469 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3470 SDValue Sign;
3471 if (Signed && SrcVT == MVT::f32) {
3472 // However, a 32-bit floating point number has only a 23-bit mantissa and
3473 // that is not enough to hold all the significant bits of `lof` if val is
3474 // negative. To avoid the loss of precision, we need to take the absolute
3475 // value after truncating and flip the result back based on the original
3476 // signedness.
3477 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3478 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3479 DAG.getConstant(31, SL, MVT::i32));
3480 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3481 }
3482
3483 SDValue K0, K1;
3484 if (SrcVT == MVT::f64) {
3485 K0 = DAG.getConstantFP(
3486 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3487 SrcVT);
3488 K1 = DAG.getConstantFP(
3489 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3490 SrcVT);
3491 } else {
3492 K0 = DAG.getConstantFP(
3493 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3494 K1 = DAG.getConstantFP(
3495 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3496 }
3497 // TODO: Should this propagate fast-math-flags?
3498 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3499
3500 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3501
3502 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3503
3504 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3505 : ISD::FP_TO_UINT,
3506 SL, MVT::i32, FloorMul);
3507 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3508
3509 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3510 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3511
3512 if (Signed && SrcVT == MVT::f32) {
3513 assert(Sign);
3514 // Flip the result based on the signedness, which is either all 0s or 1s.
3515 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3516 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3517 // r := xor(r, sign) - sign;
3518 Result =
3519 DAG.getNode(ISD::SUB, SL, MVT::i64,
3520 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3521 }
3522
3523 return Result;
3524}
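// Illustrative only, not part of the upstream file: a scalar sketch of the
// unsigned f64 path above (assumes IEEE-754 semantics, <cmath>/<cstdint>, and
// Val already in [0, 2^64); the helper name is hypothetical).
static uint64_t lowerF64ToU64Sketch(double Val) {
  double Tf = std::trunc(Val);             // FTRUNC
  double Hif = std::floor(Tf * 0x1p-32);   // FMUL by K0 (2^-32), then FFLOOR
  double Lof = std::fma(Hif, -0x1p32, Tf); // FMA with K1 (-2^32); Lof >= 0
  uint32_t Hi = (uint32_t)Hif;             // FP_TO_UINT
  uint32_t Lo = (uint32_t)Lof;             // FP_TO_UINT
  return ((uint64_t)Hi << 32) | Lo;        // BITCAST of the {Lo, Hi} vector
}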
3525
3526 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3527 SDLoc DL(Op);
3528 SDValue N0 = Op.getOperand(0);
3529
3530 // Convert to target node to get known bits
3531 if (N0.getValueType() == MVT::f32)
3532 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3533
3534 if (getTargetMachine().Options.UnsafeFPMath) {
3535 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3536 return SDValue();
3537 }
3538
3539 assert(N0.getSimpleValueType() == MVT::f64);
3540
3541 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3542 const unsigned ExpMask = 0x7ff;
3543 const unsigned ExpBiasf64 = 1023;
3544 const unsigned ExpBiasf16 = 15;
3545 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3546 SDValue One = DAG.getConstant(1, DL, MVT::i32);
3547 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
3548 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3549 DAG.getConstant(32, DL, MVT::i64));
3550 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3551 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3552 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3553 DAG.getConstant(20, DL, MVT::i64));
3554 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3555 DAG.getConstant(ExpMask, DL, MVT::i32));
3556 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3557 // add the f16 bias (15) to get the biased exponent for the f16 format.
3558 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3559 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
3560
3561 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3562 DAG.getConstant(8, DL, MVT::i32));
3563 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3564 DAG.getConstant(0xffe, DL, MVT::i32));
3565
3566 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3567 DAG.getConstant(0x1ff, DL, MVT::i32));
3568 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3569
3570 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3571 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3572
3573 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3574 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3575 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3576 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
3577
3578 // N = M | (E << 12);
3579 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3580 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3581 DAG.getConstant(12, DL, MVT::i32)));
3582
3583 // B = clamp(1-E, 0, 13);
3584 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3585 One, E);
3586 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3587 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3588 DAG.getConstant(13, DL, MVT::i32));
3589
3590 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3591 DAG.getConstant(0x1000, DL, MVT::i32));
3592
3593 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3594 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3595 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3596 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3597
3598 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3599 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3600 DAG.getConstant(0x7, DL, MVT::i32));
3601 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3602 DAG.getConstant(2, DL, MVT::i32));
3603 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3604 One, Zero, ISD::SETEQ);
3605 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3606 One, Zero, ISD::SETGT);
3607 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3608 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3609
3610 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3611 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3612 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3613 I, V, ISD::SETEQ);
3614
3615 // Extract the sign bit.
3616 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3617 DAG.getConstant(16, DL, MVT::i32));
3618 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3619 DAG.getConstant(0x8000, DL, MVT::i32));
3620
3621 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3622 return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
3623}
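// Worked example (annotation, not in the upstream file): for N0 = f64 1.0
// (0x3FF0000000000000), UH = 0x3FF00000, so E = 1023 - 1023 + 15 = 15 and
// M = 0; V becomes (15 << 12) >> 2 = 0x3C00, the f16 encoding of 1.0. The
// E == 1039 compare catches Inf/NaN inputs (2047 - 1023 + 15 = 1039), which
// map to I, i.e. 0x7c00 with 0x0200 also set when the mantissa is nonzero.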
3624
3625 SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
3626 SelectionDAG &DAG) const {
3627 SDValue Src = Op.getOperand(0);
3628 unsigned OpOpcode = Op.getOpcode();
3629 EVT SrcVT = Src.getValueType();
3630 EVT DestVT = Op.getValueType();
3631
3632 // Will be selected natively
3633 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3634 return Op;
3635
3636 if (SrcVT == MVT::bf16) {
3637 SDLoc DL(Op);
3638 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3639 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
3640 }
3641
3642 // Promote i16 to i32
3643 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3644 SDLoc DL(Op);
3645
3646 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3647 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3648 }
3649
3650 if (DestVT != MVT::i64)
3651 return Op;
3652
3653 if (SrcVT == MVT::f16 ||
3654 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3655 SDLoc DL(Op);
3656
3657 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3658 unsigned Ext =
3659 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3660 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3661 }
3662
3663 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3664 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3665
3666 return SDValue();
3667}
3668
3669 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3670 SelectionDAG &DAG) const {
3671 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3672 MVT VT = Op.getSimpleValueType();
3673 MVT ScalarVT = VT.getScalarType();
3674
3675 assert(VT.isVector());
3676
3677 SDValue Src = Op.getOperand(0);
3678 SDLoc DL(Op);
3679
3680 // TODO: Don't scalarize on Evergreen?
3681 unsigned NElts = VT.getVectorNumElements();
3682 SmallVector<SDValue, 8> Args;
3683 DAG.ExtractVectorElements(Src, Args, 0, NElts);
3684
3685 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3686 for (unsigned I = 0; I < NElts; ++I)
3687 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
3688
3689 return DAG.getBuildVector(VT, DL, Args);
3690}
3691
3692//===----------------------------------------------------------------------===//
3693// Custom DAG optimizations
3694//===----------------------------------------------------------------------===//
3695
3696static bool isU24(SDValue Op, SelectionDAG &DAG) {
3697 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3698}
3699
3700static bool isI24(SDValue Op, SelectionDAG &DAG) {
3701 EVT VT = Op.getValueType();
3702 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3703 // as unsigned 24-bit values.
3704 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3705 }
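// Example (annotation, not in the upstream file): an i32 value produced by
// (zext i16:x) has numBitsUnsigned <= 16 <= 24, so both isU24 and isI24 hold
// and a plain 32-bit multiply of two such values can be selected as the
// single-instruction MUL_U24 / MUL_I24 forms used below.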
3706
3707 SDValue AMDGPUTargetLowering::simplifyMul24(SDNode *Node24,
3708 DAGCombinerInfo &DCI) const {
3709 SelectionDAG &DAG = DCI.DAG;
3710 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3711 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3712
3713 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
3714 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
3715 unsigned NewOpcode = Node24->getOpcode();
3716 if (IsIntrin) {
3717 unsigned IID = Node24->getConstantOperandVal(0);
3718 switch (IID) {
3719 case Intrinsic::amdgcn_mul_i24:
3720 NewOpcode = AMDGPUISD::MUL_I24;
3721 break;
3722 case Intrinsic::amdgcn_mul_u24:
3723 NewOpcode = AMDGPUISD::MUL_U24;
3724 break;
3725 case Intrinsic::amdgcn_mulhi_i24:
3726 NewOpcode = AMDGPUISD::MULHI_I24;
3727 break;
3728 case Intrinsic::amdgcn_mulhi_u24:
3729 NewOpcode = AMDGPUISD::MULHI_U24;
3730 break;
3731 default:
3732 llvm_unreachable("Expected 24-bit mul intrinsic");
3733 }
3734 }
3735
3736 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
3737
3738 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3739 // the operands to have other uses, but will only perform simplifications that
3740 // involve bypassing some nodes for this user.
3741 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
3742 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
3743 if (DemandedLHS || DemandedRHS)
3744 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
3745 DemandedLHS ? DemandedLHS : LHS,
3746 DemandedRHS ? DemandedRHS : RHS);
3747
3748 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3749 // operands if this node is the only user.
3750 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
3751 return SDValue(Node24, 0);
3752 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
3753 return SDValue(Node24, 0);
3754
3755 return SDValue();
3756}
3757
3758template <typename IntTy>
3759 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3760 uint32_t Width, const SDLoc &DL) {
3761 if (Width + Offset < 32) {
3762 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3763 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3764 return DAG.getConstant(Result, DL, MVT::i32);
3765 }
3766
3767 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3768}
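// Worked example (annotation, not in the upstream file): a signed BFE of
// Src0 = 0x0000ff00 with Offset = 8, Width = 8 takes the first branch
// (8 + 8 < 32): Shl = 0x0000ff00 << 16 = 0xff000000, and the arithmetic
// shift right by 24 yields -1, i.e. the extracted byte 0xff sign-extended.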
3769
3770static bool hasVolatileUser(SDNode *Val) {
3771 for (SDNode *U : Val->uses()) {
3772 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
3773 if (M->isVolatile())
3774 return true;
3775 }
3776 }
3777
3778 return false;
3779}
3780
3781 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3782 // i32 vectors are the canonical memory type.
3783 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3784 return false;
3785
3786 if (!VT.isByteSized())
3787 return false;
3788
3789 unsigned Size = VT.getStoreSize();
3790
3791 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3792 return false;
3793
3794 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3795 return false;
3796
3797 return true;
3798}
3799
3800// Replace load of an illegal type with a store of a bitcast to a friendlier
3801// type.
3802 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3803 DAGCombinerInfo &DCI) const {
3804 if (!DCI.isBeforeLegalize())
3805 return SDValue();
3806
3807 LoadSDNode *LN = cast<LoadSDNode>(N);
3808 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
3809 return SDValue();
3810
3811 SDLoc SL(N);
3812 SelectionDAG &DAG = DCI.DAG;
3813 EVT VT = LN->getMemoryVT();
3814
3815 unsigned Size = VT.getStoreSize();
3816 Align Alignment = LN->getAlign();
3817 if (Alignment < Size && isTypeLegal(VT)) {
3818 unsigned IsFast;
3819 unsigned AS = LN->getAddressSpace();
3820
3821 // Expand unaligned loads earlier than legalization. Due to visitation order
3822 // problems during legalization, the emitted instructions to pack and unpack
3823 // the bytes again are not eliminated in the case of an unaligned copy.
3824 if (!allowsMisalignedMemoryAccesses(
3825 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3826 if (VT.isVector())
3827 return SplitVectorLoad(SDValue(LN, 0), DAG);
3828
3829 SDValue Ops[2];
3830 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3831
3832 return DAG.getMergeValues(Ops, SDLoc(N));
3833 }
3834
3835 if (!IsFast)
3836 return SDValue();
3837 }
3838
3839 if (!shouldCombineMemoryType(VT))
3840 return SDValue();
3841
3842 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3843
3844 SDValue NewLoad
3845 = DAG.getLoad(NewVT, SL, LN->getChain(),
3846 LN->getBasePtr(), LN->getMemOperand());
3847
3848 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3849 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3850 return SDValue(N, 0);
3851}
3852
3853// Replace store of an illegal type with a store of a bitcast to a friendlier
3854// type.
3855 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3856 DAGCombinerInfo &DCI) const {
3857 if (!DCI.isBeforeLegalize())
3858 return SDValue();
3859
3860 StoreSDNode *SN = cast<StoreSDNode>(N);
3861 if (!SN->isSimple() || !ISD::isNormalStore(SN))
3862 return SDValue();
3863
3864 EVT VT = SN->getMemoryVT();
3865 unsigned Size = VT.getStoreSize();
3866
3867 SDLoc SL(N);
3868 SelectionDAG &DAG = DCI.DAG;
3869 Align Alignment = SN->getAlign();
3870 if (Alignment < Size && isTypeLegal(VT)) {
3871 unsigned IsFast;
3872 unsigned AS = SN->getAddressSpace();
3873
3874 // Expand unaligned stores earlier than legalization. Due to visitation
3875 // order problems during legalization, the emitted instructions to pack and
3876 // unpack the bytes again are not eliminated in the case of an unaligned
3877 // copy.
3878 if (!allowsMisalignedMemoryAccesses(
3879 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3880 if (VT.isVector())
3881 return SplitVectorStore(SDValue(SN, 0), DAG);
3882
3883 return expandUnalignedStore(SN, DAG);
3884 }
3885
3886 if (!IsFast)
3887 return SDValue();
3888 }
3889
3890 if (!shouldCombineMemoryType(VT))
3891 return SDValue();
3892
3893 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3894 SDValue Val = SN->getValue();
3895
3896 //DCI.AddToWorklist(Val.getNode());
3897
3898 bool OtherUses = !Val.hasOneUse();
3899 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3900 if (OtherUses) {
3901 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3902 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3903 }
3904
3905 return DAG.getStore(SN->getChain(), SL, CastVal,
3906 SN->getBasePtr(), SN->getMemOperand());
3907}
3908
3909// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3910// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3911// issues.
3912 SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3913 DAGCombinerInfo &DCI) const {
3914 SelectionDAG &DAG = DCI.DAG;
3915 SDValue N0 = N->getOperand(0);
3916
3917 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3918 // (vt2 (truncate (assertzext vt0:x, vt1)))
3919 if (N0.getOpcode() == ISD::TRUNCATE) {
3920 SDValue N1 = N->getOperand(1);
3921 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3922 SDLoc SL(N);
3923
3924 SDValue Src = N0.getOperand(0);
3925 EVT SrcVT = Src.getValueType();
3926 if (SrcVT.bitsGE(ExtVT)) {
3927 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
3928 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
3929 }
3930 }
3931
3932 return SDValue();
3933}
3934
3935 SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3936 SDNode *N, DAGCombinerInfo &DCI) const {
3937 unsigned IID = N->getConstantOperandVal(0);
3938 switch (IID) {
3939 case Intrinsic::amdgcn_mul_i24:
3940 case Intrinsic::amdgcn_mul_u24:
3941 case Intrinsic::amdgcn_mulhi_i24:
3942 case Intrinsic::amdgcn_mulhi_u24:
3943 return simplifyMul24(N, DCI);
3944 case Intrinsic::amdgcn_fract:
3945 case Intrinsic::amdgcn_rsq:
3946 case Intrinsic::amdgcn_rcp_legacy:
3947 case Intrinsic::amdgcn_rsq_legacy:
3948 case Intrinsic::amdgcn_rsq_clamp: {
3949 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3950 SDValue Src = N->getOperand(1);
3951 return Src.isUndef() ? Src : SDValue();
3952 }
3953 case Intrinsic::amdgcn_frexp_exp: {
3954 // frexp_exp (fneg x) -> frexp_exp x
3955 // frexp_exp (fabs x) -> frexp_exp x
3956 // frexp_exp (fneg (fabs x)) -> frexp_exp x
3957 SDValue Src = N->getOperand(1);
3958 SDValue PeekSign = peekFPSignOps(Src);
3959 if (PeekSign == Src)
3960 return SDValue();
3961 return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
3962 0);
3963 }
3964 default:
3965 return SDValue();
3966 }
3967}
3968
3969/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
3970/// binary operation \p Opc to it with the corresponding constant operands.
3971 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3972 DAGCombinerInfo &DCI, const SDLoc &SL,
3973 unsigned Opc, SDValue LHS,
3974 uint32_t ValLo, uint32_t ValHi) const {
3975 SelectionDAG &DAG = DCI.DAG;
3976 SDValue Lo, Hi;
3977 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3978
3979 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3980 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3981
3982 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3983 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3984
3985 // Re-visit the ands. It's possible we eliminated one of them and it could
3986 // simplify the vector.
3987 DCI.AddToWorklist(Lo.getNode());
3988 DCI.AddToWorklist(Hi.getNode());
3989
3990 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3991 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3992}
3993
3994 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3995 DAGCombinerInfo &DCI) const {
3996 EVT VT = N->getValueType(0);
3997
3998 ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3999 if (!RHS)
4000 return SDValue();
4001
4002 SDValue LHS = N->getOperand(0);
4003 unsigned RHSVal = RHS->getZExtValue();
4004 if (!RHSVal)
4005 return LHS;
4006
4007 SDLoc SL(N);
4008 SelectionDAG &DAG = DCI.DAG;
4009
4010 switch (LHS->getOpcode()) {
4011 default:
4012 break;
4013 case ISD::ZERO_EXTEND:
4014 case ISD::SIGN_EXTEND:
4015 case ISD::ANY_EXTEND: {
4016 SDValue X = LHS->getOperand(0);
4017
4018 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
4019 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
4020 // Prefer build_vector as the canonical form if packed types are legal.
4021 // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
4022 SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
4023 { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
4024 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
4025 }
4026
4027 // shl (ext x) => zext (shl x), if shift does not overflow int
4028 if (VT != MVT::i64)
4029 break;
4030 KnownBits Known = DAG.computeKnownBits(X);
4031 unsigned LZ = Known.countMinLeadingZeros();
4032 if (LZ < RHSVal)
4033 break;
4034 EVT XVT = X.getValueType();
4035 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
4036 return DAG.getZExtOrTrunc(Shl, SL, VT);
4037 }
4038 }
4039
4040 if (VT != MVT::i64)
4041 return SDValue();
4042
4043 // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
4044
4045 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4046 // common case, splitting this into a move and a 32-bit shift is faster and
4047 // the same code size.
4048 if (RHSVal < 32)
4049 return SDValue();
4050
4051 SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
4052
4053 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
4054 SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
4055
4056 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4057
4058 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
4059 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4060}
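// Worked example (annotation, not in the upstream file): (shl i64:x, 36)
// becomes (bitcast (build_vector 0, (shl (trunc x), 4))): the low 32 result
// bits are known zero, so only a 32-bit shift by 36 - 32 = 4 remains.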
4061
4062 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
4063 DAGCombinerInfo &DCI) const {
4064 if (N->getValueType(0) != MVT::i64)
4065 return SDValue();
4066
4067 const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
4068 if (!RHS)
4069 return SDValue();
4070
4071 SelectionDAG &DAG = DCI.DAG;
4072 SDLoc SL(N);
4073 unsigned RHSVal = RHS->getZExtValue();
4074
4075 // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
4076 if (RHSVal == 32) {
4077 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
4078 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4079 DAG.getConstant(31, SL, MVT::i32));
4080
4081 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
4082 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
4083 }
4084
4085 // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
4086 if (RHSVal == 63) {
4087 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
4088 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
4089 DAG.getConstant(31, SL, MVT::i32));
4090 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
4091 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
4092 }
4093
4094 return SDValue();
4095}
4096
4097 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
4098 DAGCombinerInfo &DCI) const {
4099 auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
4100 if (!RHS)
4101 return SDValue();
4102
4103 EVT VT = N->getValueType(0);
4104 SDValue LHS = N->getOperand(0);
4105 unsigned ShiftAmt = RHS->getZExtValue();
4106 SelectionDAG &DAG = DCI.DAG;
4107 SDLoc SL(N);
4108
4109 // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
4110 // this improves the ability to match BFE patterns in isel.
4111 if (LHS.getOpcode() == ISD::AND) {
4112 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
4113 unsigned MaskIdx, MaskLen;
4114 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4115 MaskIdx == ShiftAmt) {
4116 return DAG.getNode(
4117 ISD::AND, SL, VT,
4118 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
4119 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
4120 }
4121 }
4122 }
4123
4124 if (VT != MVT::i64)
4125 return SDValue();
4126
4127 if (ShiftAmt < 32)
4128 return SDValue();
4129
4130 // srl i64:x, C for C >= 32
4131 // =>
4132 // build_pair (srl hi_32(x), C - 32), 0
4133 SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
4134
4135 SDValue Hi = getHiHalf64(LHS, DAG);
4136
4137 SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
4138 SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
4139
4140 SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
4141
4142 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
4143}
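// Worked example (annotation, not in the upstream file): (srl i64:x, 40)
// becomes (bitcast (build_vector (srl hi_32(x), 8), 0)): bits 32..63 of the
// result are known zero, leaving one 32-bit shift by 40 - 32 = 8.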
4144
4145 SDValue AMDGPUTargetLowering::performTruncateCombine(
4146 SDNode *N, DAGCombinerInfo &DCI) const {
4147 SDLoc SL(N);
4148 SelectionDAG &DAG = DCI.DAG;
4149 EVT VT = N->getValueType(0);
4150 SDValue Src = N->getOperand(0);
4151
4152 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4153 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4154 SDValue Vec = Src.getOperand(0);
4155 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4156 SDValue Elt0 = Vec.getOperand(0);
4157 EVT EltVT = Elt0.getValueType();
4158 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4159 if (EltVT.isFloatingPoint()) {
4160 Elt0 = DAG.getNode(ISD::BITCAST, SL,
4161 EltVT.changeTypeToInteger(), Elt0);
4162 }
4163
4164 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
4165 }
4166 }
4167 }
4168
4169 // Equivalent of above for accessing the high element of a vector as an
4170 // integer operation.
4171 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
4172 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4173 if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
4174 if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
4175 SDValue BV = stripBitcast(Src.getOperand(0));
4176 if (BV.getOpcode() == ISD::BUILD_VECTOR &&
4177 BV.getValueType().getVectorNumElements() == 2) {
4178 SDValue SrcElt = BV.getOperand(1);
4179 EVT SrcEltVT = SrcElt.getValueType();
4180 if (SrcEltVT.isFloatingPoint()) {
4181 SrcElt = DAG.getNode(ISD::BITCAST, SL,
4182 SrcEltVT.changeTypeToInteger(), SrcElt);
4183 }
4184
4185 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
4186 }
4187 }
4188 }
4189 }
4190
4191 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4192 //
4193 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4194 // i16 (trunc (srl (i32 (trunc x), K)))
4195 if (VT.getScalarSizeInBits() < 32) {
4196 EVT SrcVT = Src.getValueType();
4197 if (SrcVT.getScalarSizeInBits() > 32 &&
4198 (Src.getOpcode() == ISD::SRL ||
4199 Src.getOpcode() == ISD::SRA ||
4200 Src.getOpcode() == ISD::SHL)) {
4201 SDValue Amt = Src.getOperand(1);
4202 KnownBits Known = DAG.computeKnownBits(Amt);
4203
4204 // - For left shifts, do the transform as long as the shift
4205 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4206 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4207 // losing information stored in the high bits when truncating.
4208 const unsigned MaxCstSize =
4209 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4210 if (Known.getMaxValue().ule(MaxCstSize)) {
4211 EVT MidVT = VT.isVector() ?
4212 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4213 VT.getVectorNumElements()) : MVT::i32;
4214
4215 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
4216 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
4217 Src.getOperand(0));
4218 DCI.AddToWorklist(Trunc.getNode());
4219
4220 if (Amt.getValueType() != NewShiftVT) {
4221 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
4222 DCI.AddToWorklist(Amt.getNode());
4223 }
4224
4225 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
4226 Trunc, Amt);
4227 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
4228 }
4229 }
4230 }
4231
4232 return SDValue();
4233}
4234
4235// We need to specifically handle i64 mul here to avoid unnecessary conversion
4236// instructions. If we only match on the legalized i64 mul expansion,
4237// SimplifyDemandedBits will be unable to remove them because there will be
4238// multiple uses due to the separate mul + mulh[su].
4239static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4240 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4241 if (Size <= 32) {
4242 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4243 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
4244 }
4245
4246 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4247 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4248
4249 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
4250 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
4251
4252 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
4253}
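// Note (annotation, not in the upstream file): the Size > 32 path is exact
// because two 24-bit operands produce at most a 48-bit product, so the
// {MUL_*24, MULHI_*24} pair reconstructs all 64 bits of the widened multiply
// across the BUILD_PAIR.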
4254
4255/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4256/// return SDValue().
4257static SDValue getAddOneOp(const SDNode *V) {
4258 if (V->getOpcode() != ISD::ADD)
4259 return SDValue();
4260
4261 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
4262}
4263
4264 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
4265 DAGCombinerInfo &DCI) const {
4266 assert(N->getOpcode() == ISD::MUL);
4267 EVT VT = N->getValueType(0);
4268
4269 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4270 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4271 // unnecessarily). isDivergent() is used as an approximation of whether the
4272 // value is in an SGPR.
4273 if (!N->isDivergent())
4274 return SDValue();
4275
4276 unsigned Size = VT.getSizeInBits();
4277 if (VT.isVector() || Size > 64)
4278 return SDValue();
4279
4280 SelectionDAG &DAG = DCI.DAG;
4281 SDLoc DL(N);
4282
4283 SDValue N0 = N->getOperand(0);
4284 SDValue N1 = N->getOperand(1);
4285
4286 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
4287 // matching.
4288
4289 // mul x, (add y, 1) -> add (mul x, y), x
4290 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4291 SDValue AddOp = getAddOneOp(V.getNode());
4292 if (!AddOp)
4293 return SDValue();
4294
4295 if (V.hasOneUse() || all_of(V->uses(), [](const SDNode *U) -> bool {
4296 return U->getOpcode() == ISD::MUL;
4297 }))
4298 return AddOp;
4299
4300 return SDValue();
4301 };
4302
4303 // FIXME: The selection pattern is not properly checking for commuted
4304 // operands, so we have to place the mul in the LHS
4305 if (SDValue MulOper = IsFoldableAdd(N0)) {
4306 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
4307 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
4308 }
4309
4310 if (SDValue MulOper = IsFoldableAdd(N1)) {
4311 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
4312 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
4313 }
4314
4315 // There are i16 integer mul/mad.
4316 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
4317 return SDValue();
4318
4319 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4320 // in the source into any_extends if the result of the mul is truncated. Since
4321 // we can assume the high bits are whatever we want, use the underlying value
4322 // to avoid the unknown high bits from interfering.
4323 if (N0.getOpcode() == ISD::ANY_EXTEND)
4324 N0 = N0.getOperand(0);
4325
4326 if (N1.getOpcode() == ISD::ANY_EXTEND)
4327 N1 = N1.getOperand(0);
4328
4329 SDValue Mul;
4330
4331 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4332 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4333 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4334 Mul = getMul24(DAG, DL, N0, N1, Size, false);
4335 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4336 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4337 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4338 Mul = getMul24(DAG, DL, N0, N1, Size, true);
4339 } else {
4340 return SDValue();
4341 }
4342
4343 // We need to use sext even for MUL_U24, because MUL_U24 is used
4344 // for signed multiply of 8 and 16-bit types.
4345 return DAG.getSExtOrTrunc(Mul, DL, VT);
4346}
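// Worked example (annotation, not in the upstream file): with x = 3 and
// y = 4, (mul x, (add y, 1)) = 15 and (add (mul x, y), x) = 12 + 3 = 15; the
// rewritten form exposes a multiply feeding an add, which selection can fuse
// into a mad/mad24-style instruction.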
4347
4348SDValue
4349 AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
4350 DAGCombinerInfo &DCI) const {
4351 if (N->getValueType(0) != MVT::i32)
4352 return SDValue();
4353
4354 SelectionDAG &DAG = DCI.DAG;
4355 SDLoc DL(N);
4356
4357 SDValue N0 = N->getOperand(0);
4358 SDValue N1 = N->getOperand(1);
4359
4360 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4361 // in the source into any_extends if the result of the mul is truncated. Since
4362 // we can assume the high bits are whatever we want, use the underlying value
4363 // to avoid the unknown high bits from interfering.
4364 if (N0.getOpcode() == ISD::ANY_EXTEND)
4365 N0 = N0.getOperand(0);
4366 if (N1.getOpcode() == ISD::ANY_EXTEND)
4367 N1 = N1.getOperand(0);
4368
4369 // Try to use two fast 24-bit multiplies (one for each half of the result)
4370 // instead of one slow extending multiply.
4371 unsigned LoOpcode, HiOpcode;
4372 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4373 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4374 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4375 LoOpcode = AMDGPUISD::MUL_U24;
4376 HiOpcode = AMDGPUISD::MULHI_U24;
4377 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4378 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4379 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4380 LoOpcode = AMDGPUISD::MUL_I24;
4381 HiOpcode = AMDGPUISD::MULHI_I24;
4382 } else {
4383 return SDValue();
4384 }
4385
4386 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
4387 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
4388 DCI.CombineTo(N, Lo, Hi);
4389 return SDValue(N, 0);
4390}
4391
4392 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
4393 DAGCombinerInfo &DCI) const {
4394 EVT VT = N->getValueType(0);
4395
4396 if (!Subtarget->hasMulI24() || VT.isVector())
4397 return SDValue();
4398
4399 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4400 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4401 // unnecessarily). isDivergent() is used as an approximation of whether the
4402 // value is in an SGPR.
4403 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4404 // valu op anyway)
4405 if (Subtarget->hasSMulHi() && !N->isDivergent())
4406 return SDValue();
4407
4408 SelectionDAG &DAG = DCI.DAG;
4409 SDLoc DL(N);
4410
4411 SDValue N0 = N->getOperand(0);
4412 SDValue N1 = N->getOperand(1);
4413
4414 if (!isI24(N0, DAG) || !isI24(N1, DAG))
4415 return SDValue();
4416
4417 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4418 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4419
4420 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
4421 DCI.AddToWorklist(Mulhi.getNode());
4422 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
4423}
4424
4425 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
4426 DAGCombinerInfo &DCI) const {
4427 EVT VT = N->getValueType(0);
4428
4429 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
4430 return SDValue();
4431
4432 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4433 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4434 // unnecessarily). isDivergent() is used as an approximation of whether the
4435 // value is in an SGPR.
4436 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4437 // valu op anyway)
4438 if (Subtarget->hasSMulHi() && !N->isDivergent())
4439 return SDValue();
4440
4441 SelectionDAG &DAG = DCI.DAG;
4442 SDLoc DL(N);
4443
4444 SDValue N0 = N->getOperand(0);
4445 SDValue N1 = N->getOperand(1);
4446
4447 if (!isU24(N0, DAG) || !isU24(N1, DAG))
4448 return SDValue();
4449
4450 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4451 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4452
4453 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
4454 DCI.AddToWorklist(Mulhi.getNode());
4455 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
4456}
4457
4458SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4459 SDValue Op,
4460 const SDLoc &DL,
4461 unsigned Opc) const {
4462 EVT VT = Op.getValueType();
4463 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
4464 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
4465 LegalVT != MVT::i16))
4466 return SDValue();
4467
4468 if (VT != MVT::i32)
4469 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
4470
4471 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
4472 if (VT != MVT::i32)
4473 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
4474
4475 return FFBX;
4476}
4477
4478// The native instructions return -1 on 0 input. Optimize out a select that
4479// produces -1 on 0.
4480//
4481// TODO: If zero is not undef, we could also do this if the output is compared
4482// against the bitwidth.
4483//
4484// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
4485 SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
4486 SDValue LHS, SDValue RHS,
4487 DAGCombinerInfo &DCI) const {
4488 if (!isNullConstant(Cond.getOperand(1)))
4489 return SDValue();
4490
4491 SelectionDAG &DAG = DCI.DAG;
4492 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
4493 SDValue CmpLHS = Cond.getOperand(0);
4494
4495 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4496 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4497 if (CCOpcode == ISD::SETEQ &&
4498 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
4499 RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
4500 unsigned Opc =
4501 isCtlzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
4502 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4503 }
4504
4505 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4506 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4507 if (CCOpcode == ISD::SETNE &&
4508 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
4509 LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
4510 unsigned Opc =
4511 isCtlzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
4512
4513 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4514 }
4515
4516 return SDValue();
4517}
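// Worked example (annotation, not in the upstream file): for the IR
//   %z = call i32 @llvm.ctlz.i32(i32 %x, i1 true)
//   %c = icmp eq i32 %x, 0
//   %r = select i1 %c, i32 -1, i32 %z
// the select folds to a single FFBH_U32 of %x, because the native
// instruction already returns -1 (all ones) for a zero input.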
4518
4519 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
4520 unsigned Op,
4521 const SDLoc &SL,
4522 SDValue Cond,
4523 SDValue N1,
4524 SDValue N2) {
4525 SelectionDAG &DAG = DCI.DAG;
4526 EVT VT = N1.getValueType();
4527
4528 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
4529 N1.getOperand(0), N2.getOperand(0));
4530 DCI.AddToWorklist(NewSelect.getNode());
4531 return DAG.getNode(Op, SL, VT, NewSelect);
4532}
4533
4534// Pull a free FP operation out of a select so it may fold into uses.
4535//
4536// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
4537// select c, (fneg x), k -> fneg (select c, x, (fneg k))
4538//
4539// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
4540// select c, (fabs x), +k -> fabs (select c, x, k)
4541SDValue
4542 AMDGPUTargetLowering::foldFreeOpFromSelect(DAGCombinerInfo &DCI,
4543 SDValue N) const {
4544 SelectionDAG &DAG = DCI.DAG;
4545 SDValue Cond = N.getOperand(0);
4546 SDValue LHS = N.getOperand(1);
4547 SDValue RHS = N.getOperand(2);
4548
4549 EVT VT = N.getValueType();
4550 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
4551 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
4552 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
4553 return SDValue();
4554
4555 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
4556 SDLoc(N), Cond, LHS, RHS);
4557 }
4558
4559 bool Inv = false;
4560 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
4561 std::swap(LHS, RHS);
4562 Inv = true;
4563 }
4564
4565 // TODO: Support vector constants.
4566 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
4567 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
4568 !selectSupportsSourceMods(N.getNode())) {
4569 SDLoc SL(N);
4570 // If one side is an fneg/fabs and the other is a constant, we can push the
4571 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
4572 SDValue NewLHS = LHS.getOperand(0);
4573 SDValue NewRHS = RHS;
4574
4575 // Careful: if the neg can be folded up, don't try to pull it back down.
4576 bool ShouldFoldNeg = true;
4577
4578 if (NewLHS.hasOneUse()) {
4579 unsigned Opc = NewLHS.getOpcode();
4580 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
4581 ShouldFoldNeg = false;
4582 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
4583 ShouldFoldNeg = false;
4584 }
4585
4586 if (ShouldFoldNeg) {
4587 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
4588 return SDValue();
4589
4590 // We're going to be forced to use a source modifier anyway, there's no
4591 // point to pulling the negate out unless we can get a size reduction by
4592 // negating the constant.
4593 //
4594 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
4595 // about cheaper constants.
4596 if (NewLHS.getOpcode() == ISD::FABS &&
4597 getConstantNegateCost(CRHS) != NegatibleCost::Cheaper)
4598 return SDValue();
4599
4600 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
4601 return SDValue();
4602
4603 if (LHS.getOpcode() == ISD::FNEG)
4604 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4605
4606 if (Inv)
4607 std::swap(NewLHS, NewRHS);
4608
4609 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
4610 Cond, NewLHS, NewRHS);
4611 DCI.AddToWorklist(NewSelect.getNode());
4612 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
4613 }
4614 }
4615
4616 return SDValue();
4617}
4618
4619 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
4620 DAGCombinerInfo &DCI) const {
4621 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
4622 return Folded;
4623
4624 SDValue Cond = N->getOperand(0);
4625 if (Cond.getOpcode() != ISD::SETCC)
4626 return SDValue();
4627
4628 EVT VT = N->getValueType(0);
4629 SDValue LHS = Cond.getOperand(0);
4630 SDValue RHS = Cond.getOperand(1);
4631 SDValue CC = Cond.getOperand(2);
4632
4633 SDValue True = N->getOperand(1);
4634 SDValue False = N->getOperand(2);
4635
4636 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
4637 SelectionDAG &DAG = DCI.DAG;
4638 if (DAG.isConstantValueOfAnyType(True) &&
4639 !DAG.isConstantValueOfAnyType(False)) {
4640 // Swap cmp + select pair to move constant to false input.
4641 // This will allow using VOPC cndmasks more often.
4642 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
4643
4644 SDLoc SL(N);
4645 ISD::CondCode NewCC =
4646 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
4647
4648 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
4649 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
4650 }
4651
4652 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
4653 SDValue MinMax
4654 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
4655 // Revisit this node so we can catch min3/max3/med3 patterns.
4656 //DCI.AddToWorklist(MinMax.getNode());
4657 return MinMax;
4658 }
4659 }
4660
4661 // There's no reason to not do this if the condition has other uses.
4662 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
4663}
4664
4665static bool isInv2Pi(const APFloat &APF) {
4666 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
4667 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
4668 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
4669
4670 return APF.bitwiseIsEqual(KF16) ||
4671 APF.bitwiseIsEqual(KF32) ||
4672 APF.bitwiseIsEqual(KF64);
4673}
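// Note (annotation, not in the upstream file): the three bit patterns above
// are the half, single and double precision encodings of 1/(2*pi), roughly
// 0.15915494; e.g. 0x3e22f983 is the f32 pattern of that value.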
4674
4675 // 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
4676 // additional cost to negate them.
4677 TargetLowering::NegatibleCost
4678 AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
4679 if (C->isZero())
4680 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4681
4682 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
4683 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4684
4685 return NegatibleCost::Neutral;
4686 }
4687
4691 return false;
4692}
4693
4694 bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
4695 if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
4696 return getConstantNegateCost(C) == NegatibleCost::Cheaper;
4697 return false;
4698 }
4699
4700static unsigned inverseMinMax(unsigned Opc) {
4701 switch (Opc) {
4702 case ISD::FMAXNUM:
4703 return ISD::FMINNUM;
4704 case ISD::FMINNUM:
4705 return ISD::FMAXNUM;
4706 case ISD::FMAXNUM_IEEE:
4707 return ISD::FMINNUM_IEEE;
4708 case ISD::FMINNUM_IEEE:
4709 return ISD::FMAXNUM_IEEE;
4710 case ISD::FMAXIMUM:
4711 return ISD::FMINIMUM;
4712 case ISD::FMINIMUM:
4713 return ISD::FMAXIMUM;
4714 case AMDGPUISD::FMAX_LEGACY:
4715 return AMDGPUISD::FMIN_LEGACY;
4716 case AMDGPUISD::FMIN_LEGACY:
4717 return AMDGPUISD::FMAX_LEGACY;
4718 default:
4719 llvm_unreachable("invalid min/max opcode");
4720 }
4721}
4722
4723/// \return true if it's profitable to try to push an fneg into its source
4724/// instruction.
4725 static bool shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
4726 // If the input has multiple uses and we can either fold the negate down, or
4727 // the other uses cannot, give up. This both prevents unprofitable
4728 // transformations and infinite loops: we won't repeatedly try to fold around
4729 // a negate that has no 'good' form.
4730 if (N0.hasOneUse()) {
4731 // This may be able to fold into the source, but at a code size cost. Don't
4732 // fold if the fold into the user is free.
4733 if (allUsesHaveSourceMods(N, 0))
4734 return false;
4735 } else {
4736 if (fnegFoldsIntoOp(N0.getNode()) &&
4737 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
4738 return false;
4739 }
4740
4741 return true;
4742}
4743
4744 SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
4745 DAGCombinerInfo &DCI) const {
4746 SelectionDAG &DAG = DCI.DAG;
4747 SDValue N0 = N->getOperand(0);
4748 EVT VT = N->getValueType(0);
4749
4750 unsigned Opc = N0.getOpcode();
4751
4752 if (!shouldFoldFNegIntoSrc(N, N0))
4753 return SDValue();
4754
4755 SDLoc SL(N);
4756 switch (Opc) {
4757 case ISD::FADD: {
4758 if (!mayIgnoreSignedZero(N0))
4759 return SDValue();
4760
4761 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
4762 SDValue LHS = N0.getOperand(0);
4763 SDValue RHS = N0.getOperand(1);
4764
4765 if (LHS.getOpcode() != ISD::FNEG)
4766 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
4767 else
4768 LHS = LHS.getOperand(0);
4769
4770 if (RHS.getOpcode() != ISD::FNEG)
4771 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4772 else
4773 RHS = RHS.getOperand(0);
4774
4775 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
4776 if (Res.getOpcode() != ISD::FADD)
4777 return SDValue(); // Op got folded away.
4778 if (!N0.hasOneUse())
4779 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4780 return Res;
4781 }
4782 case ISD::FMUL:
4783 case AMDGPUISD::FMUL_LEGACY: {
4784 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
4785 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
4786 SDValue LHS = N0.getOperand(0);
4787 SDValue RHS = N0.getOperand(1);
4788
4789 if (LHS.getOpcode() == ISD::FNEG)
4790 LHS = LHS.getOperand(0);
4791 else if (RHS.getOpcode() == ISD::FNEG)
4792 RHS = RHS.getOperand(0);
4793 else
4794 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4795
4796 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
4797 if (Res.getOpcode() != Opc)
4798 return SDValue(); // Op got folded away.
4799 if (!N0.hasOneUse())
4800 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4801 return Res;
4802 }
4803 case ISD::FMA:
4804 case ISD::FMAD: {
4805 // TODO: handle llvm.amdgcn.fma.legacy
4806 if (!mayIgnoreSignedZero(N0))
4807 return SDValue();
4808
4809 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
4810 SDValue LHS = N0.getOperand(0);
4811 SDValue MHS = N0.getOperand(1);
4812 SDValue RHS = N0.getOperand(2);
4813
4814 if (LHS.getOpcode() == ISD::FNEG)
4815 LHS = LHS.getOperand(0);
4816 else if (MHS.getOpcode() == ISD::FNEG)
4817 MHS = MHS.getOperand(0);
4818 else
4819 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
4820
4821 if (RHS.getOpcode() != ISD::FNEG)
4822 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4823 else
4824 RHS = RHS.getOperand(0);
4825
4826 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
4827 if (Res.getOpcode() != Opc)
4828 return SDValue(); // Op got folded away.
4829 if (!N0.hasOneUse())
4830 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4831 return Res;
4832 }
4833 case ISD::FMAXNUM:
4834 case ISD::FMINNUM:
4835 case ISD::FMAXNUM_IEEE:
4836 case ISD::FMINNUM_IEEE:
4837 case ISD::FMINIMUM:
4838 case ISD::FMAXIMUM:
4839 case AMDGPUISD::FMAX_LEGACY:
4840 case AMDGPUISD::FMIN_LEGACY: {
4841 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
4842 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
4843 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
4844 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
4845
4846 SDValue LHS = N0.getOperand(0);
4847 SDValue RHS = N0.getOperand(1);
4848
4849 // 0 doesn't have a negated inline immediate.
4850 // TODO: This constant check should be generalized to other operations.
4851 if (isConstantCostlierToNegate(RHS))
4852 return SDValue();
4853
4854 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
4855 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4856 unsigned Opposite = inverseMinMax(Opc);
4857
4858 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
4859 if (Res.getOpcode() != Opposite)
4860 return SDValue(); // Op got folded away.
4861 if (!N0.hasOneUse())
4862 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
4863 return Res;
4864 }
4865 case AMDGPUISD::FMED3: {
4866 SDValue Ops[3];
4867 for (unsigned I = 0; I < 3; ++I)
4868 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
4869
4870 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
4871 if (Res.getOpcode() != AMDGPUISD::FMED3)
4872 return SDValue(); // Op got folded away.
4873
4874 if (!N0.hasOneUse()) {
4875 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
4876 DAG.ReplaceAllUsesWith(N0, Neg);
4877
4878 for (SDNode *U : Neg->uses())
4879 DCI.AddToWorklist(U);
4880 }
4881
4882 return Res;
4883 }
4884 case ISD::FP_EXTEND:
4885 case ISD::FTRUNC:
4886 case ISD::FRINT:
4887 case ISD::FNEARBYINT: // XXX - Should fround be handled?
4888 case ISD::FROUNDEVEN:
4889 case ISD::FSIN:
4890 case ISD::FCANONICALIZE:
4891 case AMDGPUISD::RCP:
4892 case AMDGPUISD::RCP_LEGACY:
4893 case AMDGPUISD::RCP_IFLAG:
4894 case AMDGPUISD::SIN_HW: {
4895 SDValue CvtSrc = N0.getOperand(0);
4896 if (CvtSrc.getOpcode() == ISD::FNEG) {
4897 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
4898 // (fneg (rcp (fneg x))) -> (rcp x)
4899 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
4900 }
4901
4902 if (!N0.hasOneUse())
4903 return SDValue();
4904
4905 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
4906 // (fneg (rcp x)) -> (rcp (fneg x))
4907 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4908 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
4909 }
4910 case ISD::FP_ROUND: {
4911 SDValue CvtSrc = N0.getOperand(0);
4912
4913 if (CvtSrc.getOpcode() == ISD::FNEG) {
4914 // (fneg (fp_round (fneg x))) -> (fp_round x)
4915 return DAG.getNode(ISD::FP_ROUND, SL, VT,
4916 CvtSrc.getOperand(0), N0.getOperand(1));
4917 }
4918
4919 if (!N0.hasOneUse())
4920 return SDValue();
4921
4922 // (fneg (fp_round x)) -> (fp_round (fneg x))
4923 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4924 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
4925 }
4926 case ISD::FP16_TO_FP: {
4927 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
4928 // f16, but legalization of f16 fneg ends up pulling it out of the source.
4929 // Put the fneg back as a legal source operation that can be matched later.
4930 SDLoc SL(N);
4931
4932 SDValue Src = N0.getOperand(0);
4933 EVT SrcVT = Src.getValueType();
4934
4935 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
4936 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
4937 DAG.getConstant(0x8000, SL, SrcVT));
4938 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
4939 }
4940 case ISD::SELECT: {
4941 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
4942 // TODO: Invert conditions of foldFreeOpFromSelect
4943 return SDValue();
4944 }
4945 case ISD::BITCAST: {
4946 SDLoc SL(N);
4947 SDValue BCSrc = N0.getOperand(0);
4948 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
4949 SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
4950 if (HighBits.getValueType().getSizeInBits() != 32 ||
4951 !fnegFoldsIntoOp(HighBits.getNode()))
4952 return SDValue();
4953
4954 // f64 fneg only really needs to operate on the high half of the
4955 // register, so try to force it to an f32 operation to help make use of
4956 // source modifiers.
4957 //
4958 //
4959 // fneg (f64 (bitcast (build_vector x, y))) ->
4960 // f64 (bitcast (build_vector (bitcast i32:x to f32),
4961 // (fneg (bitcast i32:y to f32)))
4962
4963 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
4964 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
4965 SDValue CastBack =
4966 DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);
4967
4968 SmallVector<SDValue, 8> Ops(BCSrc->op_begin(), BCSrc->op_end());
4969 Ops.back() = CastBack;
4970 DCI.AddToWorklist(NegHi.getNode());
4971 SDValue Build =
4972 DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
4973 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);
4974
4975 if (!N0.hasOneUse())
4976 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
4977 return Result;
4978 }
4979
4980 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
4981 BCSrc.hasOneUse()) {
4982 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
4983 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
4984
4985 // TODO: Cast back result for multiple uses is beneficial in some cases.
4986
4987 SDValue LHS =
4988 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
4989 SDValue RHS =
4990 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
4991
4992 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
4993 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
4994
4995 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
4996 NegRHS);
4997 }
4998
4999 return SDValue();
5000 }
5001 default:
5002 return SDValue();
5003 }
5004}
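// Worked example (annotation, not in the upstream file): in the FP16_TO_FP
// case above, x = 0x3C00 (f16 1.0) gives (xor x, 0x8000) = 0xBC00 (f16 -1.0),
// so the fneg is performed as an integer sign-bit flip ahead of the extend.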
5005
5006 SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
5007 DAGCombinerInfo &DCI) const {
5008 SelectionDAG &DAG = DCI.DAG;
5009 SDValue N0 = N->getOperand(0);
5010
5011 if (!N0.hasOneUse())
5012 return SDValue();
5013
5014 switch (N0.getOpcode()) {
5015 case ISD::FP16_TO_FP: {
5016 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
5017 SDLoc SL(N);
5018 SDValue Src = N0.getOperand(0);
5019 EVT SrcVT = Src.getValueType();
5020
5021 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
5022 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
5023 DAG.getConstant(0x7fff, SL, SrcVT));
5024 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
5025 }
5026 default:
5027 return SDValue();
5028 }
5029}
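// Worked example (annotation, not in the upstream file): x = 0xBC00 (f16
// -1.0) gives (and x, 0x7fff) = 0x3C00 (f16 1.0); clearing bit 15 implements
// fabs directly on the f16 encoding before it is widened by FP16_TO_FP.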
5030
5031 SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
5032 DAGCombinerInfo &DCI) const {
5033 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
5034 if (!CFP)
5035 return SDValue();
5036
5037 // XXX - Should this flush denormals?
5038 const APFloat &Val = CFP->getValueAPF();
5039 APFloat One(Val.getSemantics(), "1.0");
5040 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
5041}
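// Worked example (annotation, not in the upstream file): (rcp (f32 4.0))
// constant-folds to 0.25 here; doing the division in APFloat keeps the result
// in the value's own semantics rather than the host's floating point.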
5042
5043 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
5044 DAGCombinerInfo &DCI) const {
5045 SelectionDAG &DAG = DCI.DAG;
5046 SDLoc DL(N);
5047
5048 switch(N->getOpcode()) {
5049 default:
5050 break;
5051 case ISD::BITCAST: {
5052 EVT DestVT = N->getValueType(0);
5053
5054 // Push casts through vector builds. This helps avoid emitting a large
5055 // number of copies when materializing floating point vector constants.
5056 //
5057 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
5058 // vNt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5059 if (DestVT.isVector()) {
5060 SDValue Src = N->getOperand(0);
5061 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5062 (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
5063 isOperationLegal(ISD::BUILD_VECTOR, DestVT))) {
5064 EVT SrcVT = Src.getValueType();
5065 unsigned NElts = DestVT.getVectorNumElements();
5066
5067 if (SrcVT.getVectorNumElements() == NElts) {
5068 EVT DestEltVT = DestVT.getVectorElementType();
5069
5070 SmallVector<SDValue, 8> CastedElts;
5071 SDLoc SL(N);
5072 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5073 SDValue Elt = Src.getOperand(I);
5074 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
5075 }
5076
5077 return DAG.getBuildVector(DestVT, SL, CastedElts);
5078 }
5079 }
5080 }
5081
5082 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5083 break;
5084
5085 // Fold bitcasts of constants.
5086 //
5087 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5088 // TODO: Generalize and move to DAGCombiner
5089 SDValue Src = N->getOperand(0);
5090 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
5091 SDLoc SL(N);
5092 uint64_t CVal = C->getZExtValue();
5093 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5094 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5095 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5096 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
5097 }
5098
5099 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
5100 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5101 SDLoc SL(N);
5102 uint64_t CVal = Val.getZExtValue();
5103 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5104 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5105 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5106
5107 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
5108 }
5109
5110 break;
5111 }
5112 case ISD::SHL: {
5113 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5114 break;
5115
5116 return performShlCombine(N, DCI);
5117 }
5118 case ISD::SRL: {
5119 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5120 break;
5121
5122 return performSrlCombine(N, DCI);
5123 }
5124 case ISD::SRA: {
5125 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5126 break;
5127
5128 return performSraCombine(N, DCI);
5129 }
5130 case ISD::TRUNCATE:
5131 return performTruncateCombine(N, DCI);
5132 case ISD::MUL:
5133 return performMulCombine(N, DCI);
5134 case AMDGPUISD::MUL_U24:
5135 case AMDGPUISD::MUL_I24: {
5136 if (SDValue Simplified = simplifyMul24(N, DCI))
5137 return Simplified;
5138 break;
5139 }
5140 case AMDGPUISD::MULHI_I24:
5141 case AMDGPUISD::MULHI_U24:
5142 return simplifyMul24(N, DCI);
5143 case ISD::SMUL_LOHI:
5144 case ISD::UMUL_LOHI:
5145 return performMulLoHiCombine(N, DCI);
5146 case ISD::MULHS:
5147 return performMulhsCombine(N, DCI);
5148 case ISD::MULHU:
5149 return performMulhuCombine(N, DCI);
5150 case ISD::SELECT:
5151 return performSelectCombine(N, DCI);
5152 case ISD::FNEG:
5153 return performFNegCombine(N, DCI);
5154 case ISD::FABS:
5155 return performFAbsCombine(N, DCI);
5156 case AMDGPUISD::BFE_I32:
5157 case AMDGPUISD::BFE_U32: {
5158 assert(!N->getValueType(0).isVector() &&
5159 "Vector handling of BFE not implemented");
5160 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
5161 if (!Width)
5162 break;
5163
5164 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5165 if (WidthVal == 0)
5166 return DAG.getConstant(0, DL, MVT::i32);
5167
5168 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
5169 if (!Offset)
5170 break;
5171
5172 SDValue BitsFrom = N->getOperand(0);
5173 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5174
5175 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5176
5177 if (OffsetVal == 0) {
5178 // This is already sign / zero extended, so try to fold away extra BFEs.
5179 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
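// e.g. a signed extract of width 8 guarantees 32 - 8 + 1 = 25 sign bits,
// so a source already known to have 25 sign bits needs no BFE at all.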
5180
5181 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
5182 if (OpSignBits >= SignBits)
5183 return BitsFrom;
5184
5185 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
5186 if (Signed) {
5187 // This is a sign_extend_inreg. Replace it to take advantage of existing
5188 // DAG Combines. If not eliminated, we will match back to BFE during
5189 // selection.
5190
5191 // TODO: The sext_inreg of extended types ends up as separate operations,
5192 // although we could handle them in a single BFE.
5193 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
5194 DAG.getValueType(SmallVT));
5195 }
5196
5197 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
5198 }
5199
5200 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
5201 if (Signed) {
5202 return constantFoldBFE<int32_t>(DAG,
5203 CVal->getSExtValue(),
5204 OffsetVal,
5205 WidthVal,
5206 DL);
5207 }
5208
5209 return constantFoldBFE<uint32_t>(DAG,
5210 CVal->getZExtValue(),
5211 OffsetVal,
5212 WidthVal,
5213 DL);
5214 }
5215
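// When the field reaches bit 31 the extract degenerates to a plain shift,
// e.g. (bfe_u32 x, 24, 8) is just (srl x, 24).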
5216 if ((OffsetVal + WidthVal) >= 32 &&
5217 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
5218 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
5219 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
5220 BitsFrom, ShiftVal);
5221 }
5222
5223 if (BitsFrom.hasOneUse()) {
5224 APInt Demanded = APInt::getBitsSet(32,
5225 OffsetVal,
5226 OffsetVal + WidthVal);
5227
5228 KnownBits Known;
5229 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
5230 !DCI.isBeforeLegalizeOps());
5231 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5232 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
5233 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
5234 DCI.CommitTargetLoweringOpt(TLO);
5235 }
5236 }
5237
5238 break;
5239 }
5240 case ISD::LOAD:
5241 return performLoadCombine(N, DCI);
5242 case ISD::STORE:
5243 return performStoreCombine(N, DCI);
5244 case AMDGPUISD::RCP:
5245 case AMDGPUISD::RCP_IFLAG:
5246 return performRcpCombine(N, DCI);
5247 case ISD::AssertZext:
5248 case ISD::AssertSext:
5249 return performAssertSZExtCombine(N, DCI);
5250 case ISD::INTRINSIC_WO_CHAIN:
5251 return performIntrinsicWOChainCombine(N, DCI);
5252 case AMDGPUISD::FMAD_FTZ: {
5253 SDValue N0 = N->getOperand(0);
5254 SDValue N1 = N->getOperand(1);
5255 SDValue N2 = N->getOperand(2);
5256 EVT VT = N->getValueType(0);
5257
5258 // FMAD_FTZ is a FMAD + flush denormals to zero.
5259 // We flush the inputs, the intermediate step, and the output.
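// i.e. with all-constant operands the folded result below is
// ftz(ftz(ftz(a) * ftz(b)) + ftz(c)).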
5260 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
5261 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
5262 ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
5263 if (N0CFP && N1CFP && N2CFP) {
5264 const auto FTZ = [](const APFloat &V) {
5265 if (V.isDenormal()) {
5266 APFloat Zero(V.getSemantics(), 0);
5267 return V.isNegative() ? -Zero : Zero;
5268 }
5269 return V;
5270 };
5271
5272 APFloat V0 = FTZ(N0CFP->getValueAPF());
5273 APFloat V1 = FTZ(N1CFP->getValueAPF());
5274 APFloat V2 = FTZ(N2CFP->getValueAPF());
5275 V0.multiply(V1, APFloat::rmNearestTiesToEven);
5276 V0 = FTZ(V0);
5277 V0.add(V2, APFloat::rmNearestTiesToEven);
5278 return DAG.getConstantFP(FTZ(V0), DL, VT);
5279 }
5280 break;
5281 }
5282 }
5283 return SDValue();
5284}
5285
5286//===----------------------------------------------------------------------===//
5287// Helper functions
5288//===----------------------------------------------------------------------===//
5289
5290SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
5291 const TargetRegisterClass *RC,
5292 Register Reg, EVT VT,
5293 const SDLoc &SL,
5294 bool RawReg) const {
5295 MachineFunction &MF = DAG.getMachineFunction();
5296 MachineRegisterInfo &MRI = MF.getRegInfo();
5297 Register VReg;
5298
5299 if (!MRI.isLiveIn(Reg)) {
5300 VReg = MRI.createVirtualRegister(RC);
5301 MRI.addLiveIn(Reg, VReg);
5302 } else {
5303 VReg = MRI.getLiveInVirtReg(Reg);
5304 }
5305
5306 if (RawReg)
5307 return DAG.getRegister(VReg, VT);
5308
5309 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
5310}
5311
5312// This may be called multiple times, and nothing prevents creating multiple
5313// objects at the same offset. See if we already defined this object.
5314static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
5315 int64_t Offset) {
5316 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5317 if (MFI.getObjectOffset(I) == Offset) {
5318 assert(MFI.getObjectSize(I) == Size);
5319 return I;
5320 }
5321 }
5322
5323 return MFI.CreateFixedObject(Size, Offset, true);
5324}
5325
5326SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
5327 EVT VT,
5328 const SDLoc &SL,
5329 int64_t Offset) const {
5330 MachineFunction &MF = DAG.getMachineFunction();
5331 MachineFrameInfo &MFI = MF.getFrameInfo();
5332 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
5333
5334 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5335 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
5336
5337 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
5338 MachineMemOperand::MODereferenceable |
5339 MachineMemOperand::MOInvariant);
5340}
5341
5342SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
5343 const SDLoc &SL,
5344 SDValue Chain,
5345 SDValue ArgVal,
5346 int64_t Offset) const {
5347 MachineFunction &MF = DAG.getMachineFunction();
5348 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
5349 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5350
5351 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
5352 // Stores to the argument stack area are relative to the stack pointer.
5353 SDValue SP =
5354 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
5355 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
5356 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
5357 MachineMemOperand::MODereferenceable);
5358 return Store;
5359}
5360
5361SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
5362 const TargetRegisterClass *RC,
5363 EVT VT, const SDLoc &SL,
5364 const ArgDescriptor &Arg) const {
5365 assert(Arg && "Attempting to load missing argument");
5366
5367 SDValue V = Arg.isRegister() ?
5368 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
5369 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
5370
5371 if (!Arg.isMasked())
5372 return V;
5373
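// e.g. a mask of 0x3ff0 selects bits [4, 13]: shift right by the 4 trailing
// zeros, then AND with 0x3ff0 >> 4 = 0x3ff.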
5374 unsigned Mask = Arg.getMask();
5375 unsigned Shift = llvm::countr_zero<unsigned>(Mask);
5376 V = DAG.getNode(ISD::SRL, SL, VT, V,
5377 DAG.getShiftAmountConstant(Shift, VT, SL));
5378 return DAG.getNode(ISD::AND, SL, VT, V,
5379 DAG.getConstant(Mask >> Shift, SL, VT));
5380}
5381
5382uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5383 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5384 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5385 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5386 uint64_t ArgOffset =
5387 alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
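// e.g. 12 bytes of explicit arguments with 8-byte alignment give
// alignTo(12, 8) = 16, plus the target's explicit kernel argument offset.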
5388 switch (Param) {
5389 case FIRST_IMPLICIT:
5390 return ArgOffset;
5391 case PRIVATE_BASE:
5392 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
5393 case SHARED_BASE:
5394 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5395 case QUEUE_PTR:
5396 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5397 }
5398 llvm_unreachable("unexpected implicit parameter type");
5399}
5400
5401uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5402 const MachineFunction &MF, const ImplicitParameter Param) const {
5403 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
5404 return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param);
5405}
5406
5407#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
5408
5409const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
5410 switch ((AMDGPUISD::NodeType)Opcode) {
5411 case AMDGPUISD::FIRST_NUMBER: break;
5412 // AMDIL DAG nodes
5413 NODE_NAME_CASE(UMUL);
5414 NODE_NAME_CASE(BRANCH_COND);
5415
5416 // AMDGPU DAG nodes
5417 NODE_NAME_CASE(IF)
5418 NODE_NAME_CASE(ELSE)
5419 NODE_NAME_CASE(LOOP)
5420 NODE_NAME_CASE(CALL)
5421 NODE_NAME_CASE(TC_RETURN)
5422 NODE_NAME_CASE(TC_RETURN_GFX)
5423 NODE_NAME_CASE(TC_RETURN_CHAIN)
5424 NODE_NAME_CASE(TRAP)
5425 NODE_NAME_CASE(RET_GLUE)
5426 NODE_NAME_CASE(WAVE_ADDRESS)
5427 NODE_NAME_CASE(RETURN_TO_EPILOG)
5428 NODE_NAME_CASE(ENDPGM)
5429 NODE_NAME_CASE(ENDPGM_TRAP)
5430 NODE_NAME_CASE(SIMULATED_TRAP)
5431 NODE_NAME_CASE(DWORDADDR)
5432 NODE_NAME_CASE(FRACT)
5433 NODE_NAME_CASE(SETCC)
5434 NODE_NAME_CASE(SETREG)
5435 NODE_NAME_CASE(DENORM_MODE)
5436 NODE_NAME_CASE(FMA_W_CHAIN)
5437 NODE_NAME_CASE(FMUL_W_CHAIN)
5438 NODE_NAME_CASE(CLAMP)
5439 NODE_NAME_CASE(COS_HW)
5440 NODE_NAME_CASE(SIN_HW)
5441 NODE_NAME_CASE(FMAX_LEGACY)
5442 NODE_NAME_CASE(FMIN_LEGACY)
5443 NODE_NAME_CASE(FMAX3)
5444 NODE_NAME_CASE(SMAX3)
5445 NODE_NAME_CASE(UMAX3)
5446 NODE_NAME_CASE(FMIN3)
5447 NODE_NAME_CASE(SMIN3)
5448 NODE_NAME_CASE(UMIN3)
5449 NODE_NAME_CASE(FMED3)
5450 NODE_NAME_CASE(SMED3)
5451 NODE_NAME_CASE(UMED3)
5452 NODE_NAME_CASE(FMAXIMUM3)
5453 NODE_NAME_CASE(FMINIMUM3)
5454 NODE_NAME_CASE(FDOT2)
5455 NODE_NAME_CASE(URECIP)
5456 NODE_NAME_CASE(DIV_SCALE)
5457 NODE_NAME_CASE(DIV_FMAS)
5458 NODE_NAME_CASE(DIV_FIXUP)
5459 NODE_NAME_CASE(FMAD_FTZ)
5460 NODE_NAME_CASE(RCP)
5461 NODE_NAME_CASE(RSQ)
5462 NODE_NAME_CASE(RCP_LEGACY)
5463 NODE_NAME_CASE(RCP_IFLAG)
5464 NODE_NAME_CASE(LOG)
5465 NODE_NAME_CASE(EXP)
5466 NODE_NAME_CASE(FMUL_LEGACY)
5467 NODE_NAME_CASE(RSQ_CLAMP)
5468 NODE_NAME_CASE(FP_CLASS)
5469 NODE_NAME_CASE(DOT4)
5470 NODE_NAME_CASE(CARRY)
5471 NODE_NAME_CASE(BORROW)
5472 NODE_NAME_CASE(BFE_U32)
5473 NODE_NAME_CASE(BFE_I32)
5474 NODE_NAME_CASE(BFI)
5475 NODE_NAME_CASE(BFM)
5476 NODE_NAME_CASE(FFBH_U32)
5477 NODE_NAME_CASE(FFBH_I32)
5478 NODE_NAME_CASE(FFBL_B32)
5479 NODE_NAME_CASE(MUL_U24)
5480 NODE_NAME_CASE(MUL_I24)
5481 NODE_NAME_CASE(MULHI_U24)
5482 NODE_NAME_CASE(MULHI_I24)
5483 NODE_NAME_CASE(MAD_U24)
5484 NODE_NAME_CASE(MAD_I24)
5485 NODE_NAME_CASE(MAD_I64_I32)
5486 NODE_NAME_CASE(MAD_U64_U32)
5487 NODE_NAME_CASE(PERM)
5488 NODE_NAME_CASE(TEXTURE_FETCH)
5489 NODE_NAME_CASE(R600_EXPORT)
5490 NODE_NAME_CASE(CONST_ADDRESS)
5491 NODE_NAME_CASE(REGISTER_LOAD)
5492 NODE_NAME_CASE(REGISTER_STORE)
5493 NODE_NAME_CASE(SAMPLE)
5494 NODE_NAME_CASE(SAMPLEB)
5495 NODE_NAME_CASE(SAMPLED)
5496 NODE_NAME_CASE(SAMPLEL)
5497 NODE_NAME_CASE(CVT_F32_UBYTE0)
5498 NODE_NAME_CASE(CVT_F32_UBYTE1)
5499 NODE_NAME_CASE(CVT_F32_UBYTE2)
5500 NODE_NAME_CASE(CVT_F32_UBYTE3)
5501 NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
5502 NODE_NAME_CASE(CVT_PKNORM_I16_F32)
5503 NODE_NAME_CASE(CVT_PKNORM_U16_F32)
5504 NODE_NAME_CASE(CVT_PK_I16_I32)
5505 NODE_NAME_CASE(CVT_PK_U16_U32)
5506 NODE_NAME_CASE(FP_TO_FP16)
5507 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
5508 NODE_NAME_CASE(CONST_DATA_PTR)
5509 NODE_NAME_CASE(PC_ADD_REL_OFFSET)
5510 NODE_NAME_CASE(LDS)
5511 NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
5512 NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
5513 NODE_NAME_CASE(DUMMY_CHAIN)
5514 case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
5515 NODE_NAME_CASE(LOAD_D16_HI)
5516 NODE_NAME_CASE(LOAD_D16_LO)
5517 NODE_NAME_CASE(LOAD_D16_HI_I8)
5518 NODE_NAME_CASE(LOAD_D16_HI_U8)
5519 NODE_NAME_CASE(LOAD_D16_LO_I8)
5520 NODE_NAME_CASE(LOAD_D16_LO_U8)
5521 NODE_NAME_CASE(STORE_MSKOR)
5522 NODE_NAME_CASE(LOAD_CONSTANT)
5523 NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
5524 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
5525 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
5526 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
5527 NODE_NAME_CASE(DS_ORDERED_COUNT)
5528 NODE_NAME_CASE(ATOMIC_CMP_SWAP)
5529 NODE_NAME_CASE(BUFFER_LOAD)
5530 NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
5531 NODE_NAME_CASE(BUFFER_LOAD_USHORT)
5532 NODE_NAME_CASE(BUFFER_LOAD_BYTE)
5533 NODE_NAME_CASE(BUFFER_LOAD_SHORT)
5534 NODE_NAME_CASE(BUFFER_LOAD_TFE)
5535 NODE_NAME_CASE(BUFFER_LOAD_UBYTE_TFE)
5536 NODE_NAME_CASE(BUFFER_LOAD_USHORT_TFE)
5537 NODE_NAME_CASE(BUFFER_LOAD_BYTE_TFE)
5538 NODE_NAME_CASE(BUFFER_LOAD_SHORT_TFE)
5539 NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
5540 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
5541 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
5542 NODE_NAME_CASE(SBUFFER_LOAD)
5543 NODE_NAME_CASE(SBUFFER_LOAD_BYTE)
5544 NODE_NAME_CASE(SBUFFER_LOAD_UBYTE)
5545 NODE_NAME_CASE(SBUFFER_LOAD_SHORT)
5546 NODE_NAME_CASE(SBUFFER_LOAD_USHORT)
5547 NODE_NAME_CASE(BUFFER_STORE)
5548 NODE_NAME_CASE(BUFFER_STORE_BYTE)
5549 NODE_NAME_CASE(BUFFER_STORE_SHORT)
5550 NODE_NAME_CASE(BUFFER_STORE_FORMAT)
5551 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
5552 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
5553 NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
5554 NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
5555 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
5556 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
5557 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
5558 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
5559 NODE_NAME_CASE(BUFFER_ATOMIC_AND)
5560 NODE_NAME_CASE(BUFFER_ATOMIC_OR)
5561 NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
5562 NODE_NAME_CASE(BUFFER_ATOMIC_INC)
5563 NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
5564 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
5565 NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
5566 NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
5567 NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
5568 NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
5569 NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
5570
5571 case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
5572 }
5573 return nullptr;
5574}
5575
5576SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
5577 SelectionDAG &DAG, int Enabled,
5578 int &RefinementSteps,
5579 bool &UseOneConstNR,
5580 bool Reciprocal) const {
5581 EVT VT = Operand.getValueType();
5582
5583 if (VT == MVT::f32) {
5584 RefinementSteps = 0;
5585 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
5586 }
5587
5588 // TODO: There is also an f64 rsq instruction, but the documentation is less
5589 // clear on its precision.
5590
5591 return SDValue();
5592}
5593
5594SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
5595 SelectionDAG &DAG, int Enabled,
5596 int &RefinementSteps) const {
5597 EVT VT = Operand.getValueType();
5598
5599 if (VT == MVT::f32) {
5600 // Reciprocal, < 1 ulp error.
5601 //
5602 // This reciprocal approximation converges to < 0.5 ulp error with one
5603 // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
5604
5605 RefinementSteps = 0;
5606 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
5607 }
5608
5609 // TODO: There is also an f64 rcp instruction, but the documentation is less
5610 // clear on its precision.
5611
5612 return SDValue();
5613}
5614
5615static unsigned workitemIntrinsicDim(unsigned ID) {
5616 switch (ID) {
5617 case Intrinsic::amdgcn_workitem_id_x:
5618 return 0;
5619 case Intrinsic::amdgcn_workitem_id_y:
5620 return 1;
5621 case Intrinsic::amdgcn_workitem_id_z:
5622 return 2;
5623 default:
5624 llvm_unreachable("not a workitem intrinsic");
5625 }
5626}
5627
5628void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
5629 const SDValue Op, KnownBits &Known,
5630 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
5631
5632 Known.resetAll(); // Don't know anything.
5633
5634 unsigned Opc = Op.getOpcode();
5635
5636 switch (Opc) {
5637 default:
5638 break;
5639 case AMDGPUISD::CARRY:
5640 case AMDGPUISD::BORROW: {
5641 Known.Zero = APInt::getHighBitsSet(32, 31);
5642 break;
5643 }
5644
5645 case AMDGPUISD::BFE_I32:
5646 case AMDGPUISD::BFE_U32: {
5647 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5648 if (!CWidth)
5649 return;
5650
5651 uint32_t Width = CWidth->getZExtValue() & 0x1f;
5652
5653 if (Opc == AMDGPUISD::BFE_U32)
5654 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
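// e.g. a bfe_u32 of width 8 has its top 24 bits known zero.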
5655
5656 break;
5657 }
5658 case AMDGPUISD::FP_TO_FP16: {
5659 unsigned BitWidth = Known.getBitWidth();
5660
5661 // High bits are zero.
5662 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
5663 break;
5664 }
5665 case AMDGPUISD::MUL_U24:
5666 case AMDGPUISD::MUL_I24: {
5667 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5668 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5669 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
5670 RHSKnown.countMinTrailingZeros();
5671 Known.Zero.setLowBits(std::min(TrailZ, 32u));
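// e.g. operands that are multiples of 4 and 8 yield a product that is a
// multiple of 32, so the low 5 bits are known zero.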
5672 // Skip the extra check if all bits are known to be zero.
5673 if (TrailZ >= 32)
5674 break;
5675
5676 // Truncate to 24 bits.
5677 LHSKnown = LHSKnown.trunc(24);
5678 RHSKnown = RHSKnown.trunc(24);
5679
5680 if (Opc == AMDGPUISD::MUL_I24) {
5681 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
5682 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
5683 unsigned MaxValBits = LHSValBits + RHSValBits;
5684 if (MaxValBits > 32)
5685 break;
5686 unsigned SignBits = 32 - MaxValBits + 1;
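// e.g. operands with at most 12 significant bits each give a 24-bit
// product, so the top 32 - 24 + 1 = 9 bits all equal the sign bit.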
5687 bool LHSNegative = LHSKnown.isNegative();
5688 bool LHSNonNegative = LHSKnown.isNonNegative();
5689 bool LHSPositive = LHSKnown.isStrictlyPositive();
5690 bool RHSNegative = RHSKnown.isNegative();
5691 bool RHSNonNegative = RHSKnown.isNonNegative();
5692 bool RHSPositive = RHSKnown.isStrictlyPositive();
5693
5694 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
5695 Known.Zero.setHighBits(SignBits);
5696 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
5697 Known.One.setHighBits(SignBits);
5698 } else {
5699 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
5700 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
5701 unsigned MaxValBits = LHSValBits + RHSValBits;
5702 if (MaxValBits >= 32)
5703 break;
5704 Known.Zero.setBitsFrom(MaxValBits);
5705 }
5706 break;
5707 }
5708 case AMDGPUISD::PERM: {
5709 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5710 if (!CMask)
5711 return;
5712
5713 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5714 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5715 unsigned Sel = CMask->getZExtValue();
5716
5717 for (unsigned I = 0; I < 32; I += 8) {
5718 unsigned SelBits = Sel & 0xff;
5719 if (SelBits < 4) {
5720 SelBits *= 8;
5721 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5722 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5723 } else if (SelBits < 7) {
5724 SelBits = (SelBits & 3) * 8;
5725 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5726 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5727 } else if (SelBits == 0x0c) {
5728 Known.Zero |= 0xFFull << I;
5729 } else if (SelBits > 0x0c) {
5730 Known.One |= 0xFFull << I;
5731 }
5732 Sel >>= 8;
5733 }
5734 break;
5735 }
5736 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
5737 Known.Zero.setHighBits(24);
5738 break;
5739 }
5740 case AMDGPUISD::BUFFER_LOAD_USHORT: {
5741 Known.Zero.setHighBits(16);
5742 break;
5743 }
5744 case AMDGPUISD::LDS: {
5745 auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
5746 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
5747
5748 Known.Zero.setHighBits(16);
5749 Known.Zero.setLowBits(Log2(Alignment));
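// e.g. a 16-byte-aligned LDS global has its low Log2(16) = 4 bits known zero.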
5750 break;
5751 }
5752 case AMDGPUISD::SMIN3:
5753 case AMDGPUISD::SMAX3:
5754 case AMDGPUISD::SMED3:
5755 case AMDGPUISD::UMIN3:
5756 case AMDGPUISD::UMAX3:
5757 case AMDGPUISD::UMED3: {
5758 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
5759 if (Known2.isUnknown())
5760 break;
5761
5762 KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5763 if (Known1.isUnknown())
5764 break;
5765
5766 KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5767 if (Known0.isUnknown())
5768 break;
5769
5770 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
5771 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
5772 Known.One = Known0.One & Known1.One & Known2.One;
5773 break;
5774 }
5775 case ISD::INTRINSIC_WO_CHAIN: {
5776 unsigned IID = Op.getConstantOperandVal(0);
5777 switch (IID) {
5778 case Intrinsic::amdgcn_workitem_id_x:
5779 case Intrinsic::amdgcn_workitem_id_y:
5780 case Intrinsic::amdgcn_workitem_id_z: {
5781 unsigned MaxValue = Subtarget->getMaxWorkitemID(
5782 DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
5783 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
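// e.g. a maximum workitem ID of 1023 needs only 10 bits, so the top 22
// bits are known zero.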
5784 break;
5785 }
5786 default:
5787 break;
5788 }
5789 }
5790 }
5791}
5792
5793unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
5794 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
5795 unsigned Depth) const {
5796 switch (Op.getOpcode()) {
5797 case AMDGPUISD::BFE_I32: {
5798 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5799 if (!Width)
5800 return 1;
5801
5802 unsigned SignBits = 32 - Width->getZExtValue() + 1;
5803 if (!isNullConstant(Op.getOperand(1)))
5804 return SignBits;
5805
5806 // TODO: Could probably figure something out with non-0 offsets.
5807 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5808 return std::max(SignBits, Op0SignBits);
5809 }
5810
5811 case AMDGPUISD::BFE_U32: {
5812 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5813 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
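// e.g. a bfe_u32 of width 8 clears the top 24 bits, so at least 24
// leading (zero) sign bits remain.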
5814 }
5815
5816 case AMDGPUISD::CARRY:
5817 case AMDGPUISD::BORROW:
5818 return 31;
5819 case AMDGPUISD::BUFFER_LOAD_BYTE:
5820 return 25;
5821 case AMDGPUISD::BUFFER_LOAD_SHORT:
5822 return 17;
5823 case AMDGPUISD::BUFFER_LOAD_UBYTE:
5824 return 24;
5825 case AMDGPUISD::BUFFER_LOAD_USHORT:
5826 return 16;
5827 case AMDGPUISD::FP_TO_FP16:
5828 return 16;
5829 case AMDGPUISD::SMIN3:
5830 case AMDGPUISD::SMAX3:
5831 case AMDGPUISD::SMED3:
5832 case AMDGPUISD::UMIN3:
5833 case AMDGPUISD::UMAX3:
5834 case AMDGPUISD::UMED3: {
5835 unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
5836 if (Tmp2 == 1)
5837 return 1; // Early out.
5838
5839 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
5840 if (Tmp1 == 1)
5841 return 1; // Early out.
5842
5843 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5844 if (Tmp0 == 1)
5845 return 1; // Early out.
5846
5847 return std::min({Tmp0, Tmp1, Tmp2});
5848 }
5849 default:
5850 return 1;
5851 }
5852}
5853
5854unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
5855 GISelKnownBits &Analysis, Register R,
5856 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
5857 unsigned Depth) const {
5858 const MachineInstr *MI = MRI.getVRegDef(R);
5859 if (!MI)
5860 return 1;
5861
5862 // TODO: Check range metadata on MMO.
5863 switch (MI->getOpcode()) {
5864 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
5865 return 25;
5866 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
5867 return 17;
5868 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
5869 return 24;
5870 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
5871 return 16;
5872 case AMDGPU::G_AMDGPU_SMED3:
5873 case AMDGPU::G_AMDGPU_UMED3: {
5874 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
5875 unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
5876 if (Tmp2 == 1)
5877 return 1;
5878 unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
5879 if (Tmp1 == 1)
5880 return 1;
5881 unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
5882 if (Tmp0 == 1)
5883 return 1;
5884 return std::min({Tmp0, Tmp1, Tmp2});
5885 }
5886 default:
5887 return 1;
5888 }
5889}
5890
5891bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
5892 const SelectionDAG &DAG,
5893 bool SNaN,
5894 unsigned Depth) const {
5895 unsigned Opcode = Op.getOpcode();
5896 switch (Opcode) {
5897 case AMDGPUISD::FMIN_LEGACY:
5898 case AMDGPUISD::FMAX_LEGACY: {
5899 if (SNaN)
5900 return true;
5901
5902 // TODO: Could check that one of the operands is known never to be a NaN,
5903 // but which one?
5904 return false;
5905 }
5906 case AMDGPUISD::FMUL_LEGACY:
5907 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
5908 if (SNaN)
5909 return true;
5910 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
5911 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
5912 }
5913 case AMDGPUISD::FMED3:
5914 case AMDGPUISD::FMIN3:
5915 case AMDGPUISD::FMAX3:
5916 case AMDGPUISD::FMINIMUM3:
5917 case AMDGPUISD::FMAXIMUM3:
5918 case AMDGPUISD::FMAD_FTZ: {
5919 if (SNaN)
5920 return true;
5921 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
5922 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5923 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
5924 }
5925 case AMDGPUISD::CVT_F32_UBYTE0:
5926 case AMDGPUISD::CVT_F32_UBYTE1:
5927 case AMDGPUISD::CVT_F32_UBYTE2:
5928 case AMDGPUISD::CVT_F32_UBYTE3:
5929 return true;
5930
5931 case AMDGPUISD::RCP:
5932 case AMDGPUISD::RSQ:
5933 case AMDGPUISD::RCP_LEGACY:
5934 case AMDGPUISD::RSQ_CLAMP: {
5935 if (SNaN)
5936 return true;
5937
5938 // TODO: Need an is-known-positive check.
5939 return false;
5940 }
5941 case ISD::FLDEXP:
5942 case AMDGPUISD::FRACT: {
5943 if (SNaN)
5944 return true;
5945 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
5946 }
5947 case AMDGPUISD::DIV_SCALE:
5948 case AMDGPUISD::DIV_FMAS:
5949 case AMDGPUISD::DIV_FIXUP:
5950 // TODO: Refine on operands.
5951 return SNaN;
5952 case AMDGPUISD::SIN_HW:
5953 case AMDGPUISD::COS_HW: {
5954 // TODO: Need a check for infinity.
5955 return SNaN;
5956 }
5957 case ISD::INTRINSIC_WO_CHAIN: {
5958 unsigned IntrinsicID = Op.getConstantOperandVal(0);
5959 // TODO: Handle more intrinsics
5960 switch (IntrinsicID) {
5961 case Intrinsic::amdgcn_cubeid:
5962 return true;
5963
5964 case Intrinsic::amdgcn_frexp_mant: {
5965 if (SNaN)
5966 return true;
5967 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
5968 }
5969 case Intrinsic::amdgcn_cvt_pkrtz: {
5970 if (SNaN)
5971 return true;
5972 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5973 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
5974 }
5975 case Intrinsic::amdgcn_rcp:
5976 case Intrinsic::amdgcn_rsq:
5977 case Intrinsic::amdgcn_rcp_legacy:
5978 case Intrinsic::amdgcn_rsq_legacy:
5979 case Intrinsic::amdgcn_rsq_clamp: {
5980 if (SNaN)
5981 return true;
5982
5983 // TODO: Need an is-known-positive check.
5984 return false;
5985 }
5986 case Intrinsic::amdgcn_trig_preop:
5987 case Intrinsic::amdgcn_fdot2:
5988 // TODO: Refine on operand
5989 return SNaN;
5990 case Intrinsic::amdgcn_fma_legacy:
5991 if (SNaN)
5992 return true;
5993 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
5994 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
5995 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
5996 default:
5997 return false;
5998 }
5999 }
6000 default:
6001 return false;
6002 }
6003}
6004
6005bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
6006 Register N0, Register N1) const {
6007 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
6008}
6009
6010TargetLowering::AtomicExpansionKind
6011AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
6012 switch (RMW->getOperation()) {
6013 case AtomicRMWInst::Nand:
6014 case AtomicRMWInst::FAdd:
6015 case AtomicRMWInst::FSub:
6016 case AtomicRMWInst::FMax:
6017 case AtomicRMWInst::FMin:
6018 return AtomicExpansionKind::CmpXChg;
6019 case AtomicRMWInst::Xchg: {
6020 const DataLayout &DL = RMW->getFunction()->getDataLayout();
6021 unsigned ValSize = DL.getTypeSizeInBits(RMW->getType());
6022 if (ValSize == 32 || ValSize == 64)
6023 return AtomicExpansionKind::None;
6024 return AtomicExpansionKind::CmpXChg;
6025 }
6026 default: {
6027 if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
6028 unsigned Size = IntTy->getBitWidth();
6029 if (Size == 32 || Size == 64)
6030 return AtomicExpansionKind::None;
6031 }
6032
6033 return AtomicExpansionKind::CmpXChg;
6034 }
6035 }
6036}
6037
6038/// Whether it is profitable to sink the operands of an
6039/// Instruction I to the basic block of I.
6040 /// This helps use source modifiers (like abs and neg) more often.
6041bool AMDGPUTargetLowering::shouldSinkOperands(
6042 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
6043 using namespace PatternMatch;
6044
6045 for (auto &Op : I->operands()) {
6046 // Ensure we are not already sinking this operand.
6047 if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
6048 continue;
6049
6050 if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
6051 Ops.push_back(&Op);
6052 }
6053
6054 return !Ops.empty();
6055}
unsigned const MachineRegisterInfo * MRI
static LLVM_READONLY bool hasSourceMods(const MachineInstr &MI)
static bool isInv2Pi(const APFloat &APF)
static LLVM_READONLY bool opMustUseVOP3Encoding(const MachineInstr &MI, const MachineRegisterInfo &MRI)
returns true if the operation will definitely need to use a 64-bit encoding, and thus will use a VOP3...
static unsigned inverseMinMax(unsigned Opc)
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static unsigned workitemIntrinsicDim(unsigned ID)
static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, int64_t Offset)
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, uint32_t Width, const SDLoc &DL)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static SDValue getAddOneOp(const SDNode *V)
If V is an add of a constant 1, returns the other operand.
#define NODE_NAME_CASE(node)
static LLVM_READONLY bool selectSupportsSourceMods(const SDNode *N)
Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the type for ISD::SELECT.
static cl::opt< bool > AMDGPUBypassSlowDiv("amdgpu-bypass-slow-div", cl::desc("Skip 64-bit divide for dynamic 32-bit values"), cl::init(true))
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, SDValue N0, SDValue N1, unsigned Size, bool Signed)
static bool fnegFoldsIntoOp(const SDNode *N)
static bool isI24(SDValue Op, SelectionDAG &DAG)
static bool isCttzOpc(unsigned Opc)
static bool isU24(SDValue Op, SelectionDAG &DAG)
static SDValue peekFPSignOps(SDValue Val)
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, unsigned Op, const SDLoc &SL, SDValue Cond, SDValue N1, SDValue N2)
static SDValue peekFNeg(SDValue Val)
static SDValue simplifyMul24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI)
static bool isCtlzOpc(unsigned Opc)
static LLVM_READNONE bool fnegFoldsIntoOpcode(unsigned Opc)
static bool hasVolatileUser(SDNode *Val)
Interface definition of the TargetLowering class that is common to all AMD GPUs.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU promote alloca to vector or LDS
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
block Block Frequency Analysis
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_READNONE
Definition: Compiler.h:220
#define LLVM_READONLY
Definition: Compiler.h:227
static cl::opt< unsigned > CostThreshold("dfa-cost-threshold", cl::desc("Maximum cost accepted for the transformation"), cl::Hidden, cl::init(50))
static Error getAddrSpace(StringRef R, unsigned &AddrSpace)
Definition: DataLayout.cpp:266
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Provides analysis for querying information about KnownBits during GISel passes.
IRTranslator LLVM IR MI
static LVOptions Options
Definition: LVOptions.cpp:25
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
const char LLVMTargetMachineRef TM
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition: SHA256.cpp:34
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool Enabled
Definition: Statistic.cpp:46
Value * RHS
Value * LHS
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool hasFminFmaxLegacy() const
Align getAlignmentForImplicitArgPtr() const
bool hasMadMacF32Insts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool has16BitInsts() const
bool hasFastFMAF32() const
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
static const AMDGPUSubtarget & get(const MachineFunction &MF)
bool hasInv2PiInlineImm() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
Generate Min/Max node.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override
Return the type that should be used to zero or sign extend a zeroext/signext integer return value.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem, unsigned AS) const override
Return true if it is expected to be cheaper to do a store of vector constant with the given size and ...
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool shouldCombineMemoryType(EVT VT) const
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isTruncateFree(EVT Src, EVT Dest) const override
bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override
SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const
TargetLowering::NegatibleCost getConstantNegateCost(const ConstantFPSDNode *C) const
SDValue LowerFLOGUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, bool IsLog10, SDNodeFlags Flags) const
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool isSDNodeAlwaysUniform(const SDNode *N) const override
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const
Split a vector store into multiple scalar stores.
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
SDValue LowerFLOG10(SDValue Op, SelectionDAG &DAG) const
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isConstantCheaperToNegate(SDValue N) const
bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, Register N1) const override
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
bool isConstantCostlierToNegate(SDValue N) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const
SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
Emit approx-funcs appropriate lowering for exp10.
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSelectSupported(SelectSupportKind) const override
bool isZExtFree(Type *Src, Type *Dest) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const
SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool mayIgnoreSignedZero(SDValue Op) const
SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const final
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtType, EVT ExtVT) const override
Return true if it is profitable to reduce a load to a smaller type.
MVT getVectorIdxTy(const DataLayout &) const override
Returns the type to be used for the index operand of: ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT...
std::pair< SDValue, SDValue > splitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HighVT, SelectionDAG &DAG) const
Split a vector value into two parts of types LoVT and HiVT.
SDValue LowerFLOGCommon(SDValue Op, SelectionDAG &DAG) const
SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, SDValue N) const
bool shouldSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Whether it is profitable to sink the operands of an Instruction I to the basic block of I.
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const
bool isFAbsFree(EVT VT) const override
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
SDValue loadStackInputValue(SelectionDAG &DAG, EVT VT, const SDLoc &SL, int64_t Offset) const
Similar to CreateLiveInRegister, except value maybe loaded from a stack slot rather than passed in a ...
bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
unsigned computeNumSignBitsForTargetInstr(GISelKnownBits &Analysis, Register R, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
static SDValue stripBitcast(SDValue Val)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override
Return a reciprocal estimate value for the input operand.
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
std::pair< SDValue, SDValue > getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, SDValue Op, SDNodeFlags Flags) const
If denormal handling is required return the scaled input to FLOG2, and the check for denormal range.
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue lowerCTLZResults(SDValue Op, SelectionDAG &DAG) const
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const
SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const
SDValue performMulLoHiCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results) const
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
std::pair< EVT, EVT > getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const
Split a vector type into two parts.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue combineFMinMaxLegacyImpl(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
bool bitwiseIsEqual(const APFloat &RHS) const
Definition: APFloat.h:1319
opStatus add(const APFloat &RHS, roundingMode RM)
Definition: APFloat.h:1092
const fltSemantics & getSemantics() const
Definition: APFloat.h:1362
opStatus multiply(const APFloat &RHS, roundingMode RM)
Definition: APFloat.h:1110
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition: APFloat.h:1064
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:1004
Class for arbitrary precision integers.
Definition: APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1500
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1372
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1366
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:238
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition: APInt.h:1130
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:286
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:276
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition: APInt.h:1369
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:696
@ FAdd
*p = old + v
Definition: Instructions.h:733
@ FSub
*p = old - v
Definition: Instructions.h:736
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:744
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:740
@ Nand
*p = ~(old & v)
Definition: Instructions.h:718
BinOp getOperation() const
Definition: Instructions.h:787
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
LLVMContext & getContext() const
void addLoc(const CCValAssign &V)
static CCValAssign getCustomMem(unsigned ValNo, MVT ValVT, int64_t Offset, MVT LocVT, LocInfo HTP)
const APFloat & getValueAPF() const
bool isNegative() const
Return true if the value is negative.
uint64_t getZExtValue() const
This class represents an Operation in the Expression.
bool print(raw_ostream &OS, DIDumpOptions DumpOpts, const DWARFExpression *Expr, DWARFUnit *U) const
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
Diagnostic information for unsupported feature in backend.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:362
iterator_range< arg_iterator > args()
Definition: Function.h:855
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:274
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:70
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
static auto integer_valuetypes()
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
int getObjectIndexBegin() const
Return the minimum frame object index.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
Definition: MachineInstr.h:69
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOInvariant
The memory access always returns the same value (or traps).
Flags getFlags() const
Return the raw flags of the source value,.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isSimple() const
Returns true if the memory operation is neither atomic or volatile.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:301
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
iterator_range< use_iterator > uses()
SDNodeFlags getFlags() const
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
op_iterator op_end() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:227
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:568
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:494
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:843
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:488
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getRegister(unsigned Reg, EVT VT)
bool isConstantValueOfAnyType(SDValue N) const
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:489
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:691
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
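One common use is a signed 24-bit check: a 32-bit value fits in a signed 24-bit field iff at least 32 - 24 + 1 = 9 of its top bits replicate the sign bit. A sketch, assuming DAG and a 32-bit Op are in scope:

  bool IsI24 = DAG.ComputeNumSignBits(Op) >= 9;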
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:483
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:814
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
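A sketch, assuming a 32-bit Op is in scope: test whether the top 16 bits are known zero, i.e. whether the value already fits in an unsigned 16-bit quantity:

  APInt HighBits = APInt::getHighBitsSet(/*numBits=*/32, /*hiBitsSet=*/16);
  bool FitsU16 = DAG.MaskedValueIsZero(Op, HighBits);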
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:501
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:577
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:571
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
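For instance (hypothetical Val64 of type i64 and SL in scope), splitting into 32-bit halves:

  auto [Lo, Hi] = DAG.SplitScalar(Val64, SL, MVT::i32, MVT::i32);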
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp convert the backend supports.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
SelectSupportKind
Enum that describes what type of support for selects the target has.
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL) const
Returns the type for the shift amount of a shift opcode.
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setSupportsUnalignedAtomics(bool UnalignedSupported)
Sets whether unaligned atomic operations are supported.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const
Return true if it is profitable to reduce a load to a smaller type.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
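An illustrative call pattern from a TargetLowering subclass constructor; the choice of action here is hypothetical, not this target's actual configuration:

  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);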
unsigned GatherAllAliasesMaxDepth
Depth that GatherAllAliases should continue looking for chain dependencies when trying to find a more...
NegatibleCost
Enum that specifies when a float negation is beneficial.
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
void setHasMultipleConditionRegisters(bool hasManyRegs=true)
Tells the code generator that the target has multiple (allocatable) condition registers that can be u...
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
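One plausible use inside a custom load-lowering hook (LD being the LoadSDNode to replace): expand, then repackage the value and the new chain as the node's two results:

  auto [Value, NewChain] = expandUnalignedLoad(LD, DAG);
  return DAG.getMergeValues({Value, NewChain}, SDLoc(LD));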
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op: at this point we know that only the DemandedBits bits of Op's result are demanded, and attempt to simplify Op accordingly.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
TargetOptions Options
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
bool isIntrinsicAlwaysUniform(unsigned IntrID)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition: CallingConv.h:47
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:144
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:779
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:752
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:573
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:743
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to the "store atomic" instruction.
Definition: ISDOpcodes.h:1284
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:276
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:501
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1074
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:813
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:497
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:820
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:557
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:943
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:933
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:976
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:804
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:751
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:960
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1095
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1099
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:514
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition: ISDOpcodes.h:521
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:756
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to the "load atomic" instruction.
Definition: ISDOpcodes.h:1280
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:229
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:673
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:734
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:614
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:587
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:1021
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition: ISDOpcodes.h:47
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:549
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:810
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:771
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1008
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1084
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:828
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:696
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:918
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:765
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1140
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1027
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:866
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:708
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1251
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:286
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:538
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:981
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:899
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1137
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:816
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:507
@ AssertZext
Definition: ISDOpcodes.h:62
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:529
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1578
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1558
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
constexpr double ln2
Definition: MathExtras.h:49
constexpr double ln10
Definition: MathExtras.h:50
constexpr float log2ef
Definition: MathExtras.h:66
constexpr double log2e
Definition: MathExtras.h:51
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Offset
Definition: DWP.cpp:480
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
MaybeAlign getAlign(const Function &F, unsigned Index)
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:394
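For example:

  uint64_t A = PowerOf2Ceil(17); // 32
  uint64_t B = PowerOf2Ceil(64); // 64 (already a power of two)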
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
int countl_zero(T Val)
Count the number of 0s from the most significant bit to the least significant, stopping at the first 1.
Definition: bit.h:281
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64-bit value.
Definition: MathExtras.h:154
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64-bit value.
Definition: MathExtras.h:159
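For example:

  uint64_t V = 0x0123456789ABCDEFull;
  uint32_t HiHalf = Hi_32(V); // 0x01234567
  uint32_t LoHalf = Lo_32(V); // 0x89ABCDEF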
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
CombineLevel
Definition: DAGCombine.h:15
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ AfterLegalizeTypes
Definition: DAGCombine.h:17
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:79
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
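A sketch of the usual call pattern, assuming an SDValue N is in scope; the same code handles a scalar constant and a constant splat vector:

  if (ConstantSDNode *C = isConstOrConstSplat(N)) {
    uint64_t Imm = C->getZExtValue();
    // ... use Imm ...
  }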
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
@ DS_Warning
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
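For example, deriving the alignment of an access Offset bytes into a region of known alignment:

  Align A = commonAlignment(Align(8), /*Offset=*/4);  // Align(4)
  Align B = commonAlignment(Align(8), /*Offset=*/16); // Align(8)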
APFloat neg(APFloat X)
Returns the negated value of the argument.
Definition: APFloat.h:1446
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:276
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:250
static const fltSemantics & IEEEdouble() LLVM_READNONE
Definition: APFloat.cpp:277
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:274
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
MCRegister getRegister() const
unsigned getStackOffset() const
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition: ValueTypes.h:34
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
EVT getPow2VectorType(LLVMContext &Context) const
Widens the length of the given vector EVT up to the nearest power of 2 and returns that type.
Definition: ValueTypes.h:462
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:233
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition: ValueTypes.h:415
bool isPow2VectorType() const
Returns true if the given vector type has a power-of-2 number of elements.
Definition: ValueTypes.h:455
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:366
EVT getRoundIntegerType(LLVMContext &Context) const
Rounds the bit-width of the given integer EVT up to the nearest power of two (and at least to eight),...
Definition: ValueTypes.h:404
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
bool bitsGE(EVT VT) const
Return true if this has no fewer bits than VT.
Definition: ValueTypes.h:282
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isExtended() const
Test if the given EVT is extended (as opposed to being simple).
Definition: ValueTypes.h:141
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition: ValueTypes.h:298
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:97
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition: KnownBits.h:231
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:62
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:150
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:40
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:70
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:285
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:237
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition: KnownBits.h:134
bool isStrictlyPositive() const
Returns true if this value is known to be positive.
Definition: KnownBits.h:103
bool isNegative() const
Returns true if this value is known to be negative.
Definition: KnownBits.h:94
unsigned countMaxSignificantBits() const
Returns the maximum number of bits needed to represent all possible signed values with these known bi...
Definition: KnownBits.h:258
This class contains a discriminated union of information about pointers in memory operands,...
bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
These are IR-level optimization flags that may be propagated to SDNodes.
void setAllowContract(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...