1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
24#include "llvm/IR/IntrinsicsAMDGPU.h"
29
30using namespace llvm;
31
32#include "AMDGPUGenCallingConv.inc"
33
35 "amdgpu-bypass-slow-div",
36 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
37 cl::init(true));
38
39// Find a larger type to do a load / store of a vector with.
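// e.g. a 32-bit vector store (v4i8) uses i32, and a 128-bit one (v4f32) uses
// v4i32.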
41 unsigned StoreSize = VT.getStoreSizeInBits();
42 if (StoreSize <= 32)
43 return EVT::getIntegerVT(Ctx, StoreSize);
44
45 assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
46 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
47}
48
51}
52
54 // In order for this to be a signed 24-bit value, bit 23 must
55 // be a sign bit.
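  // e.g. a value known to lie in [-128, 127] has at most 8 significant bits,
  // so any result of 24 or less means the value fits in a signed 24-bit operand.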
56 return DAG.ComputeMaxSignificantBits(Op);
57}
58
60 const AMDGPUSubtarget &STI)
61 : TargetLowering(TM), Subtarget(&STI) {
62 // Lower floating point store/load to integer store/load to reduce the number
63 // of patterns in tablegen.
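  // Promote here means the operation is performed in the equivalent integer
  // type: e.g. a load of f32 is done as an i32 load and the result is bitcast
  // back to f32.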
65 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
66
68 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
69
71 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
72
74 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
75
77 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
78
80 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
81
83 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
84
86 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
87
89 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
90
92 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
93
95 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
96
98 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
99
100 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
101 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
102
103 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
104 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
105
107 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
108
110 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
111
113 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
114
116 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
117
119 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
120
122 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
123
125 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
126
128 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
129
131 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
132
134 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
135
136 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
137 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
138
139 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
140 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
141
143 AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
144
145 // There are no 64-bit extloads. These should be done as a 32-bit extload and
146 // an extension to 64-bit.
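  // e.g. an i8 -> i64 sextload is legalized as an i8 -> i32 sextload followed
  // by a sign extension of the 32-bit result to i64.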
147 for (MVT VT : MVT::integer_valuetypes())
149 Expand);
150
151 for (MVT VT : MVT::integer_valuetypes()) {
152 if (VT == MVT::i64)
153 continue;
154
155 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
156 setLoadExtAction(Op, VT, MVT::i1, Promote);
157 setLoadExtAction(Op, VT, MVT::i8, Legal);
158 setLoadExtAction(Op, VT, MVT::i16, Legal);
159 setLoadExtAction(Op, VT, MVT::i32, Expand);
160 }
161 }
162
164 for (auto MemVT :
165 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
167 Expand);
168
169 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
170 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
171 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
172 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
173 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
174 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
175 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
176 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
177 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
178 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
179 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
180 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
181 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
182 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);
183
184 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
185 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
186 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
187 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
188 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
189 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
190
191 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
192 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
193 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
194 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
195 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
196 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
197 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
198 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
199 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
200 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
201 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
202 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);
203
205 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
206
208 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
209
211 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
212
214 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
215
217 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
218
220 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
221
223 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
224
226 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
227
229 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
230
232 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
233
235 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
236
238 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
239
241 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
242
244 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
245
247 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
248
250 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
251
253 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
254
256 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
257
259 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
260
262 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
263
265 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
266
268 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
269
271 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
272
274 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
275
277 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
278
280 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
281
283 AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);
284
285 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
286 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
287 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
288 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
289
290 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
291 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
292 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
293 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
294
295 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
296 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
297 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
298 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
299 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
300 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
301 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
302 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
303
304 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
305 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
306 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
307
308 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
309 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
310
311 setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);
312
313 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
314 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
315 setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
316 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
317 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
318
319 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
320 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
321 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
322 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
323
324 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
325 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
326
327 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
328 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
329 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
330 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
331 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
332 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
333 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
334
335 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
336 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
337
339
340 // For R600, this is totally unsupported, just custom lower to produce an
341 // error.
343
344 // Library functions. These default to Expand, but we have instructions
345 // for them.
348 MVT::f32, Legal);
349
351 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
352
355 Custom);
356
357 setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
358
359 setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
360
361 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
362
363 if (Subtarget->has16BitInsts())
364 setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
365 else {
366 setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
368 }
369
371 Custom);
372
373 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
374 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
375 // default unless marked custom/legal.
378 {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16, MVT::v2f32, MVT::v3f32,
379 MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
380 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64, MVT::v16f64},
381 Custom);
382
383 // Expand to fneg + fadd.
385
387 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
388 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
389 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
390 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
391 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
392 Custom);
393
394 // FIXME: Why is v8f16/v8bf16 missing?
397 {MVT::v2f16, MVT::v2bf16, MVT::v2i16, MVT::v4f16, MVT::v4bf16,
398 MVT::v4i16, MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32,
399 MVT::v4f32, MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32,
400 MVT::v6i32, MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32,
401 MVT::v9f32, MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32,
402 MVT::v11f32, MVT::v12i32, MVT::v12f32, MVT::v16f16, MVT::v16bf16,
403 MVT::v16i16, MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
404 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
405 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64,
406 MVT::v32i16, MVT::v32f16, MVT::v32bf16},
407 Custom);
408
410 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
411
412 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
413 for (MVT VT : ScalarIntVTs) {
414 // These should use [SU]DIVREM, so set them to expand
416 Expand);
417
418 // GPU does not have divrem function for signed or unsigned.
420
421 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
423
425
426 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
428 }
429
430 // The hardware supports 32-bit FSHR, but not FSHL.
432
433 // The hardware supports 32-bit ROTR, but not ROTL.
434 setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
436
438
442 MVT::i64, Custom);
444
446 Legal);
447
450 MVT::i64, Custom);
451
452 for (auto VT : {MVT::i8, MVT::i16})
454
455 static const MVT::SimpleValueType VectorIntTypes[] = {
456 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
457 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
458
459 for (MVT VT : VectorIntTypes) {
460 // Expand the following operations for the current type by default.
472 ISD::SETCC},
473 VT, Expand);
474 }
475
476 static const MVT::SimpleValueType FloatVectorTypes[] = {
477 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
478 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
479
480 for (MVT VT : FloatVectorTypes) {
493 VT, Expand);
494 }
495
496 // This causes an unrolled select operation to be used rather than expansion
497 // with bit operations. This is in general better, but the alternative using
498 // BFI instructions may be better if the select sources are SGPRs.
500 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
501
503 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
504
506 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
507
509 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
510
512 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
513
515 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
516
518 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
519
521 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
522
524 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
525
527 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
528
529 // Disable most libcalls.
530 for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) {
531 if (I < RTLIB::ATOMIC_LOAD || I > RTLIB::ATOMIC_FETCH_NAND_16)
532 setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
533 }
534
536 setJumpIsExpensive(true);
537
538 // FIXME: This is only partially true. If we have to do vector compares, any
539 // SGPR pair can be a condition register. If we have a uniform condition, we
540 // are better off doing SALU operations, where there is only one SCC. For now,
541 // we don't have a way of knowing during instruction selection if a condition
542 // will be uniform and we always use vector compares. Assume we are using
543 // vector compares until that is fixed.
545
548
550
551 // We want to find all load dependencies for long chains of stores to enable
552 // merging into very wide vectors. The problem is with vectors with > 4
553 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
554 // vectors are a legal type, even though we usually have to split the
555 // loads. When we can more precisely specify load legality per address
556 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
557 // smarter so that they can figure out what to do in 2 iterations without all
558 // N > 4 stores on the same chain.
560
561 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
562 // about these during lowering.
563 MaxStoresPerMemcpy = 0xffffffff;
564 MaxStoresPerMemmove = 0xffffffff;
565 MaxStoresPerMemset = 0xffffffff;
566
567 // The expansion for 64-bit division is enormous.
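  // With addBypassSlowDiv(64, 32), a runtime check is emitted so that 64-bit
  // divisions whose operands actually fit in 32 bits use the much cheaper
  // 32-bit division instead.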
569 addBypassSlowDiv(64, 32);
570
581
584}
585
587 if (getTargetMachine().Options.NoSignedZerosFPMath)
588 return true;
589
590 const auto Flags = Op.getNode()->getFlags();
591 if (Flags.hasNoSignedZeros())
592 return true;
593
594 return false;
595}
596
597//===----------------------------------------------------------------------===//
598// Target Information
599//===----------------------------------------------------------------------===//
600
602static bool fnegFoldsIntoOpcode(unsigned Opc) {
603 switch (Opc) {
604 case ISD::FADD:
605 case ISD::FSUB:
606 case ISD::FMUL:
607 case ISD::FMA:
608 case ISD::FMAD:
609 case ISD::FMINNUM:
610 case ISD::FMAXNUM:
613 case ISD::FMINIMUM:
614 case ISD::FMAXIMUM:
615 case ISD::SELECT:
616 case ISD::FSIN:
617 case ISD::FTRUNC:
618 case ISD::FRINT:
619 case ISD::FNEARBYINT:
620 case ISD::FROUNDEVEN:
622 case AMDGPUISD::RCP:
629 case AMDGPUISD::FMED3:
630 // TODO: handle llvm.amdgcn.fma.legacy
631 return true;
632 case ISD::BITCAST:
633 llvm_unreachable("bitcast is special cased");
634 default:
635 return false;
636 }
637}
638
639static bool fnegFoldsIntoOp(const SDNode *N) {
640 unsigned Opc = N->getOpcode();
641 if (Opc == ISD::BITCAST) {
642 // TODO: Is there a benefit to checking the conditions performFNegCombine
643 // does? We don't for the other cases.
644 SDValue BCSrc = N->getOperand(0);
645 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
646 return BCSrc.getNumOperands() == 2 &&
647 BCSrc.getOperand(1).getValueSizeInBits() == 32;
648 }
649
650 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
651 }
652
653 return fnegFoldsIntoOpcode(Opc);
654}
655
656/// \returns true if the operation will definitely need to use a 64-bit
657/// encoding, and thus will use a VOP3 encoding regardless of the source
658/// modifiers.
660static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
661 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
662 VT == MVT::f64;
663}
664
665/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for
666/// the given type when used for ISD::SELECT.
668static bool selectSupportsSourceMods(const SDNode *N) {
669 // TODO: Only applies if select will be vector
670 return N->getValueType(0) == MVT::f32;
671}
672
673// Most FP instructions support source modifiers, but this could be refined
674// slightly.
676static bool hasSourceMods(const SDNode *N) {
677 if (isa<MemSDNode>(N))
678 return false;
679
680 switch (N->getOpcode()) {
681 case ISD::CopyToReg:
682 case ISD::FDIV:
683 case ISD::FREM:
684 case ISD::INLINEASM:
688
689 // TODO: Should really be looking at the users of the bitcast. These are
690 // problematic because bitcasts are used to legalize all stores to integer
691 // types.
692 case ISD::BITCAST:
693 return false;
695 switch (N->getConstantOperandVal(0)) {
696 case Intrinsic::amdgcn_interp_p1:
697 case Intrinsic::amdgcn_interp_p2:
698 case Intrinsic::amdgcn_interp_mov:
699 case Intrinsic::amdgcn_interp_p1_f16:
700 case Intrinsic::amdgcn_interp_p2_f16:
701 return false;
702 default:
703 return true;
704 }
705 }
706 case ISD::SELECT:
708 default:
709 return true;
710 }
711}
712
714 unsigned CostThreshold) {
715 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
716 // it is truly free to use a source modifier in all cases. If there are
717 // multiple users and each one would necessitate using VOP3, there will be
718 // a code size increase. Try to avoid increasing code size unless we know it
719 // will save on the instruction count.
720 unsigned NumMayIncreaseSize = 0;
721 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
722
723 assert(!N->use_empty());
724
725 // XXX - Should this limit number of uses to check?
726 for (const SDNode *U : N->uses()) {
727 if (!hasSourceMods(U))
728 return false;
729
730 if (!opMustUseVOP3Encoding(U, VT)) {
731 if (++NumMayIncreaseSize > CostThreshold)
732 return false;
733 }
734 }
735
736 return true;
737}
738
740 ISD::NodeType ExtendKind) const {
741 assert(!VT.isVector() && "only scalar expected");
742
743 // Round to the next multiple of 32-bits.
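  // e.g. an i40 return value is extended to i64, while i96 is already a
  // multiple of 32 bits and is left unchanged.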
744 unsigned Size = VT.getSizeInBits();
745 if (Size <= 32)
746 return MVT::i32;
747 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
748}
749
751 return MVT::i32;
752}
753
755 return true;
756}
757
758// The backend supports 32- and 64-bit floating-point immediates.
759// FIXME: Why are we reporting vectors of FP immediates as legal?
761 bool ForCodeSize) const {
762 EVT ScalarVT = VT.getScalarType();
763 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
764 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
765}
766
767// We don't want to shrink f64 / f32 constants.
769 EVT ScalarVT = VT.getScalarType();
770 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
771}
772
774 ISD::LoadExtType ExtTy,
775 EVT NewVT) const {
776 // TODO: This may be worth removing. Check regression tests for diffs.
778 return false;
779
780 unsigned NewSize = NewVT.getStoreSizeInBits();
781
782 // If we are reducing to a 32-bit load or a smaller multi-dword load,
783 // this is always better.
784 if (NewSize >= 32)
785 return true;
786
787 EVT OldVT = N->getValueType(0);
788 unsigned OldSize = OldVT.getStoreSizeInBits();
789
790 MemSDNode *MN = cast<MemSDNode>(N);
791 unsigned AS = MN->getAddressSpace();
792 // Do not shrink an aligned scalar load to sub-dword.
793 // Scalar engine cannot do sub-dword loads.
794 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
795 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
798 (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
799 MN->isInvariant())) &&
801 return false;
802
803 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
804 // extloads, so doing one requires using a buffer_load. In cases where we
805 // still couldn't use a scalar load, using the wider load shouldn't really
806 // hurt anything.
807
808 // If the old size already had to be an extload, there's no harm in continuing
809 // to reduce the width.
810 return (OldSize < 32);
811}
812
814 const SelectionDAG &DAG,
815 const MachineMemOperand &MMO) const {
816
817 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
818
819 if (LoadTy.getScalarType() == MVT::i32)
820 return false;
821
822 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
823 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
824
825 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
826 return false;
827
828 unsigned Fast = 0;
830 CastTy, MMO, &Fast) &&
831 Fast;
832}
833
834// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
835// profitable with the expansion for 64-bit since it's generally good to
836// speculate things.
838 return true;
839}
840
842 return true;
843}
844
846 switch (N->getOpcode()) {
847 case ISD::EntryToken:
848 case ISD::TokenFactor:
849 return true;
851 unsigned IntrID = N->getConstantOperandVal(0);
852 switch (IntrID) {
853 case Intrinsic::amdgcn_readfirstlane:
854 case Intrinsic::amdgcn_readlane:
855 return true;
856 }
857 return false;
858 }
859 case ISD::LOAD:
860 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
862 return true;
863 return false;
864 case AMDGPUISD::SETCC: // ballot-style instruction
865 return true;
866 }
867 return false;
868}
869
871 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
872 NegatibleCost &Cost, unsigned Depth) const {
873
874 switch (Op.getOpcode()) {
875 case ISD::FMA:
876 case ISD::FMAD: {
877 // Negating a fma is not free if it has users without source mods.
878 if (!allUsesHaveSourceMods(Op.getNode()))
879 return SDValue();
880 break;
881 }
882 case AMDGPUISD::RCP: {
883 SDValue Src = Op.getOperand(0);
884 EVT VT = Op.getValueType();
885 SDLoc SL(Op);
886
887 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
888 ForCodeSize, Cost, Depth + 1);
889 if (NegSrc)
890 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
891 return SDValue();
892 }
893 default:
894 break;
895 }
896
897 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
898 ForCodeSize, Cost, Depth);
899}
900
901//===---------------------------------------------------------------------===//
902// Target Properties
903//===---------------------------------------------------------------------===//
904
907
908 // Packed operations do not have a fabs modifier.
909 return VT == MVT::f32 || VT == MVT::f64 ||
910 (Subtarget->has16BitInsts() && VT == MVT::f16);
911}
912
915 // Report this based on the end legalized type.
916 VT = VT.getScalarType();
917 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
918}
919
921 unsigned NumElem,
922 unsigned AS) const {
923 return true;
924}
925
927 // There are few operations which truly have vector input operands. Any vector
928 // operation is going to involve operations on each component, and a
929 // build_vector will be a copy per element, so it always makes sense to use a
930 // build_vector input in place of the extracted element to avoid a copy into a
931 // super register.
932 //
933 // We should probably only do this if all users are extracts only, but this
934 // should be the common case.
935 return true;
936}
937
939 // Truncate is just accessing a subregister.
940
941 unsigned SrcSize = Source.getSizeInBits();
942 unsigned DestSize = Dest.getSizeInBits();
943
944 return DestSize < SrcSize && DestSize % 32 == 0;
945}
946
948 // Truncate is just accessing a subregister.
949
950 unsigned SrcSize = Source->getScalarSizeInBits();
951 unsigned DestSize = Dest->getScalarSizeInBits();
952
953 if (DestSize == 16 && Subtarget->has16BitInsts())
954 return SrcSize >= 32;
955
956 return DestSize < SrcSize && DestSize % 32 == 0;
957}
958
960 unsigned SrcSize = Src->getScalarSizeInBits();
961 unsigned DestSize = Dest->getScalarSizeInBits();
962
963 if (SrcSize == 16 && Subtarget->has16BitInsts())
964 return DestSize >= 32;
965
966 return SrcSize == 32 && DestSize == 64;
967}
968
970 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
971 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
972 // this will enable reducing 64-bit operations to 32-bit, which is always
973 // good.
974
975 if (Src == MVT::i16)
976 return Dest == MVT::i32 || Dest == MVT::i64;
977
978 return Src == MVT::i32 && Dest == MVT::i64;
979}
980
982 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
983 // limited number of native 64-bit operations. Shrinking an operation to fit
984 // in a single 32-bit register should always be helpful. As currently used,
985 // this is much less general than the name suggests, and is only used in
986 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
987 // not profitable, and may actually be harmful.
988 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
989}
990
992 const SDNode* N, CombineLevel Level) const {
993 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
994 N->getOpcode() == ISD::SRL) &&
995 "Expected shift op");
996 // Always commute pre-type legalization and right shifts.
997 // We're looking for shl(or(x,y),z) patterns.
999 N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
1000 return true;
1001
1002 // If the only user is an i32 right-shift, then don't destroy a BFE pattern.
1003 if (N->getValueType(0) == MVT::i32 && N->use_size() == 1 &&
1004 (N->use_begin()->getOpcode() == ISD::SRA ||
1005 N->use_begin()->getOpcode() == ISD::SRL))
1006 return false;
1007
1008 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
1009 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
1010 if (LHS.getOpcode() != ISD::SHL)
1011 return false;
1012 auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
1013 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
1014 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
1015 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
1016 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
1017 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
1018 };
1019 SDValue LHS = N->getOperand(0).getOperand(0);
1020 SDValue RHS = N->getOperand(0).getOperand(1);
1021 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
1022}
1023
1024//===---------------------------------------------------------------------===//
1025// TargetLowering Callbacks
1026//===---------------------------------------------------------------------===//
1027
1029 bool IsVarArg) {
1030 switch (CC) {
1038 return CC_AMDGPU;
1041 return CC_AMDGPU_CS_CHAIN;
1042 case CallingConv::C:
1043 case CallingConv::Fast:
1044 case CallingConv::Cold:
1045 return CC_AMDGPU_Func;
1047 return CC_SI_Gfx;
1050 default:
1051 report_fatal_error("Unsupported calling convention for call");
1052 }
1053}
1054
1056 bool IsVarArg) {
1057 switch (CC) {
1060 llvm_unreachable("kernels should not be handled here");
1070 return RetCC_SI_Shader;
1072 return RetCC_SI_Gfx;
1073 case CallingConv::C:
1074 case CallingConv::Fast:
1075 case CallingConv::Cold:
1076 return RetCC_AMDGPU_Func;
1077 default:
1078 report_fatal_error("Unsupported calling convention.");
1079 }
1080}
1081
1082/// The SelectionDAGBuilder will automatically promote function arguments
1083/// with illegal types. However, this does not work for the AMDGPU targets
1084/// since the function arguments are stored in memory as these illegal types.
1085/// In order to handle this properly we need to get the original type sizes
1086/// from the LLVM IR Function and fix up the ISD::InputArg values before
1087/// passing them to AnalyzeFormalArguments().
1088
1089/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1090/// input values across multiple registers. Each item in the Ins array
1091/// represents a single value that will be stored in registers. Ins[x].VT is
1092/// the value type of the value that will be stored in the register, so
1093/// whatever SDNode we lower the argument to needs to be this type.
1094///
1095/// In order to correctly lower the arguments we need to know the size of each
1096/// argument. Since Ins[x].VT gives us the size of the register that will
1097/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1098/// for the original function argument so that we can deduce the correct memory
1099/// type to use for Ins[x]. In most cases the correct memory type will be
1100/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1101/// we have a kernel argument of type v8i8, this argument will be split into
1102/// 8 parts and each part will be represented by its own item in the Ins array.
1103/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1104/// the argument before it was split. From this, we deduce that the memory type
1105/// for each individual part is i8. We pass the memory type as LocVT to the
1106/// calling convention analysis function and the register type (Ins[x].VT) as
1107/// the ValVT.
1109 CCState &State,
1110 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1111 const MachineFunction &MF = State.getMachineFunction();
1112 const Function &Fn = MF.getFunction();
1113 LLVMContext &Ctx = Fn.getParent()->getContext();
1114 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
1115 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
1117
1118 Align MaxAlign = Align(1);
1119 uint64_t ExplicitArgOffset = 0;
1120 const DataLayout &DL = Fn.getParent()->getDataLayout();
1121
1122 unsigned InIndex = 0;
1123
1124 for (const Argument &Arg : Fn.args()) {
1125 const bool IsByRef = Arg.hasByRefAttr();
1126 Type *BaseArgTy = Arg.getType();
1127 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1128 Align Alignment = DL.getValueOrABITypeAlignment(
1129 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1130 MaxAlign = std::max(Alignment, MaxAlign);
1131 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1132
1133 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1134 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1135
1136 // We're basically throwing away everything passed into us and starting over
1137 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1138 // to us as computed in Ins.
1139 //
1140 // We also need to figure out what type legalization is trying to do to get
1141 // the correct memory offsets.
1142
1143 SmallVector<EVT, 16> ValueVTs;
1145 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
1146
1147 for (unsigned Value = 0, NumValues = ValueVTs.size();
1148 Value != NumValues; ++Value) {
1149 uint64_t BasePartOffset = Offsets[Value];
1150
1151 EVT ArgVT = ValueVTs[Value];
1152 EVT MemVT = ArgVT;
1153 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1154 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1155
1156 if (NumRegs == 1) {
1157 // This argument is not split, so the IR type is the memory type.
1158 if (ArgVT.isExtended()) {
1159 // We have an extended type, like i24, so we should just use the
1160 // register type.
1161 MemVT = RegisterVT;
1162 } else {
1163 MemVT = ArgVT;
1164 }
1165 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1166 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1167 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1168 // We have a vector value which has been split into a vector with
1169 // the same scalar type, but fewer elements. This should handle
1170 // all the floating-point vector types.
1171 MemVT = RegisterVT;
1172 } else if (ArgVT.isVector() &&
1173 ArgVT.getVectorNumElements() == NumRegs) {
1174 // This arg has been split so that each element is stored in a separate
1175 // register.
1176 MemVT = ArgVT.getScalarType();
1177 } else if (ArgVT.isExtended()) {
1178 // We have an extended type, like i65.
1179 MemVT = RegisterVT;
1180 } else {
1181 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1182 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1183 if (RegisterVT.isInteger()) {
1184 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1185 } else if (RegisterVT.isVector()) {
1186 assert(!RegisterVT.getScalarType().isFloatingPoint());
1187 unsigned NumElements = RegisterVT.getVectorNumElements();
1188 assert(MemoryBits % NumElements == 0);
1189 // This vector type has been split into another vector type with
1190 // a different element size.
1191 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1192 MemoryBits / NumElements);
1193 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1194 } else {
1195 llvm_unreachable("cannot deduce memory type.");
1196 }
1197 }
1198
1199 // Convert one element vectors to scalar.
1200 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1201 MemVT = MemVT.getScalarType();
1202
1203 // Round up vec3/vec5 argument.
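      // e.g. v3i32 is widened to v4i32, v5i32 to v8i32, and v9i32..v12i32 to
      // v16i32.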
1204 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1205 assert(MemVT.getVectorNumElements() == 3 ||
1206 MemVT.getVectorNumElements() == 5 ||
1207 (MemVT.getVectorNumElements() >= 9 &&
1208 MemVT.getVectorNumElements() <= 12));
1209 MemVT = MemVT.getPow2VectorType(State.getContext());
1210 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1211 MemVT = MemVT.getRoundIntegerType(State.getContext());
1212 }
1213
1214 unsigned PartOffset = 0;
1215 for (unsigned i = 0; i != NumRegs; ++i) {
1216 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1217 BasePartOffset + PartOffset,
1218 MemVT.getSimpleVT(),
1220 PartOffset += MemVT.getStoreSize();
1221 }
1222 }
1223 }
1224}
1225
1227 SDValue Chain, CallingConv::ID CallConv,
1228 bool isVarArg,
1230 const SmallVectorImpl<SDValue> &OutVals,
1231 const SDLoc &DL, SelectionDAG &DAG) const {
1232 // FIXME: Fails for r600 tests
1233 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1234 // "wave terminate should not have return values");
1235 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1236}
1237
1238//===---------------------------------------------------------------------===//
1239// Target specific lowering
1240//===---------------------------------------------------------------------===//
1241
1242/// Selects the correct CCAssignFn for a given CallingConvention value.
1244 bool IsVarArg) {
1245 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1246}
1247
1249 bool IsVarArg) {
1251}
1252
1254 SelectionDAG &DAG,
1255 MachineFrameInfo &MFI,
1256 int ClobberedFI) const {
1257 SmallVector<SDValue, 8> ArgChains;
1258 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1259 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1260
1261 // Include the original chain at the beginning of the list. When this is
1262 // used by target LowerCall hooks, this helps the legalizer find the
1263 // CALLSEQ_BEGIN node.
1264 ArgChains.push_back(Chain);
1265
1266 // Add a chain value for each stack argument load that overlaps the
1267 // clobbered frame object.
1267 for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
1268 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1269 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1270 if (FI->getIndex() < 0) {
1271 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1272 int64_t InLastByte = InFirstByte;
1273 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1274
1275 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1276 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1277 ArgChains.push_back(SDValue(L, 1));
1278 }
1279 }
1280 }
1281 }
1282
1283 // Build a tokenfactor for all the chains.
1284 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1285}
1286
1289 StringRef Reason) const {
1290 SDValue Callee = CLI.Callee;
1291 SelectionDAG &DAG = CLI.DAG;
1292
1293 const Function &Fn = DAG.getMachineFunction().getFunction();
1294
1295 StringRef FuncName("<unknown>");
1296
1297 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1298 FuncName = G->getSymbol();
1299 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1300 FuncName = G->getGlobal()->getName();
1301
1303 Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1304 DAG.getContext()->diagnose(NoCalls);
1305
1306 if (!CLI.IsTailCall) {
1307 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
1308 InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
1309 }
1310
1311 return DAG.getEntryNode();
1312}
1313
1315 SmallVectorImpl<SDValue> &InVals) const {
1316 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1317}
1318
1320 SelectionDAG &DAG) const {
1321 const Function &Fn = DAG.getMachineFunction().getFunction();
1322
1323 DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1324 SDLoc(Op).getDebugLoc());
1325 DAG.getContext()->diagnose(NoDynamicAlloca);
1326 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1327 return DAG.getMergeValues(Ops, SDLoc());
1328}
1329
1331 SelectionDAG &DAG) const {
1332 switch (Op.getOpcode()) {
1333 default:
1334 Op->print(errs(), &DAG);
1335 llvm_unreachable("Custom lowering code for this "
1336 "instruction is not implemented yet!");
1337 break;
1339 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1341 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1342 case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1343 case ISD::FREM: return LowerFREM(Op, DAG);
1344 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1345 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1346 case ISD::FRINT: return LowerFRINT(Op, DAG);
1347 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1348 case ISD::FROUNDEVEN:
1349 return LowerFROUNDEVEN(Op, DAG);
1350 case ISD::FROUND: return LowerFROUND(Op, DAG);
1351 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1352 case ISD::FLOG2:
1353 return LowerFLOG2(Op, DAG);
1354 case ISD::FLOG:
1355 case ISD::FLOG10:
1356 return LowerFLOGCommon(Op, DAG);
1357 case ISD::FEXP:
1358 case ISD::FEXP10:
1359 return lowerFEXP(Op, DAG);
1360 case ISD::FEXP2:
1361 return lowerFEXP2(Op, DAG);
1362 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1363 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1364 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1365 case ISD::FP_TO_SINT:
1366 case ISD::FP_TO_UINT:
1367 return LowerFP_TO_INT(Op, DAG);
1368 case ISD::CTTZ:
1370 case ISD::CTLZ:
1372 return LowerCTLZ_CTTZ(Op, DAG);
1374 }
1375 return Op;
1376}
1377
1380 SelectionDAG &DAG) const {
1381 switch (N->getOpcode()) {
1383 // Different parts of legalization seem to interpret which type of
1384 // sign_extend_inreg is the one to check for custom lowering. The extended
1385 // from type is what really matters, but some places check for custom
1386 // lowering of the result type. This results in trying to use
1387 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1388 // nothing here and let the illegal result integer be handled normally.
1389 return;
1390 case ISD::FLOG2:
1391 if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
1392 Results.push_back(Lowered);
1393 return;
1394 case ISD::FLOG:
1395 case ISD::FLOG10:
1396 if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
1397 Results.push_back(Lowered);
1398 return;
1399 case ISD::FEXP2:
1400 if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
1401 Results.push_back(Lowered);
1402 return;
1403 case ISD::FEXP:
1404 case ISD::FEXP10:
1405 if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
1406 Results.push_back(Lowered);
1407 return;
1408 case ISD::CTLZ:
1410 if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
1411 Results.push_back(Lowered);
1412 return;
1413 default:
1414 return;
1415 }
1416}
1417
1419 SDValue Op,
1420 SelectionDAG &DAG) const {
1421
1422 const DataLayout &DL = DAG.getDataLayout();
1423 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1424 const GlobalValue *GV = G->getGlobal();
1425
1426 if (!MFI->isModuleEntryFunction()) {
1427 if (std::optional<uint32_t> Address =
1429 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1430 }
1431 }
1432
1433 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1434 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1435 if (!MFI->isModuleEntryFunction() &&
1436 !GV->getName().equals("llvm.amdgcn.module.lds")) {
1437 SDLoc DL(Op);
1438 const Function &Fn = DAG.getMachineFunction().getFunction();
1439 DiagnosticInfoUnsupported BadLDSDecl(
1440 Fn, "local memory global used by non-kernel function",
1441 DL.getDebugLoc(), DS_Warning);
1442 DAG.getContext()->diagnose(BadLDSDecl);
1443
1444 // We currently don't have a way to correctly allocate LDS objects that
1445 // aren't directly associated with a kernel. We do force inlining of
1446 // functions that use local objects. However, if these dead functions are
1447 // not eliminated, we don't want a compile time error. Just emit a warning
1448 // and a trap, since there should be no callable path here.
1449 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1450 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1451 Trap, DAG.getRoot());
1452 DAG.setRoot(OutputChain);
1453 return DAG.getUNDEF(Op.getValueType());
1454 }
1455
1456 // XXX: What does the value of G->getOffset() mean?
1457 assert(G->getOffset() == 0 &&
1458 "Do not know what to do with an non-zero offset");
1459
1460 // TODO: We could emit code to handle the initialization somewhere.
1461 // We ignore the initializer for now and legalize it to allow selection.
1462 // The initializer will be rejected with an error during assembly emission anyway.
1463 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1464 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1465 }
1466 return SDValue();
1467}
1468
1470 SelectionDAG &DAG) const {
1472 SDLoc SL(Op);
1473
1474 EVT VT = Op.getValueType();
1475 if (VT.getVectorElementType().getSizeInBits() < 32) {
1476 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1477 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1478 unsigned NewNumElt = OpBitSize / 32;
1479 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1481 MVT::i32, NewNumElt);
1482 for (const SDUse &U : Op->ops()) {
1483 SDValue In = U.get();
1484 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
1485 if (NewNumElt > 1)
1486 DAG.ExtractVectorElements(NewIn, Args);
1487 else
1488 Args.push_back(NewIn);
1489 }
1490
1491 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1492 NewNumElt * Op.getNumOperands());
1493 SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1494 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1495 }
1496 }
1497
1498 for (const SDUse &U : Op->ops())
1499 DAG.ExtractVectorElements(U.get(), Args);
1500
1501 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1502}
1503
1505 SelectionDAG &DAG) const {
1506 SDLoc SL(Op);
1508 unsigned Start = Op.getConstantOperandVal(1);
1509 EVT VT = Op.getValueType();
1510 EVT SrcVT = Op.getOperand(0).getValueType();
1511
1512 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1513 unsigned NumElt = VT.getVectorNumElements();
1514 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1515 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1516
1517 // Extract 32-bit registers at a time.
1518 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1519 EVT NewVT = NumElt == 2
1520 ? MVT::i32
1521 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1522 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1523
1524 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
1525 if (NumElt == 2)
1526 Tmp = Args[0];
1527 else
1528 Tmp = DAG.getBuildVector(NewVT, SL, Args);
1529
1530 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1531 }
1532
1533 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1535
1536 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1537}
1538
1539// TODO: Handle fabs too
1541 if (Val.getOpcode() == ISD::FNEG)
1542 return Val.getOperand(0);
1543
1544 return Val;
1545}
1546
1548 if (Val.getOpcode() == ISD::FNEG)
1549 Val = Val.getOperand(0);
1550 if (Val.getOpcode() == ISD::FABS)
1551 Val = Val.getOperand(0);
1552 if (Val.getOpcode() == ISD::FCOPYSIGN)
1553 Val = Val.getOperand(0);
1554 return Val;
1555}
1556
1558 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1559 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1560 SelectionDAG &DAG = DCI.DAG;
1561 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1562 switch (CCOpcode) {
1563 case ISD::SETOEQ:
1564 case ISD::SETONE:
1565 case ISD::SETUNE:
1566 case ISD::SETNE:
1567 case ISD::SETUEQ:
1568 case ISD::SETEQ:
1569 case ISD::SETFALSE:
1570 case ISD::SETFALSE2:
1571 case ISD::SETTRUE:
1572 case ISD::SETTRUE2:
1573 case ISD::SETUO:
1574 case ISD::SETO:
1575 break;
1576 case ISD::SETULE:
1577 case ISD::SETULT: {
1578 if (LHS == True)
1579 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1580 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1581 }
1582 case ISD::SETOLE:
1583 case ISD::SETOLT:
1584 case ISD::SETLE:
1585 case ISD::SETLT: {
1586 // Ordered. Assume ordered for undefined.
1587
1588 // Only do this after legalization to avoid interfering with other combines
1589 // which might occur.
1591 !DCI.isCalledByLegalizer())
1592 return SDValue();
1593
1594 // We need to permute the operands to get the correct NaN behavior. The
1595 // selected operand is the second one based on the failing compare with NaN,
1596 // so permute it based on the compare type the hardware uses.
1597 if (LHS == True)
1598 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1599 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1600 }
1601 case ISD::SETUGE:
1602 case ISD::SETUGT: {
1603 if (LHS == True)
1604 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1605 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1606 }
1607 case ISD::SETGT:
1608 case ISD::SETGE:
1609 case ISD::SETOGE:
1610 case ISD::SETOGT: {
1612 !DCI.isCalledByLegalizer())
1613 return SDValue();
1614
1615 if (LHS == True)
1616 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1617 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1618 }
1619 case ISD::SETCC_INVALID:
1620 llvm_unreachable("Invalid setcc condcode!");
1621 }
1622 return SDValue();
1623}
1624
1625/// Generate Min/Max node
1627 SDValue LHS, SDValue RHS,
1628 SDValue True, SDValue False,
1629 SDValue CC,
1630 DAGCombinerInfo &DCI) const {
1631 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1632 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1633
1634 SelectionDAG &DAG = DCI.DAG;
1635
1636 // If we can't directly match this, try to see if we can fold an fneg to
1637 // match.
1638
1639 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
1640 ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False);
1641 SDValue NegTrue = peekFNeg(True);
1642
1643 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1644 // fmin/fmax.
1645 //
1646 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1647 // -> fneg (fmin_legacy lhs, K)
1648 //
1649 // TODO: Use getNegatedExpression
1650 if (LHS == NegTrue && CFalse && CRHS) {
1651 APFloat NegRHS = neg(CRHS->getValueAPF());
1652 if (NegRHS == CFalse->getValueAPF()) {
1653 SDValue Combined =
1654 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1655 if (Combined)
1656 return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1657 return SDValue();
1658 }
1659 }
1660
1661 return SDValue();
1662}
1663
1664std::pair<SDValue, SDValue>
1666 SDLoc SL(Op);
1667
1668 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1669
1670 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1671 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1672
1673 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1674 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1675
1676 return std::pair(Lo, Hi);
1677}
1678
1680 SDLoc SL(Op);
1681
1682 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1683 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1684 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1685}
1686
1688 SDLoc SL(Op);
1689
1690 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1691 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1692 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1693}
1694
1695// Split a vector type into two parts. The first part is a power-of-two vector.
1696// The second part is whatever is left over, and is a scalar if it would
1697// otherwise be a 1-vector.
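// e.g. v8i32 splits into (v4i32, v4i32), v7i32 into (v4i32, v3i32), and
// v3i32 into (v2i32, i32).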
1698std::pair<EVT, EVT>
1700 EVT LoVT, HiVT;
1701 EVT EltVT = VT.getVectorElementType();
1702 unsigned NumElts = VT.getVectorNumElements();
1703 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1704 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1705 HiVT = NumElts - LoNumElts == 1
1706 ? EltVT
1707 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1708 return std::pair(LoVT, HiVT);
1709}
1710
1711// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1712// scalar.
1713std::pair<SDValue, SDValue>
1715 const EVT &LoVT, const EVT &HiVT,
1716 SelectionDAG &DAG) const {
1718 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1719 N.getValueType().getVectorNumElements() &&
1720 "More vector elements requested than available!");
1722 DAG.getVectorIdxConstant(0, DL));
1723 SDValue Hi = DAG.getNode(
1725 HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
1726 return std::pair(Lo, Hi);
1727}
1728
1730 SelectionDAG &DAG) const {
1731 LoadSDNode *Load = cast<LoadSDNode>(Op);
1732 EVT VT = Op.getValueType();
1733 SDLoc SL(Op);
1734
1735
1736 // If this is a 2 element vector, we really want to scalarize and not create
1737 // weird 1 element vectors.
1738 if (VT.getVectorNumElements() == 2) {
1739 SDValue Ops[2];
1740 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1741 return DAG.getMergeValues(Ops, SL);
1742 }
1743
1744 SDValue BasePtr = Load->getBasePtr();
1745 EVT MemVT = Load->getMemoryVT();
1746
1747 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1748
1749 EVT LoVT, HiVT;
1750 EVT LoMemVT, HiMemVT;
1751 SDValue Lo, Hi;
1752
1753 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1754 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1755 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1756
1757 unsigned Size = LoMemVT.getStoreSize();
1758 Align BaseAlign = Load->getAlign();
1759 Align HiAlign = commonAlignment(BaseAlign, Size);
1760
1761 SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1762 Load->getChain(), BasePtr, SrcValue, LoMemVT,
1763 BaseAlign, Load->getMemOperand()->getFlags());
1764 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
1765 SDValue HiLoad =
1766 DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1767 HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1768 HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1769
1770 SDValue Join;
1771 if (LoVT == HiVT) {
1772 // This is the case where the vector length is a power of two, so it was evenly split.
1773 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1774 } else {
1775 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1776 DAG.getVectorIdxConstant(0, SL));
1777 Join = DAG.getNode(
1779 VT, Join, HiLoad,
1781 }
1782
1783 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1784 LoLoad.getValue(1), HiLoad.getValue(1))};
1785
1786 return DAG.getMergeValues(Ops, SL);
1787}
1788
1790 SelectionDAG &DAG) const {
1791 LoadSDNode *Load = cast<LoadSDNode>(Op);
1792 EVT VT = Op.getValueType();
1793 SDValue BasePtr = Load->getBasePtr();
1794 EVT MemVT = Load->getMemoryVT();
1795 SDLoc SL(Op);
1796 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1797 Align BaseAlign = Load->getAlign();
1798 unsigned NumElements = MemVT.getVectorNumElements();
1799
1800 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1801 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1802 if (NumElements != 3 ||
1803 (BaseAlign < Align(8) &&
1804 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1805 return SplitVectorLoad(Op, DAG);
1806
1807 assert(NumElements == 3);
1808
1809 EVT WideVT =
1811 EVT WideMemVT =
1813 SDValue WideLoad = DAG.getExtLoad(
1814 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1815 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1816 return DAG.getMergeValues(
1817 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1818 DAG.getVectorIdxConstant(0, SL)),
1819 WideLoad.getValue(1)},
1820 SL);
1821}
1822
1824 SelectionDAG &DAG) const {
1825 StoreSDNode *Store = cast<StoreSDNode>(Op);
1826 SDValue Val = Store->getValue();
1827 EVT VT = Val.getValueType();
1828
1829 // If this is a 2 element vector, we really want to scalarize and not create
1830 // weird 1 element vectors.
1831 if (VT.getVectorNumElements() == 2)
1832 return scalarizeVectorStore(Store, DAG);
1833
1834 EVT MemVT = Store->getMemoryVT();
1835 SDValue Chain = Store->getChain();
1836 SDValue BasePtr = Store->getBasePtr();
1837 SDLoc SL(Op);
1838
1839 EVT LoVT, HiVT;
1840 EVT LoMemVT, HiMemVT;
1841 SDValue Lo, Hi;
1842
1843 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1844 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1845 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1846
1847 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1848
1849 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1850 Align BaseAlign = Store->getAlign();
1851 unsigned Size = LoMemVT.getStoreSize();
1852 Align HiAlign = commonAlignment(BaseAlign, Size);
1853
1854 SDValue LoStore =
1855 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1856 Store->getMemOperand()->getFlags());
1857 SDValue HiStore =
1858 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1859 HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1860
1861 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1862}
1863
1864// This is a shortcut for integer division because we have fast i32<->f32
1865// conversions, and fast f32 reciprocal instructions. The fractional part of a
1866// float is enough to accurately represent up to a 24-bit signed integer.
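// The bound comes from the 24-bit significand of an IEEE-754 binary32: every
// integer of magnitude below 2^24 is exactly representable, so the i32<->f32
// round trips below stay exact when both operands carry at least 9 sign bits.
// A rough scalar sketch of the signed path (illustrative only, not the
// lowering itself):
//
//   int div24(int a, int b) {               // assumes |a|, |b| < 2^23
//     int jq = ((a ^ b) >> 30) | 1;         // +1/-1, sign of the quotient
//     float fa = (float)a, fb = (float)b;
//     float fq = truncf(fa * (1.0f / fb));  // candidate quotient
//     float fr = fabsf(fmaf(-fq, fb, fa));  // |a - fq*b|
//     return (int)fq + ((fr >= fabsf(fb)) ? jq : 0);
//   }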
1867 SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1868 bool Sign) const {
1869 SDLoc DL(Op);
1870 EVT VT = Op.getValueType();
1871 SDValue LHS = Op.getOperand(0);
1872 SDValue RHS = Op.getOperand(1);
1873 MVT IntVT = MVT::i32;
1874 MVT FltVT = MVT::f32;
1875
1876 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1877 if (LHSSignBits < 9)
1878 return SDValue();
1879
1880 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1881 if (RHSSignBits < 9)
1882 return SDValue();
1883
1884 unsigned BitSize = VT.getSizeInBits();
1885 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1886 unsigned DivBits = BitSize - SignBits;
1887 if (Sign)
1888 ++DivBits;
1889
1890 ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1891 ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1892
1893 SDValue jq = DAG.getConstant(1, DL, IntVT);
1894
1895 if (Sign) {
1896 // char|short jq = ia ^ ib;
1897 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1898
1899 // jq = jq >> (bitsize - 2)
1900 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1901 DAG.getConstant(BitSize - 2, DL, VT));
1902
1903 // jq = jq | 0x1
1904 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1905 }
1906
1907 // int ia = (int)LHS;
1908 SDValue ia = LHS;
1909
1910 // int ib = (int)RHS;
1911 SDValue ib = RHS;
1912
1913 // float fa = (float)ia;
1914 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1915
1916 // float fb = (float)ib;
1917 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1918
1919 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1920 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1921
1922 // fq = trunc(fq);
1923 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1924
1925 // float fqneg = -fq;
1926 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1927
1928 MachineFunction &MF = DAG.getMachineFunction();
1929
1930 bool UseFmadFtz = false;
1931 if (Subtarget->isGCN()) {
1932 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1933 UseFmadFtz =
1934 MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
1935 }
1936
1937 // float fr = mad(fqneg, fb, fa);
1938 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
1939 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
1940 : (unsigned)ISD::FMAD;
1941 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1942
1943 // int iq = (int)fq;
1944 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1945
1946 // fr = fabs(fr);
1947 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1948
1949 // fb = fabs(fb);
1950 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1951
1952 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1953
1954 // int cv = fr >= fb;
1955 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1956
1957 // jq = (cv ? jq : 0);
1958 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1959
1960 // dst = iq + jq;
1961 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1962
1963 // Rem needs compensation; it's easier to recompute it.
1964 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1965 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1966
1967 // Truncate to number of bits this divide really is.
1968 if (Sign) {
1969 SDValue InRegSize
1970 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1971 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1972 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1973 } else {
1974 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1975 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1976 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1977 }
1978
1979 return DAG.getMergeValues({ Div, Rem }, DL);
1980}
1981
1982 void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1983 SelectionDAG &DAG,
1984 SmallVectorImpl<SDValue> &Results) const {
1985 SDLoc DL(Op);
1986 EVT VT = Op.getValueType();
1987
1988 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
1989
1990 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1991
1992 SDValue One = DAG.getConstant(1, DL, HalfVT);
1993 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
1994
1995 // HiLo split
1996 SDValue LHS_Lo, LHS_Hi;
1997 SDValue LHS = Op.getOperand(0);
1998 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
1999
2000 SDValue RHS_Lo, RHS_Hi;
2001 SDValue RHS = Op.getOperand(1);
2002 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2003
2004 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2005 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
2006
2007 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2008 LHS_Lo, RHS_Lo);
2009
2010 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2011 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2012
2013 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2014 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2015 return;
2016 }
2017
2018 if (isTypeLegal(MVT::i64)) {
2019 // The algorithm here is based on ideas from "Software Integer Division",
2020 // Tom Rodeheffer, August 2008.
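// A rough scalar outline of the lowering below (illustrative only; mulhi64
// stands in for a 64x64->high-64 multiply): the reciprocal is kept as a 0.64
// fixed-point value, refined with two Newton-Raphson steps, and the quotient
// estimate is then corrected at most twice.
//
//   u64 r = (u64)(2^64 * approx(1.0f / d));  // f32-based seed, built below
//   r += mulhi64(r, (0 - d) * r);            // first refinement step
//   r += mulhi64(r, (0 - d) * r);            // second refinement step
//   u64 q = mulhi64(n, r);                   // quotient estimate
//   u64 rem = n - q * d;
//   if (rem >= d) { ++q; rem -= d; }         // at most two corrections
//   if (rem >= d) { ++q; rem -= d; }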
2021
2022 MachineFunction &MF = DAG.getMachineFunction();
2023 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2024
2025 // Compute denominator reciprocal.
2026 unsigned FMAD =
2027 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2028 : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
2029 ? (unsigned)ISD::FMAD
2030 : (unsigned)AMDGPUISD::FMAD_FTZ;
2031
2032 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2033 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2034 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2035 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2036 Cvt_Lo);
2037 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2038 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2039 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2040 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2041 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2042 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2043 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2044 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2045 Mul1);
2046 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2047 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2048 SDValue Rcp64 = DAG.getBitcast(VT,
2049 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
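// The f32 bit patterns above appear to be plain powers of two (a reading of
// the constants, not an upstream comment): 0x4f800000 = 2^32 combines the two
// divisor halves into one float, 0x5f7ffffc is just under 2^64 and scales the
// reciprocal estimate to 64-bit fixed point, and 0x2f800000 = 2^-32 together
// with 0xcf800000 = -2^32 split that estimate back into Rcp_Hi and Rcp_Lo.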
2050
2051 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2052 SDValue One64 = DAG.getConstant(1, DL, VT);
2053 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2054 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2055
2056 // First round of UNR (Unsigned integer Newton-Raphson).
2057 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2058 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2059 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2060 SDValue Mulhi1_Lo, Mulhi1_Hi;
2061 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2062 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2063 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2064 Mulhi1_Lo, Zero1);
2065 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2066 Mulhi1_Hi, Add1_Lo.getValue(1));
2067 SDValue Add1 = DAG.getBitcast(VT,
2068 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2069
2070 // Second round of UNR.
2071 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2072 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2073 SDValue Mulhi2_Lo, Mulhi2_Hi;
2074 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2075 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2076 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2077 Mulhi2_Lo, Zero1);
2078 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2079 Mulhi2_Hi, Add2_Lo.getValue(1));
2080 SDValue Add2 = DAG.getBitcast(VT,
2081 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2082
2083 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2084
2085 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2086
2087 SDValue Mul3_Lo, Mul3_Hi;
2088 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2089 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2090 Mul3_Lo, Zero1);
2091 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2092 Mul3_Hi, Sub1_Lo.getValue(1));
2093 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2094 SDValue Sub1 = DAG.getBitcast(VT,
2095 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2096
2097 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2098 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2099 ISD::SETUGE);
2100 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2101 ISD::SETUGE);
2102 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2103
2104 // TODO: Here and below portions of the code can be enclosed into if/endif.
2105 // Currently control flow is unconditional and we have 4 selects after
2106 // potential endif to substitute PHIs.
2107
2108 // if C3 != 0 ...
2109 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2110 RHS_Lo, Zero1);
2111 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2112 RHS_Hi, Sub1_Lo.getValue(1));
2113 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2114 Zero, Sub2_Lo.getValue(1));
2115 SDValue Sub2 = DAG.getBitcast(VT,
2116 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2117
2118 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2119
2120 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2121 ISD::SETUGE);
2122 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2123 ISD::SETUGE);
2124 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2125
2126 // if (C6 != 0)
2127 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2128
2129 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2130 RHS_Lo, Zero1);
2131 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2132 RHS_Hi, Sub2_Lo.getValue(1));
2133 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2134 Zero, Sub3_Lo.getValue(1));
2135 SDValue Sub3 = DAG.getBitcast(VT,
2136 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2137
2138 // endif C6
2139 // endif C3
2140
2141 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2142 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2143
2144 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2145 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2146
2147 Results.push_back(Div);
2148 Results.push_back(Rem);
2149
2150 return;
2151 }
2152
2153 // r600 expansion.
2154 // Get Speculative values
2155 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2156 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2157
2158 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2159 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2160 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2161
2162 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2163 SDValue DIV_Lo = Zero;
2164
2165 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2166
2167 for (unsigned i = 0; i < halfBitWidth; ++i) {
2168 const unsigned bitPos = halfBitWidth - i - 1;
2169 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2170 // Get value of high bit
2171 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2172 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2173 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2174
2175 // Shift
2176 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2177 // Add LHS high bit
2178 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2179
2180 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2181 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2182
2183 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2184
2185 // Update REM
2186 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2187 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2188 }
2189
2190 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2191 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2192 Results.push_back(DIV);
2193 Results.push_back(REM);
2194}
2195
2196 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2197 SelectionDAG &DAG) const {
2198 SDLoc DL(Op);
2199 EVT VT = Op.getValueType();
2200
2201 if (VT == MVT::i64) {
2202 SmallVector<SDValue, 2> Results;
2203 LowerUDIVREM64(Op, DAG, Results);
2204 return DAG.getMergeValues(Results, DL);
2205 }
2206
2207 if (VT == MVT::i32) {
2208 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2209 return Res;
2210 }
2211
2212 SDValue X = Op.getOperand(0);
2213 SDValue Y = Op.getOperand(1);
2214
2215 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2216 // algorithm used here.
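// A rough scalar rendering of that algorithm (illustrative; urecip/mulhi
// stand in for the URECIP and MULHU nodes used below):
//
//   u32 z = urecip(y);             // hardware estimate of 2^32 / y
//   z += mulhi(z, z * (0u - y));   // one Newton-Raphson refinement
//   u32 q = mulhi(x, z);           // quotient estimate
//   u32 r = x - q * y;
//   if (r >= y) { ++q; r -= y; }   // two correction rounds
//   if (r >= y) { ++q; r -= y; }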
2217
2218 // Initial estimate of inv(y).
2219 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2220
2221 // One round of UNR.
2222 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2223 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2224 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2225 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2226
2227 // Quotient/remainder estimate.
2228 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2229 SDValue R =
2230 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2231
2232 // First quotient/remainder refinement.
2233 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2234 SDValue One = DAG.getConstant(1, DL, VT);
2235 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2236 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2237 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2238 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2239 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2240
2241 // Second quotient/remainder refinement.
2242 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2243 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2244 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2245 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2246 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2247
2248 return DAG.getMergeValues({Q, R}, DL);
2249}
2250
2251 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2252 SelectionDAG &DAG) const {
2253 SDLoc DL(Op);
2254 EVT VT = Op.getValueType();
2255
2256 SDValue LHS = Op.getOperand(0);
2257 SDValue RHS = Op.getOperand(1);
2258
2259 SDValue Zero = DAG.getConstant(0, DL, VT);
2260 SDValue NegOne = DAG.getConstant(-1, DL, VT);
2261
2262 if (VT == MVT::i32) {
2263 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2264 return Res;
2265 }
2266
2267 if (VT == MVT::i64 &&
2268 DAG.ComputeNumSignBits(LHS) > 32 &&
2269 DAG.ComputeNumSignBits(RHS) > 32) {
2270 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2271
2272 // HiLo split
2273 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2274 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2275 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2276 LHS_Lo, RHS_Lo);
2277 SDValue Res[2] = {
2278 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2279 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2280 };
2281 return DAG.getMergeValues(Res, DL);
2282 }
2283
2284 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2285 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2286 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2287 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2288
2289 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2290 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2291
2292 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2293 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2294
2295 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2296 SDValue Rem = Div.getValue(1);
2297
2298 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2299 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2300
2301 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2302 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2303
2304 SDValue Res[2] = {
2305 Div,
2306 Rem
2307 };
2308 return DAG.getMergeValues(Res, DL);
2309}
2310
2311// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
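// Worked example (illustrative): frem(5.5, 2.0)
//   fdiv(5.5, 2.0)      = 2.75
//   ftrunc(2.75)        = 2.0
//   fma(-2.0, 2.0, 5.5) = 1.5, the expected remainder.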
2312 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2313 SDLoc SL(Op);
2314 EVT VT = Op.getValueType();
2315 auto Flags = Op->getFlags();
2316 SDValue X = Op.getOperand(0);
2317 SDValue Y = Op.getOperand(1);
2318
2319 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2320 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2321 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2322 // TODO: For f32 use FMAD instead if !hasFastFMA32?
2323 return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2324}
2325
2326 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2327 SDLoc SL(Op);
2328 SDValue Src = Op.getOperand(0);
2329
2330 // result = trunc(src)
2331 // if (src > 0.0 && src != result)
2332 // result += 1.0
2333
2334 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2335
2336 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2337 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2338
2339 EVT SetCCVT =
2340 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2341
2342 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2343 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2344 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2345
2346 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2347 // TODO: Should this propagate fast-math-flags?
2348 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2349}
2350
2351 static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2352 SelectionDAG &DAG) {
2353 const unsigned FractBits = 52;
2354 const unsigned ExpBits = 11;
2355
2356 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2357 Hi,
2358 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2359 DAG.getConstant(ExpBits, SL, MVT::i32));
2360 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2361 DAG.getConstant(1023, SL, MVT::i32));
2362
2363 return Exp;
2364}
2365
2366 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2367 SDLoc SL(Op);
2368 SDValue Src = Op.getOperand(0);
2369
2370 assert(Op.getValueType() == MVT::f64);
2371
2372 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2373
2374 // Extract the upper half, since this is where we will find the sign and
2375 // exponent.
2376 SDValue Hi = getHiHalf64(Src, DAG);
2377
2378 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2379
2380 const unsigned FractBits = 52;
2381
2382 // Extract the sign bit.
2383 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2384 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2385
2386 // Extend back to 64-bits.
2387 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2388 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2389
2390 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2391 const SDValue FractMask
2392 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2393
2394 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2395 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2396 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2397
2398 EVT SetCCVT =
2399 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2400
2401 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2402
2403 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2404 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2405
2406 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2407 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2408
2409 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2410}
2411
2412 SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2413 SelectionDAG &DAG) const {
2414 SDLoc SL(Op);
2415 SDValue Src = Op.getOperand(0);
2416
2417 assert(Op.getValueType() == MVT::f64);
2418
2419 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2420 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2421 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2422
2423 // TODO: Should this propagate fast-math-flags?
2424
2425 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2426 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2427
2428 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2429
2430 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2431 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2432
2433 EVT SetCCVT =
2434 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2435 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2436
2437 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2438}
2439
2440 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
2441 SelectionDAG &DAG) const {
2442 // FNEARBYINT and FRINT are the same, except in their handling of FP
2443 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2444 // rint, so just treat them as equivalent.
2445 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2446 Op.getOperand(0));
2447}
2448
2449 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2450 auto VT = Op.getValueType();
2451 auto Arg = Op.getOperand(0u);
2452 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2453}
2454
2455// XXX - May require not supporting f32 denormals?
2456
2457// Don't handle v2f16. The extra instructions to scalarize and repack around the
2458// compare and vselect end up producing worse code than scalarizing the whole
2459// operation.
2460 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2461 SDLoc SL(Op);
2462 SDValue X = Op.getOperand(0);
2463 EVT VT = Op.getValueType();
2464
2465 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2466
2467 // TODO: Should this propagate fast-math-flags?
2468
2469 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2470
2471 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2472
2473 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2474 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2475
2476 EVT SetCCVT =
2477 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2478
2479 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2480 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2481 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2482
2483 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2484 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2485}
2486
2487 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2488 SDLoc SL(Op);
2489 SDValue Src = Op.getOperand(0);
2490
2491 // result = trunc(src);
2492 // if (src < 0.0 && src != result)
2493 // result += -1.0.
2494
2495 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2496
2497 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2498 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2499
2500 EVT SetCCVT =
2501 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2502
2503 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2504 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2505 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2506
2507 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2508 // TODO: Should this propagate fast-math-flags?
2509 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2510}
2511
2512/// Return true if it's known that \p Src can never be an f32 denormal value.
2513 static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2514 switch (Src.getOpcode()) {
2515 case ISD::FP_EXTEND:
2516 return Src.getOperand(0).getValueType() == MVT::f16;
2517 case ISD::FP16_TO_FP:
2518 case ISD::FFREXP:
2519 return true;
2520 case ISD::INTRINSIC_WO_CHAIN: {
2521 unsigned IntrinsicID = Src.getConstantOperandVal(0);
2522 switch (IntrinsicID) {
2523 case Intrinsic::amdgcn_frexp_mant:
2524 return true;
2525 default:
2526 return false;
2527 }
2528 }
2529 default:
2530 return false;
2531 }
2532
2533 llvm_unreachable("covered opcode switch");
2534}
2535
2536 static bool allowApproxFunc(const SelectionDAG &DAG,
2537 SDNodeFlags Flags) {
2538 if (Flags.hasApproximateFuncs())
2539 return true;
2540 auto &Options = DAG.getTarget().Options;
2541 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
2542}
2543
2544 static bool needsDenormHandlingF32(const SelectionDAG &DAG,
2545 SDValue Src,
2546 SDNodeFlags Flags) {
2547 return !valueIsKnownNeverF32Denorm(Src) &&
2548 DAG.getMachineFunction()
2549 .getDenormalMode(APFloat::IEEEsingle())
2550 .Input != DenormalMode::PreserveSign;
2551 }
2552
2553 SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2554 SDValue Src,
2555 SDNodeFlags Flags) const {
2556 SDLoc SL(Src);
2557 EVT VT = Src.getValueType();
2558 const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
2559 SDValue SmallestNormal =
2560 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2561
2562 // Want to scale denormals up, but negatives and 0 work just as well on the
2563 // scaled path.
2564 SDValue IsLtSmallestNormal = DAG.getSetCC(
2565 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2566 SmallestNormal, ISD::SETOLT);
2567
2568 return IsLtSmallestNormal;
2569}
2570
2571 SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
2572 SDNodeFlags Flags) const {
2573 SDLoc SL(Src);
2574 EVT VT = Src.getValueType();
2575 const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
2576 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2577
2578 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2579 SDValue IsFinite = DAG.getSetCC(
2580 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2581 Inf, ISD::SETOLT);
2582 return IsFinite;
2583}
2584
2585/// If denormal handling is required return the scaled input to FLOG2, and the
2586/// check for denormal range. Otherwise, return null values.
2587std::pair<SDValue, SDValue>
2589 SDValue Src, SDNodeFlags Flags) const {
2590 if (!needsDenormHandlingF32(DAG, Src, Flags))
2591 return {};
2592
2593 MVT VT = MVT::f32;
2594 const fltSemantics &Semantics = APFloat::IEEEsingle();
2595 SDValue SmallestNormal =
2596 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2597
2598 SDValue IsLtSmallestNormal = DAG.getSetCC(
2599 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2600 SmallestNormal, ISD::SETOLT);
2601
2602 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2603 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2604 SDValue ScaleFactor =
2605 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2606
2607 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2608 return {ScaledInput, IsLtSmallestNormal};
2609}
2610
2611 SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
2612 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2613 // If we have to handle denormals, scale up the input and adjust the result.
2614
2615 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2616 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
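//
// This works because log2(x * 2^32) == log2(x) + 32, so subtracting the 32.0
// offset on the scaled path recovers log2 of the original denormal input.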
2617
2618 SDLoc SL(Op);
2619 EVT VT = Op.getValueType();
2620 SDValue Src = Op.getOperand(0);
2621 SDNodeFlags Flags = Op->getFlags();
2622
2623 if (VT == MVT::f16) {
2624 // Nothing in half is a denormal when promoted to f32.
2625 assert(!Subtarget->has16BitInsts());
2626 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2627 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2628 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2629 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2630 }
2631
2632 auto [ScaledInput, IsLtSmallestNormal] =
2633 getScaledLogInput(DAG, SL, Src, Flags);
2634 if (!ScaledInput)
2635 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2636
2637 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2638
2639 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2640 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2641 SDValue ResultOffset =
2642 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2643 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2644}
2645
2646static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2647 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2648 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2649 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2650}
2651
2653 SelectionDAG &DAG) const {
2654 SDValue X = Op.getOperand(0);
2655 EVT VT = Op.getValueType();
2656 SDNodeFlags Flags = Op->getFlags();
2657 SDLoc DL(Op);
2658
2659 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2660 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2661
2662 const auto &Options = getTargetMachine().Options;
2663 if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
2664 Options.ApproxFuncFPMath || Options.UnsafeFPMath) {
2665
2666 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2667 // Log and multiply in f32 is good enough for f16.
2668 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2669 }
2670
2671 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2672 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2673 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2674 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2675 }
2676
2677 return Lowered;
2678 }
2679
2680 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
2681 if (ScaledInput)
2682 X = ScaledInput;
2683
2684 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2685
2686 SDValue R;
2687 if (Subtarget->hasFastFMAF32()) {
2688 // c+cc are ln(2)/ln(10) to more than 49 bits
2689 const float c_log10 = 0x1.344134p-2f;
2690 const float cc_log10 = 0x1.09f79ep-26f;
2691
2692 // c + cc is ln(2) to more than 49 bits
2693 const float c_log = 0x1.62e42ep-1f;
2694 const float cc_log = 0x1.efa39ep-25f;
2695
2696 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2697 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2698
2699 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2700 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2701 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2702 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2703 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
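// A reading of the sequence above (not an upstream comment): R is the rounded
// product Y*C, FMA0 = fma(Y, C, -R) recovers that product's rounding error
// exactly, and FMA1 folds in the low-order constant CC, so R + FMA1
// approximates Y*ln(2) (or Y*ln(2)/ln(10)) to well beyond single precision.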
2704 } else {
2705 // ch+ct is ln(2)/ln(10) to more than 36 bits
2706 const float ch_log10 = 0x1.344000p-2f;
2707 const float ct_log10 = 0x1.3509f6p-18f;
2708
2709 // ch + ct is ln(2) to more than 36 bits
2710 const float ch_log = 0x1.62e000p-1f;
2711 const float ct_log = 0x1.0bfbe8p-15f;
2712
2713 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2714 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2715
2716 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2717 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2718 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2719 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2720 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2721
2722 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2723 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2724 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
2725 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2726 }
2727
2728 const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
2729 (Flags.hasNoInfs() || Options.NoInfsFPMath);
2730
2731 // TODO: Check if known finite from source value.
2732 if (!IsFiniteOnly) {
2733 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2734 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2735 }
2736
2737 if (IsScaled) {
2738 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2739 SDValue ShiftK =
2740 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2741 SDValue Shift =
2742 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2743 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2744 }
2745
2746 return R;
2747}
2748
2750 return LowerFLOGCommon(Op, DAG);
2751}
2752
2753// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a
2754 // promoted f16 operation.
2755 SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
2756 SelectionDAG &DAG, bool IsLog10,
2757 SDNodeFlags Flags) const {
2758 EVT VT = Src.getValueType();
2759 unsigned LogOp =
2760 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2761
2762 double Log2BaseInverted =
2763 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
2764
2765 if (VT == MVT::f32) {
2766 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2767 if (ScaledInput) {
2768 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2769 SDValue ScaledResultOffset =
2770 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2771
2772 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2773
2774 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2775 ScaledResultOffset, Zero, Flags);
2776
2777 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2778
2779 if (Subtarget->hasFastFMAF32())
2780 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2781 Flags);
2782 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
2783 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2784 }
2785 }
2786
2787 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2788 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2789
2790 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2791 Flags);
2792}
2793
2794 SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
2795 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2796 // If we have to handle denormals, scale up the input and adjust the result.
2797
2798 SDLoc SL(Op);
2799 EVT VT = Op.getValueType();
2800 SDValue Src = Op.getOperand(0);
2801 SDNodeFlags Flags = Op->getFlags();
2802
2803 if (VT == MVT::f16) {
2804 // Nothing in half is a denormal when promoted to f32.
2805 assert(!Subtarget->has16BitInsts());
2806 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2807 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
2808 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2809 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2810 }
2811
2812 assert(VT == MVT::f32);
2813
2814 if (!needsDenormHandlingF32(DAG, Src, Flags))
2815 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
2816
2817 // bool needs_scaling = x < -0x1.f80000p+6f;
2818 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
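// The rescaling relies on exp2(x + 64) * 2^-64 == exp2(x): adding 64 moves a
// result that would otherwise be denormal out of the flushed range before the
// hardware exp, and the final multiply by 0x1.0p-64f undoes the shift.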
2819
2820 // -nextafter(128.0, -1)
2821 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
2822
2823 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2824
2825 SDValue NeedsScaling =
2826 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
2827
2828 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2829 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2830
2831 SDValue AddOffset =
2832 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
2833
2834 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
2835 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
2836
2837 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
2838 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2839 SDValue ResultScale =
2840 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
2841
2842 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
2843}
2844
2845 SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
2846 SelectionDAG &DAG,
2847 SDNodeFlags Flags) const {
2848 EVT VT = X.getValueType();
2849 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
2850
2851 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2852 // exp2(M_LOG2E_F * f);
2853 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags);
2854 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2855 : (unsigned)ISD::FEXP2,
2856 SL, VT, Mul, Flags);
2857 }
2858
2859 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2860
2861 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
2862 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2863
2864 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2865
2866 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2867
2868 SDValue AdjustedX =
2869 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2870
2871 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
2872
2873 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
2874
2875 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
2876 SDValue AdjustedResult =
2877 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
2878
2879 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
2880 Flags);
2881}
2882
2883/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
2884/// handled correctly.
2885 SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
2886 SelectionDAG &DAG,
2887 SDNodeFlags Flags) const {
2888 const EVT VT = X.getValueType();
2889 const unsigned Exp2Op = VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2;
2890
2891 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2892 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
2893 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
2894 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
2895
2896 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
2897 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
2898 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
2899 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
2900 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
2901 }
2902
2903 // bool s = x < -0x1.2f7030p+5f;
2904 // x += s ? 0x1.0p+5f : 0.0f;
2905 // exp10 = exp2(x * 0x1.a92000p+1f) *
2906 // exp2(x * 0x1.4f0978p-11f) *
2907 // (s ? 0x1.9f623ep-107f : 1.0f);
2908
2909 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2910
2911 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
2912 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2913
2914 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
2915 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2916 SDValue AdjustedX =
2917 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2918
2919 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
2920 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
2921
2922 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
2923 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
2924 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
2925 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
2926
2927 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
2928
2929 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
2930 SDValue AdjustedResult =
2931 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
2932
2933 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
2934 Flags);
2935}
2936
2937 SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2938 EVT VT = Op.getValueType();
2939 SDLoc SL(Op);
2940 SDValue X = Op.getOperand(0);
2941 SDNodeFlags Flags = Op->getFlags();
2942 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
2943
2944 if (VT.getScalarType() == MVT::f16) {
2945 // v_exp_f16 (fmul x, log2e)
2946 if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
2947 return lowerFEXPUnsafe(X, SL, DAG, Flags);
2948
2949 if (VT.isVector())
2950 return SDValue();
2951
2952 // exp(f16 x) ->
2953 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
2954
2955 // Nothing in half is a denormal when promoted to f32.
2956 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
2957 SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags);
2958 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
2959 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2960 }
2961
2962 assert(VT == MVT::f32);
2963
2964 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
2965 // library behavior. Also, is known-not-daz source sufficient?
2966 if (allowApproxFunc(DAG, Flags)) {
2967 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
2968 : lowerFEXPUnsafe(X, SL, DAG, Flags);
2969 }
2970
2971 // Algorithm:
2972 //
2973 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
2974 //
2975 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
2976 // n = 64*m + j, 0 <= j < 64
2977 //
2978 // e^x = 2^((64*m + j + f)/64)
2979 // = (2^m) * (2^(j/64)) * 2^(f/64)
2980 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
2981 //
2982 // f = x*(64/ln(2)) - n
2983 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
2984 //
2985 // e^x = (2^m) * (2^(j/64)) * e^r
2986 //
2987 // (2^(j/64)) is precomputed
2988 //
2989 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
2990 // e^r = 1 + q
2991 //
2992 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
2993 //
2994 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
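// In the DAG form below the reduction is done with ldexp rather than a table
// (a reading of the code, not an upstream comment): PH + PL ~= x*log2(e)
// split into high and low parts, E = roundeven(PH), and the result is
// ldexp(exp2(PH - E + PL), (int)E), followed by explicit underflow and
// overflow clamping on x.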
2995 SDNodeFlags FlagsNoContract = Flags;
2996 FlagsNoContract.setAllowContract(false);
2997
2998 SDValue PH, PL;
2999 if (Subtarget->hasFastFMAF32()) {
3000 const float c_exp = numbers::log2ef;
3001 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3002 const float c_exp10 = 0x1.a934f0p+1f;
3003 const float cc_exp10 = 0x1.2f346ep-24f;
3004
3005 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3006 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3007
3008 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3009 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
3010 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3011 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3012 } else {
3013 const float ch_exp = 0x1.714000p+0f;
3014 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3015
3016 const float ch_exp10 = 0x1.a92000p+1f;
3017 const float cl_exp10 = 0x1.4f0978p-11f;
3018
3019 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3020 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3021
3022 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3023 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3024 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3025 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3026 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3027
3028 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3029
3030 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3031 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3032 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3033 }
3034
3035 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3036
3037 // It is unsafe to contract this fsub into the PH multiply.
3038 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3039
3040 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3041 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3042 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3043
3044 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3045
3046 SDValue UnderflowCheckConst =
3047 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
3048
3049 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3050 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3051 SDValue Underflow =
3052 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3053
3054 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3055 const auto &Options = getTargetMachine().Options;
3056
3057 if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
3058 SDValue OverflowCheckConst =
3059 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
3060 SDValue Overflow =
3061 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
3062 SDValue Inf =
3063 DAG.getConstantFP(APFloat::getInf(APFloat::IEEEsingle()), SL, VT);
3064 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3065 }
3066
3067 return R;
3068}
3069
3070static bool isCtlzOpc(unsigned Opc) {
3071 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3072}
3073
3074static bool isCttzOpc(unsigned Opc) {
3075 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3076}
3077
3078 SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
3079 SelectionDAG &DAG) const {
3080 auto SL = SDLoc(Op);
3081 auto Arg = Op.getOperand(0u);
3082 auto ResultVT = Op.getValueType();
3083
3084 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3085 return {};
3086
3087 assert(isCtlzOpc(Op.getOpcode()));
3088 assert(ResultVT == Arg.getValueType());
3089
3090 auto const LeadingZeroes = 32u - ResultVT.getFixedSizeInBits();
3091 auto NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3092 auto ShiftVal = DAG.getConstant(LeadingZeroes, SL, MVT::i32);
3093 NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, ShiftVal);
3094 NewOp = DAG.getNode(Op.getOpcode(), SL, MVT::i32, NewOp);
3095 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3096}
3097
3098 SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
3099 SDLoc SL(Op);
3100 SDValue Src = Op.getOperand(0);
3101
3102 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3103 bool Ctlz = isCtlzOpc(Op.getOpcode());
3104 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3105
3106 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3107 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3108 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3109
3110 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3111 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3112 // (cttz hi:lo) -> (umin (ffbl src), 32)
3113 // (ctlz_zero_undef src) -> (ffbh src)
3114 // (cttz_zero_undef src) -> (ffbl src)
3115
3116 // The 64-bit scalar version produces a 32-bit result:
3117 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3118 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3119 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3120 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3121 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3122 if (!ZeroUndef) {
3123 const SDValue ConstVal = DAG.getConstant(
3124 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3125 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3126 }
3127 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3128 }
3129
3130 SDValue Lo, Hi;
3131 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3132
3133 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3134 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3135
3136 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3137 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3138 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3139 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
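// Worked example (illustrative, assuming ffbh returns all-ones for a zero
// input): ctlz of hi:lo = 0x00000000:0x00F00000
//   ffbh(hi) = 0xffffffff, uaddsat(ffbh(lo), 32) = 8 + 32 = 40,
//   umin(0xffffffff, 40) = 40, umin(40, 64) = 40 leading zeros, as expected.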
3140
3141 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3142 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3143 if (Ctlz)
3144 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3145 else
3146 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3147
3148 SDValue NewOpr;
3149 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3150 if (!ZeroUndef) {
3151 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3152 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3153 }
3154
3155 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3156}
3157
3158 SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
3159 bool Signed) const {
3160 // The regular method of converting a 64-bit integer to float roughly consists of
3161 // 2 steps: normalization and rounding. In fact, after normalization, the
3162 // conversion from a 64-bit integer to a float is essentially the same as the
3163 // one from a 32-bit integer. The only difference is that it has more
3164 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3165 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3166 // converted into the correct float number. The basic steps for the unsigned
3167 // conversion are illustrated in the following pseudo code:
3168 //
3169 // f32 uitofp(i64 u) {
3170 // i32 hi, lo = split(u);
3171 // // Only count the leading zeros in hi as we have native support of the
3172 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3173 // // reduced to a 32-bit one automatically.
3174 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3175 // u <<= shamt;
3176 // hi, lo = split(u);
3177 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3178 // // convert it as a 32-bit integer and scale the result back.
3179 // return uitofp(hi) * 2^(32 - shamt);
3180 // }
3181 //
3182 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3183 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
3184 // converted instead, followed by negation based on its sign bit.
3185
3186 SDLoc SL(Op);
3187 SDValue Src = Op.getOperand(0);
3188
3189 SDValue Lo, Hi;
3190 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3191 SDValue Sign;
3192 SDValue ShAmt;
3193 if (Signed && Subtarget->isGCN()) {
3194 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3195 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3196 // account. That is, the maximal shift is
3197 // - 32 if Lo and Hi have opposite signs;
3198 // - 33 if Lo and Hi have the same sign.
3199 //
3200 // Or, MaxShAmt = 33 + OppositeSign, where
3201 //
3202 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3203 // - -1 if Lo and Hi have opposite signs; and
3204 // - 0 otherwise.
3205 //
3206 // All in all, ShAmt is calculated as
3207 //
3208 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3209 //
3210 // or
3211 //
3212 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3213 //
3214 // to reduce the critical path.
3215 SDValue OppositeSign = DAG.getNode(
3216 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3217 DAG.getConstant(31, SL, MVT::i32));
3218 SDValue MaxShAmt =
3219 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3220 OppositeSign);
3221 // Count the leading sign bits.
3222 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3223 // Different from unsigned conversion, the shift should be one bit less to
3224 // preserve the sign bit.
3225 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3226 DAG.getConstant(1, SL, MVT::i32));
3227 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3228 } else {
3229 if (Signed) {
3230 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3231 // absolute value first.
3232 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3233 DAG.getConstant(63, SL, MVT::i64));
3234 SDValue Abs =
3235 DAG.getNode(ISD::XOR, SL, MVT::i64,
3236 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3237 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3238 }
3239 // Count the leading zeros.
3240 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3241 // The shift amount for signed integers is [0, 32].
3242 }
3243 // Normalize the given 64-bit integer.
3244 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3245 // Split it again.
3246 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3247 // Calculate the adjust bit for rounding.
3248 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3249 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3250 DAG.getConstant(1, SL, MVT::i32), Lo);
3251 // Get the 32-bit normalized integer.
3252 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3253 // Convert the normalized 32-bit integer into f32.
3254 unsigned Opc =
3255 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3256 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3257
3258 // Finally, need to scale back the converted floating number as the original
3259 // 64-bit integer is converted as a 32-bit one.
3260 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3261 ShAmt);
3262 // On GCN, use LDEXP directly.
3263 if (Subtarget->isGCN())
3264 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3265
3266 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3267 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3268 // exponent is enough to avoid overflowing into the sign bit.
3269 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3270 DAG.getConstant(23, SL, MVT::i32));
3271 SDValue IVal =
3272 DAG.getNode(ISD::ADD, SL, MVT::i32,
3273 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3274 if (Signed) {
3275 // Set the sign bit.
3276 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3277 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3278 DAG.getConstant(31, SL, MVT::i32));
3279 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3280 }
3281 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3282}
3283
3284 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3285 bool Signed) const {
3286 SDLoc SL(Op);
3287 SDValue Src = Op.getOperand(0);
3288
3289 SDValue Lo, Hi;
3290 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3291
3291
3292 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3293 SL, MVT::f64, Hi);
3294
3295 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3296
3297 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3298 DAG.getConstant(32, SL, MVT::i32));
3299 // TODO: Should this propagate fast-math-flags?
3300 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3301}
3302
3303 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3304 SelectionDAG &DAG) const {
3305 // TODO: Factor out code common with LowerSINT_TO_FP.
3306 EVT DestVT = Op.getValueType();
3307 SDValue Src = Op.getOperand(0);
3308 EVT SrcVT = Src.getValueType();
3309
3310 if (SrcVT == MVT::i16) {
3311 if (DestVT == MVT::f16)
3312 return Op;
3313 SDLoc DL(Op);
3314
3315 // Promote src to i32
3316 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3317 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3318 }
3319
3320 if (DestVT == MVT::bf16) {
3321 SDLoc SL(Op);
3322 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3323 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3324 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3325 }
3326
3327 if (SrcVT != MVT::i64)
3328 return Op;
3329
3330 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3331 SDLoc DL(Op);
3332
3333 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3334 SDValue FPRoundFlag =
3335 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3336 SDValue FPRound =
3337 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3338
3339 return FPRound;
3340 }
3341
3342 if (DestVT == MVT::f32)
3343 return LowerINT_TO_FP32(Op, DAG, false);
3344
3345 assert(DestVT == MVT::f64);
3346 return LowerINT_TO_FP64(Op, DAG, false);
3347}
3348
3349 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3350 SelectionDAG &DAG) const {
3351 EVT DestVT = Op.getValueType();
3352
3353 SDValue Src = Op.getOperand(0);
3354 EVT SrcVT = Src.getValueType();
3355
3356 if (SrcVT == MVT::i16) {
3357 if (DestVT == MVT::f16)
3358 return Op;
3359
3360 SDLoc DL(Op);
3361 // Promote src to i32
3362 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3363 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3364 }
3365
3366 if (DestVT == MVT::bf16) {
3367 SDLoc SL(Op);
3368 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3369 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3370 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3371 }
3372
3373 if (SrcVT != MVT::i64)
3374 return Op;
3375
3376 // TODO: Factor out code common with LowerUINT_TO_FP.
3377
3378 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3379 SDLoc DL(Op);
3380 SDValue Src = Op.getOperand(0);
3381
3382 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3383 SDValue FPRoundFlag =
3384 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3385 SDValue FPRound =
3386 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3387
3388 return FPRound;
3389 }
3390
3391 if (DestVT == MVT::f32)
3392 return LowerINT_TO_FP32(Op, DAG, true);
3393
3394 assert(DestVT == MVT::f64);
3395 return LowerINT_TO_FP64(Op, DAG, true);
3396}
3397
3398 SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3399 bool Signed) const {
3400 SDLoc SL(Op);
3401
3402 SDValue Src = Op.getOperand(0);
3403 EVT SrcVT = Src.getValueType();
3404
3405 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3406
3407 // The basic idea of converting a floating point number into a pair of 32-bit
3408 // integers is illustrated as follows:
3409 //
3410 // tf := trunc(val);
3411 // hif := floor(tf * 2^-32);
3412 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3413 // hi := fptoi(hif);
3414 // lo := fptoi(lof);
3415 //
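// Worked example (illustrative, f64 input): val = 2^33 + 6.0
//   tf  = 2^33 + 6.0 (already integral)
//   hif = floor(tf * 2^-32) = 2.0  -> hi = 2
//   lof = tf - 2.0 * 2^32   = 6.0  -> lo = 6
//   result = (hi << 32) | lo = 2^33 + 6.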
3416 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3417 SDValue Sign;
3418 if (Signed && SrcVT == MVT::f32) {
3419 // However, a 32-bit floating point number has only a 23-bit mantissa, which
3420 // is not enough to hold all the significant bits of `lof` if val is
3421 // negative. To avoid the loss of precision, we need to take the absolute
3422 // value after truncating and flip the result back based on the original
3423 // signedness.
3424 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3425 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3426 DAG.getConstant(31, SL, MVT::i32));
3427 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3428 }
3429
3430 SDValue K0, K1;
3431 if (SrcVT == MVT::f64) {
3432 K0 = DAG.getConstantFP(
3433 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3434 SrcVT);
3435 K1 = DAG.getConstantFP(
3436 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3437 SrcVT);
3438 } else {
3439 K0 = DAG.getConstantFP(
3440 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3441 K1 = DAG.getConstantFP(
3442 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3443 }
3444 // TODO: Should this propagate fast-math-flags?
3445 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3446
3447 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3448
3449 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3450
3451 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3452 : ISD::FP_TO_UINT,
3453 SL, MVT::i32, FloorMul);
3454 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3455
3456 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3457 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3458
3459 if (Signed && SrcVT == MVT::f32) {
3460 assert(Sign);
3461 // Flip the result based on the signedness, which is either all 0s or 1s.
3462 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3463 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3464 // r := xor(r, sign) - sign;
3465 Result =
3466 DAG.getNode(ISD::SUB, SL, MVT::i64,
3467 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3468 }
3469
3470 return Result;
3471}
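// A minimal scalar sketch of the unsigned path above, for illustration only;
// the helper name is hypothetical and not part of this lowering. It mirrors
// the trunc -> fmul -> ffloor -> fma -> fptoui DAG sequence.
#include <cmath>
#include <cstdint>
static uint64_t f64ToU64ViaHiLoPair(double Val) {
  double Tf = std::trunc(Val);              // tf  := trunc(val)
  double Hif = std::floor(Tf * 0x1p-32);    // hif := floor(tf * 2^-32)
  double Lof = std::fma(Hif, -0x1p32, Tf);  // lof := tf - hif * 2^32, always >= 0
  uint32_t Hi = static_cast<uint32_t>(Hif); // hi  := fptoui(hif)
  uint32_t Lo = static_cast<uint32_t>(Lof); // lo  := fptoui(lof)
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}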
3472
3473 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3474 SDLoc DL(Op);
3475 SDValue N0 = Op.getOperand(0);
3476
3477 // Convert to target node to get known bits
3478 if (N0.getValueType() == MVT::f32)
3479 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3480
3481 if (getTargetMachine().Options.UnsafeFPMath) {
3482 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3483 return SDValue();
3484 }
3485
3486 assert(N0.getSimpleValueType() == MVT::f64);
3487
3488 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3489 const unsigned ExpMask = 0x7ff;
3490 const unsigned ExpBiasf64 = 1023;
3491 const unsigned ExpBiasf16 = 15;
3492 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3493 SDValue One = DAG.getConstant(1, DL, MVT::i32);
3494 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
3495 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3496 DAG.getConstant(32, DL, MVT::i64));
3497 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3498 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
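// UH now holds the sign, the 11 exponent bits and the top 20 mantissa bits;
// U holds the low 32 mantissa bits.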
3499 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3500 DAG.getConstant(20, DL, MVT::i64));
3501 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3502 DAG.getConstant(ExpMask, DL, MVT::i32));
3503 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3504 // add the f16 bias (15) to get the biased exponent for the f16 format.
3505 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3506 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
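// e.g. for 1.0 the biased f64 exponent is 1023, so E becomes
// 1023 - 1023 + 15 = 15, the biased f16 exponent of 1.0.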
3507
3508 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3509 DAG.getConstant(8, DL, MVT::i32));
3510 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3511 DAG.getConstant(0xffe, DL, MVT::i32));
3512
3513 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3514 DAG.getConstant(0x1ff, DL, MVT::i32));
3515 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3516
3517 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3518 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
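// Lo40Set acts as a sticky bit: it is set if any of the low mantissa bits not
// kept in M were nonzero, and is ORed into M's LSB so rounding still sees them.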
3519
3520 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3521 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3522 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3523 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
3524
3525 // N = M | (E << 12);
3526 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3527 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3528 DAG.getConstant(12, DL, MVT::i32)));
3529
3530 // B = clamp(1-E, 0, 13);
3531 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3532 One, E);
3533 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3534 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3535 DAG.getConstant(13, DL, MVT::i32));
3536
3537 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3538 DAG.getConstant(0x1000, DL, MVT::i32));
3539
3540 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3541 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3542 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3543 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3544
3545 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3546 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3547 DAG.getConstant(0x7, DL, MVT::i32));
3548 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3549 DAG.getConstant(2, DL, MVT::i32));
3550 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3551 One, Zero, ISD::SETEQ);
3552 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3553 One, Zero, ISD::SETGT);
3554 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3555 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
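// The select/add sequence above rounds to nearest even: the two bits shifted
// out of V act as guard and sticky bits, and V is incremented when the guard
// bit is set and either the sticky bit is set or the kept LSB is odd.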
3556
3557 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3558 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3559 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3560 I, V, ISD::SETEQ);
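// E > 30 overflows the f16 exponent range, so clamp to infinity (0x7c00);
// E == 1039 means the f64 exponent field was all ones (Inf/NaN), so use the
// Inf/NaN pattern I computed above.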
3561
3562 // Extract the sign bit.
3563 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3564 DAG.getConstant(16, DL, MVT::i32));
3565 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3566 DAG.getConstant(0x8000, DL, MVT::i32));
3567
3568 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3569 return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
3570}
3571
3572 SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
3573 SelectionDAG &DAG) const {
3574 SDValue Src = Op.getOperand(0);
3575 unsigned OpOpcode = Op.getOpcode();
3576 EVT SrcVT = Src.getValueType();
3577 EVT DestVT = Op.getValueType();
3578
3579 // Will be selected natively
3580 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3581 return Op;
3582
3583 if (SrcVT == MVT::bf16) {
3584 SDLoc DL(Op);
3585 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3586 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
3587 }
3588
3589 // Promote i16 to i32
3590 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3591 SDLoc DL(Op);
3592
3593 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3594 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3595 }
3596
3597 if (DestVT != MVT::i64)
3598 return Op;
3599
3600 if (SrcVT == MVT::f16 ||
3601 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3602 SDLoc DL(Op);
3603
3604 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3605 unsigned Ext =
3606 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3607 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3608 }
3609
3610 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3611 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3612
3613 return SDValue();
3614}
3615
3616 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3617 SelectionDAG &DAG) const {
3618 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3619 MVT VT = Op.getSimpleValueType();
3620 MVT ScalarVT = VT.getScalarType();
3621
3622 assert(VT.isVector());
3623
3624 SDValue Src = Op.getOperand(0);
3625 SDLoc DL(Op);
3626
3627 // TODO: Don't scalarize on Evergreen?
3628 unsigned NElts = VT.getVectorNumElements();
3629 SmallVector<SDValue, 8> Args;
3630 DAG.ExtractVectorElements(Src, Args, 0, NElts);
3631
3632 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3633 for (unsigned I = 0; I < NElts; ++I)
3634 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
3635
3636 return DAG.getBuildVector(VT, DL, Args);
3637}
3638
3639//===----------------------------------------------------------------------===//
3640// Custom DAG optimizations
3641//===----------------------------------------------------------------------===//
3642
3643static bool isU24(SDValue Op, SelectionDAG &DAG) {
3644 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3645}
3646
3647static bool isI24(SDValue Op, SelectionDAG &DAG) {
3648 EVT VT = Op.getValueType();
3649 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3650 // as unsigned 24-bit values.
3652}
3653
3654 static SDValue simplifyMul24(SDNode *Node24,
3655 TargetLowering::DAGCombinerInfo &DCI) {
3656 SelectionDAG &DAG = DCI.DAG;
3657 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3658 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3659
3660 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
3661 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
3662 unsigned NewOpcode = Node24->getOpcode();
3663 if (IsIntrin) {
3664 unsigned IID = Node24->getConstantOperandVal(0);
3665 switch (IID) {
3666 case Intrinsic::amdgcn_mul_i24:
3667 NewOpcode = AMDGPUISD::MUL_I24;
3668 break;
3669 case Intrinsic::amdgcn_mul_u24:
3670 NewOpcode = AMDGPUISD::MUL_U24;
3671 break;
3672 case Intrinsic::amdgcn_mulhi_i24:
3673 NewOpcode = AMDGPUISD::MULHI_I24;
3674 break;
3675 case Intrinsic::amdgcn_mulhi_u24:
3676 NewOpcode = AMDGPUISD::MULHI_U24;
3677 break;
3678 default:
3679 llvm_unreachable("Expected 24-bit mul intrinsic");
3680 }
3681 }
3682
3683 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
3684
3685 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3686 // the operands to have other uses, but will only perform simplifications that
3687 // involve bypassing some nodes for this user.
3688 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
3689 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
3690 if (DemandedLHS || DemandedRHS)
3691 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
3692 DemandedLHS ? DemandedLHS : LHS,
3693 DemandedRHS ? DemandedRHS : RHS);
3694
3695 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3696 // operands if this node is the only user.
3697 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
3698 return SDValue(Node24, 0);
3699 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
3700 return SDValue(Node24, 0);
3701
3702 return SDValue();
3703}
3704
3705template <typename IntTy>
3706 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3707 uint32_t Width, const SDLoc &DL) {
3708 if (Width + Offset < 32) {
3709 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3710 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3711 return DAG.getConstant(Result, DL, MVT::i32);
3712 }
3713
3714 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3715}
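// e.g. constantFoldBFE<int32_t>(DAG, 0xff, /*Offset=*/4, /*Width=*/4, DL)
// extracts the nibble 0xf and sign-extends it to the i32 constant -1.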
3716
3717static bool hasVolatileUser(SDNode *Val) {
3718 for (SDNode *U : Val->uses()) {
3719 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
3720 if (M->isVolatile())
3721 return true;
3722 }
3723 }
3724
3725 return false;
3726}
3727
3728 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3729 // i32 vectors are the canonical memory type.
3730 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3731 return false;
3732
3733 if (!VT.isByteSized())
3734 return false;
3735
3736 unsigned Size = VT.getStoreSize();
3737
3738 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3739 return false;
3740
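// Reject 3-byte types and any size above a dword that is not a multiple of
// 4 bytes; those cannot be re-expressed with i32-based types.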
3741 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3742 return false;
3743
3744 return true;
3745}
3746
3747 // Replace a load of an illegal type with a load of a bitcast to a friendlier
3748 // type.
3749 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3750 DAGCombinerInfo &DCI) const {
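// For example, a load of v8i8 (an illegal type here) becomes a v2i32 load
// plus a bitcast back to v8i8.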
3751 if (!DCI.isBeforeLegalize())
3752 return SDValue();
3753
3754 LoadSDNode *LN = cast<LoadSDNode>(N);
3755 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
3756 return SDValue();
3757
3758 SDLoc SL(N);
3759 SelectionDAG &DAG = DCI.DAG;
3760 EVT VT = LN->getMemoryVT();
3761
3762 unsigned Size = VT.getStoreSize();
3763 Align Alignment = LN->getAlign();
3764 if (Alignment < Size && isTypeLegal(VT)) {
3765 unsigned IsFast;
3766 unsigned AS = LN->getAddressSpace();
3767
3768 // Expand unaligned loads earlier than legalization. Due to visitation order
3769 // problems during legalization, the emitted instructions to pack and unpack
3770 // the bytes again are not eliminated in the case of an unaligned copy.
3771 if (!allowsMisalignedMemoryAccesses(
3772 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3773 if (VT.isVector())
3774 return SplitVectorLoad(SDValue(LN, 0), DAG);
3775
3776 SDValue Ops[2];
3777 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3778
3779 return DAG.getMergeValues(Ops, SDLoc(N));
3780 }
3781
3782 if (!IsFast)
3783 return SDValue();
3784 }
3785
3786 if (!shouldCombineMemoryType(VT))
3787 return SDValue();
3788
3789 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3790
3791 SDValue NewLoad
3792 = DAG.getLoad(NewVT, SL, LN->getChain(),
3793 LN->getBasePtr(), LN->getMemOperand());
3794
3795 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3796 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3797 return SDValue(N, 0);
3798}
3799
3800// Replace store of an illegal type with a store of a bitcast to a friendlier
3801// type.
3802 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3803 DAGCombinerInfo &DCI) const {
3804 if (!DCI.isBeforeLegalize())
3805 return SDValue();
3806
3807 StoreSDNode *SN = cast<StoreSDNode>(N);
3808 if (!SN->isSimple() || !ISD::isNormalStore(SN))
3809 return SDValue();
3810
3811 EVT VT = SN->getMemoryVT();
3812 unsigned Size = VT.getStoreSize();
3813
3814 SDLoc SL(N);
3815 SelectionDAG &DAG = DCI.DAG;
3816 Align Alignment = SN->getAlign();
3817 if (Alignment < Size && isTypeLegal(VT)) {
3818 unsigned IsFast;
3819 unsigned AS = SN->getAddressSpace();
3820
3821 // Expand unaligned stores earlier than legalization. Due to visitation
3822 // order problems during legalization, the emitted instructions to pack and
3823 // unpack the bytes again are not eliminated in the case of an unaligned
3824 // copy.
3825 if (!allowsMisalignedMemoryAccesses(
3826 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3827 if (VT.isVector())
3828 return SplitVectorStore(SDValue(SN, 0), DAG);
3829
3830 return expandUnalignedStore(SN, DAG);
3831 }
3832
3833 if (!IsFast)
3834 return SDValue();
3835 }
3836
3837 if (!shouldCombineMemoryType(VT))
3838 return SDValue();
3839
3840 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3841 SDValue Val = SN->getValue();
3842
3843 //DCI.AddToWorklist(Val.getNode());
3844
3845 bool OtherUses = !Val.hasOneUse();
3846 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3847 if (OtherUses) {
3848 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3849 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3850 }
3851
3852 return DAG.getStore(SN->getChain(), SL, CastVal,
3853 SN->getBasePtr(), SN->getMemOperand());
3854}
3855
3856// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3857// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3858// issues.
3859 SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3860 DAGCombinerInfo &DCI) const {
3861 SelectionDAG &DAG = DCI.DAG;
3862 SDValue N0 = N->getOperand(0);
3863
3864 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3865 // (vt2 (truncate (assertzext vt0:x, vt1)))
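// e.g. (i16 (AssertZext (truncate i32:x), i8)) ->
// (i16 (truncate (AssertZext i32:x, i8))), keeping the known-bits assertion
// on the wider value.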
3866 if (N0.getOpcode() == ISD::TRUNCATE) {
3867 SDValue N1 = N->getOperand(1);
3868 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3869 SDLoc SL(N);
3870
3871 SDValue Src = N0.getOperand(0);
3872 EVT SrcVT = Src.getValueType();
3873 if (SrcVT.bitsGE(ExtVT)) {
3874 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
3875 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
3876 }
3877 }
3878
3879 return SDValue();
3880}
3881
3883 SDNode *