1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
24#include "llvm/IR/IntrinsicsAMDGPU.h"
29
30using namespace llvm;
31
32#include "AMDGPUGenCallingConv.inc"
33
35 "amdgpu-bypass-slow-div",
36 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
37 cl::init(true));
38
39// Find a larger type to do a load / store of a vector with.
41 unsigned StoreSize = VT.getStoreSizeInBits();
42 if (StoreSize <= 32)
43 return EVT::getIntegerVT(Ctx, StoreSize);
44
45 assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
46 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
47}
48
51}
52
54 // In order for this to be a signed 24-bit value, bit 23 must
55 // be a sign bit.
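  // For example, a value known to lie in [-2^23, 2^23) has at most 24
  // significant bits (23 magnitude bits plus the sign), so callers can simply
  // compare the result of this query against 24.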
56 return DAG.ComputeMaxSignificantBits(Op);
57}
58
60 const AMDGPUSubtarget &STI)
61 : TargetLowering(TM), Subtarget(&STI) {
62 // Lower floating point store/load to integer store/load to reduce the number
63 // of patterns in tablegen.
65 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
66
68 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
69
71 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
72
74 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
75
77 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
78
80 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
81
83 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
84
86 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
87
89 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
90
92 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
93
95 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
96
98 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
99
100 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
101 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
102
103 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
104 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
105
107 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
108
110 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
111
113 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
114
116 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
117
119 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
120
122 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
123
125 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
126
128 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
129
131 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
132
134 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
135
136 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
137 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
138
139 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
140 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
141
143 AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
144
145 // There are no 64-bit extloads. These should be done as a 32-bit extload and
146 // an extension to 64-bit.
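  // For example, a sign-extending i8 load producing i64 is handled as a
  // sign-extending i8 load producing i32, followed by a sign extension from
  // i32 to i64.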
147 for (MVT VT : MVT::integer_valuetypes())
149 Expand);
150
151 for (MVT VT : MVT::integer_valuetypes()) {
152 if (VT == MVT::i64)
153 continue;
154
155 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
156 setLoadExtAction(Op, VT, MVT::i1, Promote);
157 setLoadExtAction(Op, VT, MVT::i8, Legal);
158 setLoadExtAction(Op, VT, MVT::i16, Legal);
159 setLoadExtAction(Op, VT, MVT::i32, Expand);
160 }
161 }
162
164 for (auto MemVT :
165 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
167 Expand);
168
169 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
170 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
171 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
172 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
173 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
174 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
175 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
176 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
177
178 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
179 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
180 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
181 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
182 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
183 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
184
185 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
186 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
187 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
188 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
189 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
190 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
191 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
192
194 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
195
197 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
198
200 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
201
203 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
204
206 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
207
209 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
210
212 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
213
215 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
216
218 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
219
221 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
222
224 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
225
227 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
228
230 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
231
233 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
234
236 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
237
239 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
240
242 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
243
245 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
246
248 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
249
251 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
252
254 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
255
257 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
258
260 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
261
263 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
264
266 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
267
269 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
270
272 AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);
273
274 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
275 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
276 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
277 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
278
279 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
280 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
281 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
282 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
283
284 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
285 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
286 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
287 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
288 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
289 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
290 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
291 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
292
293 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
294 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
295 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
296
297 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
298 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
299
300 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
301 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
302 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
303 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
304
305 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
306 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
307 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
308 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
309
310 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
311 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
312
313 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
314 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
 315 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
 317 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
319 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
320
321 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
322 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
323
325
 326 // For R600, this is totally unsupported; just custom lower it to produce an
327 // error.
329
330 // Library functions. These default to Expand, but we have instructions
331 // for them.
334 MVT::f32, Legal);
335
337 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
338
340 Custom);
341
342 setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
343
344 setOperationAction(ISD::FROUNDEVEN, {MVT::f16, MVT::f32, MVT::f64}, Custom);
345
346 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
347
348 if (Subtarget->has16BitInsts())
349 setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
350 else {
351 setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
353 }
354
356
357 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
358 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
359 // default unless marked custom/legal.
362 {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16, MVT::v2f32, MVT::v3f32,
363 MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
364 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64, MVT::v16f64},
365 Custom);
366
367 // Expand to fneg + fadd.
369
371 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
372 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
373 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
374 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
375 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
376 Custom);
379 {MVT::v2f16, MVT::v2i16, MVT::v4f16, MVT::v4i16, MVT::v2f32,
380 MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32, MVT::v4i32,
381 MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32, MVT::v7f32,
382 MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32, MVT::v9i32,
383 MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32,
384 MVT::v12f32, MVT::v16f16, MVT::v16i16, MVT::v16f32, MVT::v16i32,
385 MVT::v32f32, MVT::v32i32, MVT::v2f64, MVT::v2i64, MVT::v3f64,
386 MVT::v3i64, MVT::v4f64, MVT::v4i64, MVT::v8f64, MVT::v8i64,
387 MVT::v16f64, MVT::v16i64},
388 Custom);
389
391 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
392
393 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
394 for (MVT VT : ScalarIntVTs) {
395 // These should use [SU]DIVREM, so set them to expand
397 Expand);
398
 399 // The GPU does not have a divrem function for signed or unsigned values.
401
402 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
404
406
407 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
409 }
410
411 // The hardware supports 32-bit FSHR, but not FSHL.
413
414 // The hardware supports 32-bit ROTR, but not ROTL.
415 setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
417
419
423 MVT::i64, Custom);
425
427 Legal);
428
431 MVT::i64, Custom);
432
433 static const MVT::SimpleValueType VectorIntTypes[] = {
434 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
435 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
436
437 for (MVT VT : VectorIntTypes) {
438 // Expand the following operations for the current type by default.
450 ISD::SETCC},
451 VT, Expand);
452 }
453
454 static const MVT::SimpleValueType FloatVectorTypes[] = {
455 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
456 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
457
458 for (MVT VT : FloatVectorTypes) {
468 VT, Expand);
469 }
470
 471 // This causes an unrolled select operation to be used rather than expansion with
472 // bit operations. This is in general better, but the alternative using BFI
473 // instructions may be better if the select sources are SGPRs.
475 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
476
478 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
479
481 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
482
484 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
485
487 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
488
490 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
491
493 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
494
496 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
497
499 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
500
502 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
503
504 // There are no libcalls of any kind.
505 for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
506 setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
507
509 setJumpIsExpensive(true);
510
511 // FIXME: This is only partially true. If we have to do vector compares, any
512 // SGPR pair can be a condition register. If we have a uniform condition, we
513 // are better off doing SALU operations, where there is only one SCC. For now,
514 // we don't have a way of knowing during instruction selection if a condition
515 // will be uniform and we always use vector compares. Assume we are using
516 // vector compares until that is fixed.
518
521
523
524 // We want to find all load dependencies for long chains of stores to enable
525 // merging into very wide vectors. The problem is with vectors with > 4
526 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
527 // vectors are a legal type, even though we have to split the loads
528 // usually. When we can more precisely specify load legality per address
529 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
530 // smarter so that they can figure out what to do in 2 iterations without all
531 // N > 4 stores on the same chain.
533
534 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
535 // about these during lowering.
536 MaxStoresPerMemcpy = 0xffffffff;
537 MaxStoresPerMemmove = 0xffffffff;
538 MaxStoresPerMemset = 0xffffffff;
539
540 // The expansion for 64-bit division is enormous.
542 addBypassSlowDiv(64, 32);
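  // Roughly, the generic bypass transform then guards each 64-bit udiv/urem
  // with a run-time check that both operands fit in 32 bits
  // ((a | b) >> 32 == 0) and dispatches to a single 32-bit divide when they
  // do. This is a sketch of the generic bypass, not target-specific code.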
543
554}
555
557 if (getTargetMachine().Options.NoSignedZerosFPMath)
558 return true;
559
560 const auto Flags = Op.getNode()->getFlags();
561 if (Flags.hasNoSignedZeros())
562 return true;
563
564 return false;
565}
566
567//===----------------------------------------------------------------------===//
568// Target Information
569//===----------------------------------------------------------------------===//
570
572static bool fnegFoldsIntoOpcode(unsigned Opc) {
573 switch (Opc) {
574 case ISD::FADD:
575 case ISD::FSUB:
576 case ISD::FMUL:
577 case ISD::FMA:
578 case ISD::FMAD:
579 case ISD::FMINNUM:
580 case ISD::FMAXNUM:
583 case ISD::SELECT:
584 case ISD::FSIN:
585 case ISD::FTRUNC:
586 case ISD::FRINT:
587 case ISD::FNEARBYINT:
589 case AMDGPUISD::RCP:
596 case AMDGPUISD::FMED3:
597 // TODO: handle llvm.amdgcn.fma.legacy
598 return true;
599 case ISD::BITCAST:
600 llvm_unreachable("bitcast is special cased");
601 default:
602 return false;
603 }
604}
605
606static bool fnegFoldsIntoOp(const SDNode *N) {
607 unsigned Opc = N->getOpcode();
608 if (Opc == ISD::BITCAST) {
609 // TODO: Is there a benefit to checking the conditions performFNegCombine
610 // does? We don't for the other cases.
611 SDValue BCSrc = N->getOperand(0);
612 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
613 return BCSrc.getNumOperands() == 2 &&
614 BCSrc.getOperand(1).getValueSizeInBits() == 32;
615 }
616
617 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
618 }
619
620 return fnegFoldsIntoOpcode(Opc);
621}
622
623/// \returns true if the operation will definitely need to use a 64-bit
624/// encoding, and thus will use a VOP3 encoding regardless of the source
625/// modifiers.
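///
/// For example, a three-operand f32 FMA is always encoded as VOP3, so folding
/// an fneg into one of its sources costs nothing; a two-operand f32 add can
/// use the shorter VOP2 encoding, which it would have to give up in order to
/// carry a source modifier.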
627static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
628 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
629 VT == MVT::f64;
630}
631
632/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers on the
633/// given type for ISD::SELECT.
635static bool selectSupportsSourceMods(const SDNode *N) {
636 // TODO: Only applies if select will be vector
637 return N->getValueType(0) == MVT::f32;
638}
639
640// Most FP instructions support source modifiers, but this could be refined
641// slightly.
643static bool hasSourceMods(const SDNode *N) {
644 if (isa<MemSDNode>(N))
645 return false;
646
647 switch (N->getOpcode()) {
648 case ISD::CopyToReg:
649 case ISD::FDIV:
650 case ISD::FREM:
651 case ISD::INLINEASM:
655
656 // TODO: Should really be looking at the users of the bitcast. These are
657 // problematic because bitcasts are used to legalize all stores to integer
658 // types.
659 case ISD::BITCAST:
660 return false;
662 switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
663 case Intrinsic::amdgcn_interp_p1:
664 case Intrinsic::amdgcn_interp_p2:
665 case Intrinsic::amdgcn_interp_mov:
666 case Intrinsic::amdgcn_interp_p1_f16:
667 case Intrinsic::amdgcn_interp_p2_f16:
668 return false;
669 default:
670 return true;
671 }
672 }
673 case ISD::SELECT:
675 default:
676 return true;
677 }
678}
679
681 unsigned CostThreshold) {
682 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
683 // it is truly free to use a source modifier in all cases. If there are
 684 // multiple users and each one would be forced into VOP3, there will be
685 // a code size increase. Try to avoid increasing code size unless we know it
686 // will save on the instruction count.
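  // For example, a single fneg feeding one FMA folds for free (the FMA is
  // already VOP3), but the same fneg feeding several f32 adds would push each
  // add from the 32-bit VOP2 encoding into the 64-bit VOP3 encoding.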
687 unsigned NumMayIncreaseSize = 0;
688 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
689
690 assert(!N->use_empty());
691
692 // XXX - Should this limit number of uses to check?
693 for (const SDNode *U : N->uses()) {
694 if (!hasSourceMods(U))
695 return false;
696
697 if (!opMustUseVOP3Encoding(U, VT)) {
698 if (++NumMayIncreaseSize > CostThreshold)
699 return false;
700 }
701 }
702
703 return true;
704}
705
707 ISD::NodeType ExtendKind) const {
708 assert(!VT.isVector() && "only scalar expected");
709
710 // Round to the next multiple of 32-bits.
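  // For example, an i1 or i24 return value is widened to i32, and an i40
  // value is widened to i64 (i.e. two 32-bit registers).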
711 unsigned Size = VT.getSizeInBits();
712 if (Size <= 32)
713 return MVT::i32;
714 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
715}
716
718 return MVT::i32;
719}
720
722 return true;
723}
724
725// The backend supports 32- and 64-bit floating-point immediates.
726// FIXME: Why are we reporting vectors of FP immediates as legal?
728 bool ForCodeSize) const {
729 EVT ScalarVT = VT.getScalarType();
730 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
731 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
732}
733
734// We don't want to shrink f64 / f32 constants.
736 EVT ScalarVT = VT.getScalarType();
737 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
738}
739
741 ISD::LoadExtType ExtTy,
742 EVT NewVT) const {
743 // TODO: This may be worth removing. Check regression tests for diffs.
745 return false;
746
747 unsigned NewSize = NewVT.getStoreSizeInBits();
748
749 // If we are reducing to a 32-bit load or a smaller multi-dword load,
750 // this is always better.
751 if (NewSize >= 32)
752 return true;
753
754 EVT OldVT = N->getValueType(0);
755 unsigned OldSize = OldVT.getStoreSizeInBits();
756
757 MemSDNode *MN = cast<MemSDNode>(N);
758 unsigned AS = MN->getAddressSpace();
759 // Do not shrink an aligned scalar load to sub-dword.
760 // Scalar engine cannot do sub-dword loads.
761 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
764 (isa<LoadSDNode>(N) && AS == AMDGPUAS::GLOBAL_ADDRESS &&
765 MN->isInvariant())) &&
767 return false;
768
769 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
770 // extloads, so doing one requires using a buffer_load. In cases where we
771 // still couldn't use a scalar load, using the wider load shouldn't really
772 // hurt anything.
773
774 // If the old size already had to be an extload, there's no harm in continuing
775 // to reduce the width.
776 return (OldSize < 32);
777}
778
780 const SelectionDAG &DAG,
781 const MachineMemOperand &MMO) const {
782
783 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
784
785 if (LoadTy.getScalarType() == MVT::i32)
786 return false;
787
788 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
789 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
790
791 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
792 return false;
793
794 unsigned Fast = 0;
796 CastTy, MMO, &Fast) &&
797 Fast;
798}
799
800// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
801// profitable with the expansion for 64-bit since it's generally good to
802// speculate things.
804 return true;
805}
806
808 return true;
809}
810
812 switch (N->getOpcode()) {
813 case ISD::EntryToken:
814 case ISD::TokenFactor:
815 return true;
817 unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
818 switch (IntrID) {
819 case Intrinsic::amdgcn_readfirstlane:
820 case Intrinsic::amdgcn_readlane:
821 return true;
822 }
823 return false;
824 }
825 case ISD::LOAD:
826 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
828 return true;
829 return false;
830 case AMDGPUISD::SETCC: // ballot-style instruction
831 return true;
832 }
833 return false;
834}
835
837 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
838 NegatibleCost &Cost, unsigned Depth) const {
839
840 switch (Op.getOpcode()) {
841 case ISD::FMA:
842 case ISD::FMAD: {
843 // Negating a fma is not free if it has users without source mods.
844 if (!allUsesHaveSourceMods(Op.getNode()))
845 return SDValue();
846 break;
847 }
848 case AMDGPUISD::RCP: {
849 SDValue Src = Op.getOperand(0);
850 EVT VT = Op.getValueType();
851 SDLoc SL(Op);
852
853 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
854 ForCodeSize, Cost, Depth + 1);
855 if (NegSrc)
856 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
857 return SDValue();
858 }
859 default:
860 break;
861 }
862
863 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
864 ForCodeSize, Cost, Depth);
865}
866
867//===---------------------------------------------------------------------===//
868// Target Properties
869//===---------------------------------------------------------------------===//
870
873
874 // Packed operations do not have a fabs modifier.
875 return VT == MVT::f32 || VT == MVT::f64 ||
876 (Subtarget->has16BitInsts() && VT == MVT::f16);
877}
878
881 // Report this based on the end legalized type.
882 VT = VT.getScalarType();
883 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
884}
885
887 unsigned NumElem,
888 unsigned AS) const {
889 return true;
890}
891
893 // There are few operations which truly have vector input operands. Any vector
894 // operation is going to involve operations on each component, and a
895 // build_vector will be a copy per element, so it always makes sense to use a
896 // build_vector input in place of the extracted element to avoid a copy into a
897 // super register.
898 //
899 // We should probably only do this if all users are extracts only, but this
900 // should be the common case.
901 return true;
902}
903
905 // Truncate is just accessing a subregister.
906
907 unsigned SrcSize = Source.getSizeInBits();
908 unsigned DestSize = Dest.getSizeInBits();
909
 910 return DestSize < SrcSize && DestSize % 32 == 0;
911}
912
914 // Truncate is just accessing a subregister.
915
916 unsigned SrcSize = Source->getScalarSizeInBits();
917 unsigned DestSize = Dest->getScalarSizeInBits();
918
 919 if (DestSize == 16 && Subtarget->has16BitInsts())
920 return SrcSize >= 32;
921
922 return DestSize < SrcSize && DestSize % 32 == 0;
923}
924
926 unsigned SrcSize = Src->getScalarSizeInBits();
927 unsigned DestSize = Dest->getScalarSizeInBits();
928
929 if (SrcSize == 16 && Subtarget->has16BitInsts())
930 return DestSize >= 32;
931
932 return SrcSize == 32 && DestSize == 64;
933}
934
936 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
937 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
 938 // this will enable reducing 64-bit operations to 32-bit, which is always
939 // good.
940
941 if (Src == MVT::i16)
 942 return Dest == MVT::i32 || Dest == MVT::i64;
943
944 return Src == MVT::i32 && Dest == MVT::i64;
945}
946
948 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
949 // limited number of native 64-bit operations. Shrinking an operation to fit
950 // in a single 32-bit register should always be helpful. As currently used,
951 // this is much less general than the name suggests, and is only used in
952 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
953 // not profitable, and may actually be harmful.
954 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
955}
956
958 const SDNode* N, CombineLevel Level) const {
959 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
960 N->getOpcode() == ISD::SRL) &&
961 "Expected shift op");
962 // Always commute pre-type legalization and right shifts.
963 // We're looking for shl(or(x,y),z) patterns.
965 N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
966 return true;
967
 968 // If the only user is an i32 right-shift, then don't destroy a BFE pattern.
969 if (N->getValueType(0) == MVT::i32 && N->use_size() == 1 &&
970 (N->use_begin()->getOpcode() == ISD::SRA ||
971 N->use_begin()->getOpcode() == ISD::SRL))
972 return false;
973
974 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
975 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
976 if (LHS.getOpcode() != ISD::SHL)
977 return false;
978 auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
979 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
980 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
981 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
982 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
983 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
984 };
985 SDValue LHS = N->getOperand(0).getOperand(0);
986 SDValue RHS = N->getOperand(0).getOperand(1);
987 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
988}
989
990//===---------------------------------------------------------------------===//
991// TargetLowering Callbacks
992//===---------------------------------------------------------------------===//
993
995 bool IsVarArg) {
996 switch (CC) {
1004 return CC_AMDGPU;
1007 return CC_AMDGPU_CS_CHAIN;
1008 case CallingConv::C:
1009 case CallingConv::Fast:
1010 case CallingConv::Cold:
1011 return CC_AMDGPU_Func;
1013 return CC_SI_Gfx;
1016 default:
1017 report_fatal_error("Unsupported calling convention for call");
1018 }
1019}
1020
1022 bool IsVarArg) {
1023 switch (CC) {
1026 llvm_unreachable("kernels should not be handled here");
1036 return RetCC_SI_Shader;
1038 return RetCC_SI_Gfx;
1039 case CallingConv::C:
1040 case CallingConv::Fast:
1041 case CallingConv::Cold:
1042 return RetCC_AMDGPU_Func;
1043 default:
1044 report_fatal_error("Unsupported calling convention.");
1045 }
1046}
1047
1048/// The SelectionDAGBuilder will automatically promote function arguments
1049/// with illegal types. However, this does not work for the AMDGPU targets
1050/// since the function arguments are stored in memory as these illegal types.
1051/// In order to handle this properly we need to get the original type sizes
1052/// from the LLVM IR Function and fix up the ISD::InputArg values before
1053/// passing them to AnalyzeFormalArguments().
1054
1055/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1056/// input values across multiple registers. Each item in the Ins array
1057/// represents a single value that will be stored in registers. Ins[x].VT is
1058/// the value type of the value that will be stored in the register, so
1059/// whatever SDNode we lower the argument to needs to be this type.
1060///
1061/// In order to correctly lower the arguments we need to know the size of each
1062/// argument. Since Ins[x].VT gives us the size of the register that will
1063/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1064/// for the original function argument so that we can deduce the correct memory
1065/// type to use for Ins[x]. In most cases the correct memory type will be
1066/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1067/// we have a kernel argument of type v8i8, this argument will be split into
1068/// 8 parts and each part will be represented by its own item in the Ins array.
1069/// For each part, Ins[x].ArgVT will be v8i8, which is the full type of
1070/// the argument before it was split. From this, we deduce that the memory type
1071/// for each individual part is i8. We pass the memory type as LocVT to the
1072/// calling convention analysis function and the register type (Ins[x].VT) as
1073/// the ValVT.
1075 CCState &State,
1076 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1077 const MachineFunction &MF = State.getMachineFunction();
1078 const Function &Fn = MF.getFunction();
1079 LLVMContext &Ctx = Fn.getParent()->getContext();
1080 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
1081 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
1083
1084 Align MaxAlign = Align(1);
1085 uint64_t ExplicitArgOffset = 0;
1086 const DataLayout &DL = Fn.getParent()->getDataLayout();
1087
1088 unsigned InIndex = 0;
1089
1090 for (const Argument &Arg : Fn.args()) {
1091 const bool IsByRef = Arg.hasByRefAttr();
1092 Type *BaseArgTy = Arg.getType();
1093 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1094 Align Alignment = DL.getValueOrABITypeAlignment(
1095 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1096 MaxAlign = std::max(Alignment, MaxAlign);
1097 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1098
1099 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1100 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
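    // Worked example (assuming ExplicitOffset == 0): for an i32 argument
    // followed by an i64 argument, the i32 lands at offset 0, the running
    // offset becomes 4, the i64 is aligned up to offset 8, and the running
    // offset ends at 16.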
1101
1102 // We're basically throwing away everything passed into us and starting over
1103 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1104 // to us as computed in Ins.
1105 //
1106 // We also need to figure out what type legalization is trying to do to get
1107 // the correct memory offsets.
1108
1109 SmallVector<EVT, 16> ValueVTs;
1111 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
1112
1113 for (unsigned Value = 0, NumValues = ValueVTs.size();
1114 Value != NumValues; ++Value) {
1115 uint64_t BasePartOffset = Offsets[Value];
1116
1117 EVT ArgVT = ValueVTs[Value];
1118 EVT MemVT = ArgVT;
1119 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1120 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1121
1122 if (NumRegs == 1) {
1123 // This argument is not split, so the IR type is the memory type.
1124 if (ArgVT.isExtended()) {
1125 // We have an extended type, like i24, so we should just use the
1126 // register type.
1127 MemVT = RegisterVT;
1128 } else {
1129 MemVT = ArgVT;
1130 }
1131 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1132 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1133 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1134 // We have a vector value which has been split into a vector with
1135 // the same scalar type, but fewer elements. This should handle
1136 // all the floating-point vector types.
1137 MemVT = RegisterVT;
1138 } else if (ArgVT.isVector() &&
1139 ArgVT.getVectorNumElements() == NumRegs) {
1140 // This arg has been split so that each element is stored in a separate
1141 // register.
1142 MemVT = ArgVT.getScalarType();
1143 } else if (ArgVT.isExtended()) {
1144 // We have an extended type, like i65.
1145 MemVT = RegisterVT;
1146 } else {
1147 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1148 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1149 if (RegisterVT.isInteger()) {
1150 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1151 } else if (RegisterVT.isVector()) {
1152 assert(!RegisterVT.getScalarType().isFloatingPoint());
1153 unsigned NumElements = RegisterVT.getVectorNumElements();
1154 assert(MemoryBits % NumElements == 0);
1155 // This vector type has been split into another vector type with
 1156 // a different element size.
1157 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1158 MemoryBits / NumElements);
1159 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1160 } else {
1161 llvm_unreachable("cannot deduce memory type.");
1162 }
1163 }
1164
 1165 // Convert one-element vectors to scalars.
1166 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1167 MemVT = MemVT.getScalarType();
1168
1169 // Round up vec3/vec5 argument.
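      // For example, a v3i32 part is rounded up to v4i32 and a v5i32 part is
      // rounded up to v8i32.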
1170 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1171 assert(MemVT.getVectorNumElements() == 3 ||
1172 MemVT.getVectorNumElements() == 5 ||
1173 (MemVT.getVectorNumElements() >= 9 &&
1174 MemVT.getVectorNumElements() <= 12));
1175 MemVT = MemVT.getPow2VectorType(State.getContext());
1176 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1177 MemVT = MemVT.getRoundIntegerType(State.getContext());
1178 }
1179
1180 unsigned PartOffset = 0;
1181 for (unsigned i = 0; i != NumRegs; ++i) {
1182 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1183 BasePartOffset + PartOffset,
1184 MemVT.getSimpleVT(),
1186 PartOffset += MemVT.getStoreSize();
1187 }
1188 }
1189 }
1190}
1191
1193 SDValue Chain, CallingConv::ID CallConv,
1194 bool isVarArg,
1196 const SmallVectorImpl<SDValue> &OutVals,
1197 const SDLoc &DL, SelectionDAG &DAG) const {
1198 // FIXME: Fails for r600 tests
1199 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1200 // "wave terminate should not have return values");
1201 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1202}
1203
1204//===---------------------------------------------------------------------===//
1205// Target specific lowering
1206//===---------------------------------------------------------------------===//
1207
1208/// Selects the correct CCAssignFn for a given CallingConvention value.
1210 bool IsVarArg) {
1211 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1212}
1213
1215 bool IsVarArg) {
1217}
1218
1220 SelectionDAG &DAG,
1221 MachineFrameInfo &MFI,
1222 int ClobberedFI) const {
1223 SmallVector<SDValue, 8> ArgChains;
1224 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1225 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1226
1227 // Include the original chain at the beginning of the list. When this is
 1228 // used by target LowerCall hooks, this helps the legalizer find the
1229 // CALLSEQ_BEGIN node.
1230 ArgChains.push_back(Chain);
1231
 1232 // Add a chain value for each stack argument load that may alias the clobbered object.
1233 for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
1234 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1235 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1236 if (FI->getIndex() < 0) {
1237 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1238 int64_t InLastByte = InFirstByte;
1239 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1240
1241 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1242 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1243 ArgChains.push_back(SDValue(L, 1));
1244 }
1245 }
1246 }
1247 }
1248
1249 // Build a tokenfactor for all the chains.
1250 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1251}
1252
1255 StringRef Reason) const {
1256 SDValue Callee = CLI.Callee;
1257 SelectionDAG &DAG = CLI.DAG;
1258
1259 const Function &Fn = DAG.getMachineFunction().getFunction();
1260
1261 StringRef FuncName("<unknown>");
1262
1263 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1264 FuncName = G->getSymbol();
1265 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1266 FuncName = G->getGlobal()->getName();
1267
1269 Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1270 DAG.getContext()->diagnose(NoCalls);
1271
1272 if (!CLI.IsTailCall) {
1273 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
1274 InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
1275 }
1276
1277 return DAG.getEntryNode();
1278}
1279
1281 SmallVectorImpl<SDValue> &InVals) const {
1282 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1283}
1284
1286 SelectionDAG &DAG) const {
1287 const Function &Fn = DAG.getMachineFunction().getFunction();
1288
1289 DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1290 SDLoc(Op).getDebugLoc());
1291 DAG.getContext()->diagnose(NoDynamicAlloca);
1292 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1293 return DAG.getMergeValues(Ops, SDLoc());
1294}
1295
1297 SelectionDAG &DAG) const {
1298 switch (Op.getOpcode()) {
1299 default:
1300 Op->print(errs(), &DAG);
1301 llvm_unreachable("Custom lowering code for this "
1302 "instruction is not implemented yet!");
1303 break;
1305 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1307 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1308 case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1309 case ISD::FREM: return LowerFREM(Op, DAG);
1310 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1311 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1312 case ISD::FRINT: return LowerFRINT(Op, DAG);
1313 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1314 case ISD::FROUNDEVEN:
1315 return LowerFROUNDEVEN(Op, DAG);
1316 case ISD::FROUND: return LowerFROUND(Op, DAG);
1317 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1318 case ISD::FLOG2:
1319 return LowerFLOG2(Op, DAG);
1320 case ISD::FLOG:
1321 case ISD::FLOG10:
1322 return LowerFLOGCommon(Op, DAG);
1323 case ISD::FEXP:
1324 return lowerFEXP(Op, DAG);
1325 case ISD::FEXP2:
1326 return lowerFEXP2(Op, DAG);
1327 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1328 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1329 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1330 case ISD::FP_TO_SINT:
1331 case ISD::FP_TO_UINT:
1332 return LowerFP_TO_INT(Op, DAG);
1333 case ISD::CTTZ:
1335 case ISD::CTLZ:
1337 return LowerCTLZ_CTTZ(Op, DAG);
1339 }
1340 return Op;
1341}
1342
1345 SelectionDAG &DAG) const {
1346 switch (N->getOpcode()) {
1348 // Different parts of legalization seem to interpret which type of
1349 // sign_extend_inreg is the one to check for custom lowering. The extended
1350 // from type is what really matters, but some places check for custom
1351 // lowering of the result type. This results in trying to use
1352 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1353 // nothing here and let the illegal result integer be handled normally.
1354 return;
1355 case ISD::FLOG2:
1356 if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
1357 Results.push_back(Lowered);
1358 return;
1359 case ISD::FLOG:
1360 case ISD::FLOG10:
1361 if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
1362 Results.push_back(Lowered);
1363 return;
1364 case ISD::FEXP2:
1365 if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
1366 Results.push_back(Lowered);
1367 return;
1368 case ISD::FEXP:
1369 if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
1370 Results.push_back(Lowered);
1371 return;
1372 default:
1373 return;
1374 }
1375}
1376
1378 SDValue Op,
1379 SelectionDAG &DAG) const {
1380
1381 const DataLayout &DL = DAG.getDataLayout();
1382 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1383 const GlobalValue *GV = G->getGlobal();
1384
1385 if (!MFI->isModuleEntryFunction()) {
1386 if (std::optional<uint32_t> Address =
1388 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1389 }
1390 }
1391
1392 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1393 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1394 if (!MFI->isModuleEntryFunction() &&
1395 !GV->getName().equals("llvm.amdgcn.module.lds")) {
1396 SDLoc DL(Op);
1397 const Function &Fn = DAG.getMachineFunction().getFunction();
1398 DiagnosticInfoUnsupported BadLDSDecl(
1399 Fn, "local memory global used by non-kernel function",
1400 DL.getDebugLoc(), DS_Warning);
1401 DAG.getContext()->diagnose(BadLDSDecl);
1402
1403 // We currently don't have a way to correctly allocate LDS objects that
1404 // aren't directly associated with a kernel. We do force inlining of
1405 // functions that use local objects. However, if these dead functions are
1406 // not eliminated, we don't want a compile time error. Just emit a warning
1407 // and a trap, since there should be no callable path here.
1408 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1409 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1410 Trap, DAG.getRoot());
1411 DAG.setRoot(OutputChain);
1412 return DAG.getUNDEF(Op.getValueType());
1413 }
1414
1415 // XXX: What does the value of G->getOffset() mean?
1416 assert(G->getOffset() == 0 &&
1417 "Do not know what to do with an non-zero offset");
1418
1419 // TODO: We could emit code to handle the initialization somewhere.
1420 // We ignore the initializer for now and legalize it to allow selection.
 1421 // The initializer will be rejected during assembly emission anyway.
1422 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1423 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1424 }
1425 return SDValue();
1426}
1427
1429 SelectionDAG &DAG) const {
1431 SDLoc SL(Op);
1432
1433 EVT VT = Op.getValueType();
1434 if (VT.getVectorElementType().getSizeInBits() < 32) {
1435 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1436 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1437 unsigned NewNumElt = OpBitSize / 32;
1438 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1440 MVT::i32, NewNumElt);
1441 for (const SDUse &U : Op->ops()) {
1442 SDValue In = U.get();
1443 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
1444 if (NewNumElt > 1)
1445 DAG.ExtractVectorElements(NewIn, Args);
1446 else
1447 Args.push_back(NewIn);
1448 }
1449
1450 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1451 NewNumElt * Op.getNumOperands());
1452 SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1453 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1454 }
1455 }
1456
1457 for (const SDUse &U : Op->ops())
1458 DAG.ExtractVectorElements(U.get(), Args);
1459
1460 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1461}
1462
1464 SelectionDAG &DAG) const {
1465 SDLoc SL(Op);
1467 unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1468 EVT VT = Op.getValueType();
1469 EVT SrcVT = Op.getOperand(0).getValueType();
1470
1471 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1472 unsigned NumElt = VT.getVectorNumElements();
1473 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1474 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1475
1476 // Extract 32-bit registers at a time.
1477 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1478 EVT NewVT = NumElt == 2
1479 ? MVT::i32
1480 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1481 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1482
1483 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
1484 if (NumElt == 2)
1485 Tmp = Args[0];
1486 else
1487 Tmp = DAG.getBuildVector(NewVT, SL, Args);
1488
1489 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1490 }
1491
1492 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1494
1495 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1496}
1497
1498// TODO: Handle fabs too
1500 if (Val.getOpcode() == ISD::FNEG)
1501 return Val.getOperand(0);
1502
1503 return Val;
1504}
1505
1507 if (Val.getOpcode() == ISD::FNEG)
1508 Val = Val.getOperand(0);
1509 if (Val.getOpcode() == ISD::FABS)
1510 Val = Val.getOperand(0);
1511 if (Val.getOpcode() == ISD::FCOPYSIGN)
1512 Val = Val.getOperand(0);
1513 return Val;
1514}
1515
1517 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1518 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1519 SelectionDAG &DAG = DCI.DAG;
1520 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1521 switch (CCOpcode) {
1522 case ISD::SETOEQ:
1523 case ISD::SETONE:
1524 case ISD::SETUNE:
1525 case ISD::SETNE:
1526 case ISD::SETUEQ:
1527 case ISD::SETEQ:
1528 case ISD::SETFALSE:
1529 case ISD::SETFALSE2:
1530 case ISD::SETTRUE:
1531 case ISD::SETTRUE2:
1532 case ISD::SETUO:
1533 case ISD::SETO:
1534 break;
1535 case ISD::SETULE:
1536 case ISD::SETULT: {
1537 if (LHS == True)
1538 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1539 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1540 }
1541 case ISD::SETOLE:
1542 case ISD::SETOLT:
1543 case ISD::SETLE:
1544 case ISD::SETLT: {
1545 // Ordered. Assume ordered for undefined.
1546
1547 // Only do this after legalization to avoid interfering with other combines
1548 // which might occur.
1550 !DCI.isCalledByLegalizer())
1551 return SDValue();
1552
1553 // We need to permute the operands to get the correct NaN behavior. The
1554 // selected operand is the second one based on the failing compare with NaN,
1555 // so permute it based on the compare type the hardware uses.
1556 if (LHS == True)
1557 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1558 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1559 }
1560 case ISD::SETUGE:
1561 case ISD::SETUGT: {
1562 if (LHS == True)
1563 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1564 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1565 }
1566 case ISD::SETGT:
1567 case ISD::SETGE:
1568 case ISD::SETOGE:
1569 case ISD::SETOGT: {
1571 !DCI.isCalledByLegalizer())
1572 return SDValue();
1573
1574 if (LHS == True)
1575 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1576 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1577 }
1578 case ISD::SETCC_INVALID:
1579 llvm_unreachable("Invalid setcc condcode!");
1580 }
1581 return SDValue();
1582}
1583
1584/// Generate Min/Max node
1586 SDValue LHS, SDValue RHS,
1587 SDValue True, SDValue False,
1588 SDValue CC,
1589 DAGCombinerInfo &DCI) const {
1590 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1591 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1592
1593 SelectionDAG &DAG = DCI.DAG;
1594
1595 // If we can't directly match this, try to see if we can fold an fneg to
1596 // match.
1597
1598 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
1599 ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False);
1600 SDValue NegTrue = peekFNeg(True);
1601
1602 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1603 // fmin/fmax.
1604 //
1605 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1606 // -> fneg (fmin_legacy lhs, K)
1607 //
1608 // TODO: Use getNegatedExpression
1609 if (LHS == NegTrue && CFalse && CRHS) {
1610 APFloat NegRHS = neg(CRHS->getValueAPF());
1611 if (NegRHS == CFalse->getValueAPF()) {
1612 SDValue Combined =
1613 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1614 if (Combined)
1615 return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1616 return SDValue();
1617 }
1618 }
1619
1620 return SDValue();
1621}
1622
1623std::pair<SDValue, SDValue>
1625 SDLoc SL(Op);
1626
1627 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1628
1629 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1630 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1631
1632 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1633 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1634
1635 return std::pair(Lo, Hi);
1636}
1637
1639 SDLoc SL(Op);
1640
1641 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1642 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1643 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1644}
1645
1647 SDLoc SL(Op);
1648
1649 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1650 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1651 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1652}
1653
1654// Split a vector type into two parts. The first part is a power of two vector.
1655// The second part is whatever is left over, and is a scalar if it would
1656// otherwise be a 1-vector.
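// For example, v3f32 splits into (v2f32, f32) and v8i16 splits into
// (v4i16, v4i16).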
1657std::pair<EVT, EVT>
1659 EVT LoVT, HiVT;
1660 EVT EltVT = VT.getVectorElementType();
1661 unsigned NumElts = VT.getVectorNumElements();
1662 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1663 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1664 HiVT = NumElts - LoNumElts == 1
1665 ? EltVT
1666 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1667 return std::pair(LoVT, HiVT);
1668}
1669
1670// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1671// scalar.
1672std::pair<SDValue, SDValue>
1674 const EVT &LoVT, const EVT &HiVT,
1675 SelectionDAG &DAG) const {
1677 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1678 N.getValueType().getVectorNumElements() &&
1679 "More vector elements requested than available!");
1681 DAG.getVectorIdxConstant(0, DL));
1682 SDValue Hi = DAG.getNode(
1684 HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
1685 return std::pair(Lo, Hi);
1686}
1687
1689 SelectionDAG &DAG) const {
1690 LoadSDNode *Load = cast<LoadSDNode>(Op);
1691 EVT VT = Op.getValueType();
1692 SDLoc SL(Op);
1693
1694
1695 // If this is a 2 element vector, we really want to scalarize and not create
1696 // weird 1 element vectors.
1697 if (VT.getVectorNumElements() == 2) {
1698 SDValue Ops[2];
1699 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1700 return DAG.getMergeValues(Ops, SL);
1701 }
1702
1703 SDValue BasePtr = Load->getBasePtr();
1704 EVT MemVT = Load->getMemoryVT();
1705
1706 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1707
1708 EVT LoVT, HiVT;
1709 EVT LoMemVT, HiMemVT;
1710 SDValue Lo, Hi;
1711
1712 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1713 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1714 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1715
1716 unsigned Size = LoMemVT.getStoreSize();
1717 Align BaseAlign = Load->getAlign();
1718 Align HiAlign = commonAlignment(BaseAlign, Size);
1719
1720 SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1721 Load->getChain(), BasePtr, SrcValue, LoMemVT,
1722 BaseAlign, Load->getMemOperand()->getFlags());
1723 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Size));
1724 SDValue HiLoad =
1725 DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1726 HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1727 HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1728
1729 SDValue Join;
1730 if (LoVT == HiVT) {
 1731 // This is the case where the vector is a power of two, so it was split evenly.
1732 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1733 } else {
1734 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1735 DAG.getVectorIdxConstant(0, SL));
1736 Join = DAG.getNode(
1738 VT, Join, HiLoad,
1740 }
1741
1742 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1743 LoLoad.getValue(1), HiLoad.getValue(1))};
1744
1745 return DAG.getMergeValues(Ops, SL);
1746}
1747
1749 SelectionDAG &DAG) const {
1750 LoadSDNode *Load = cast<LoadSDNode>(Op);
1751 EVT VT = Op.getValueType();
1752 SDValue BasePtr = Load->getBasePtr();
1753 EVT MemVT = Load->getMemoryVT();
1754 SDLoc SL(Op);
1755 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1756 Align BaseAlign = Load->getAlign();
1757 unsigned NumElements = MemVT.getVectorNumElements();
1758
1759 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1760 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1761 if (NumElements != 3 ||
1762 (BaseAlign < Align(8) &&
1763 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1764 return SplitVectorLoad(Op, DAG);
1765
1766 assert(NumElements == 3);
1767
1768 EVT WideVT =
1770 EVT WideMemVT =
1772 SDValue WideLoad = DAG.getExtLoad(
1773 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1774 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1775 return DAG.getMergeValues(
1776 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1777 DAG.getVectorIdxConstant(0, SL)),
1778 WideLoad.getValue(1)},
1779 SL);
1780}
1781
1783 SelectionDAG &DAG) const {
1784 StoreSDNode *Store = cast<StoreSDNode>(Op);
1785 SDValue Val = Store->getValue();
1786 EVT VT = Val.getValueType();
1787
1788 // If this is a 2 element vector, we really want to scalarize and not create
1789 // weird 1 element vectors.
1790 if (VT.getVectorNumElements() == 2)
1791 return scalarizeVectorStore(Store, DAG);
1792
1793 EVT MemVT = Store->getMemoryVT();
1794 SDValue Chain = Store->getChain();
1795 SDValue BasePtr = Store->getBasePtr();
1796 SDLoc SL(Op);
1797
1798 EVT LoVT, HiVT;
1799 EVT LoMemVT, HiMemVT;
1800 SDValue Lo, Hi;
1801
1802 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1803 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1804 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1805
1806 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1807
1808 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1809 Align BaseAlign = Store->getAlign();
1810 unsigned Size = LoMemVT.getStoreSize();
1811 Align HiAlign = commonAlignment(BaseAlign, Size);
1812
1813 SDValue LoStore =
1814 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1815 Store->getMemOperand()->getFlags());
1816 SDValue HiStore =
1817 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1818 HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1819
1820 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1821}
1822
1823// This is a shortcut for integer division because we have fast i32<->f32
1824// conversions, and fast f32 reciprocal instructions. The fractional part of a
1825// float is enough to accurately represent up to a 24-bit signed integer.
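// An f32 significand holds 24 bits (23 stored plus the implicit leading one),
// so every integer with magnitude below 2^24 is represented exactly; this is
// why the code below requires at least 9 sign bits on each 32-bit operand.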
1827 bool Sign) const {
1828 SDLoc DL(Op);
1829 EVT VT = Op.getValueType();
1830 SDValue LHS = Op.getOperand(0);
1831 SDValue RHS = Op.getOperand(1);
1832 MVT IntVT = MVT::i32;
1833 MVT FltVT = MVT::f32;
1834
1835 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1836 if (LHSSignBits < 9)
1837 return SDValue();
1838
1839 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1840 if (RHSSignBits < 9)
1841 return SDValue();
1842
1843 unsigned BitSize = VT.getSizeInBits();
1844 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1845 unsigned DivBits = BitSize - SignBits;
1846 if (Sign)
1847 ++DivBits;
1848
1851
1852 SDValue jq = DAG.getConstant(1, DL, IntVT);
1853
1854 if (Sign) {
1855 // char|short jq = ia ^ ib;
1856 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1857
1858 // jq = jq >> (bitsize - 2)
1859 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1860 DAG.getConstant(BitSize - 2, DL, VT));
1861
1862 // jq = jq | 0x1
1863 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1864 }
1865
1866 // int ia = (int)LHS;
1867 SDValue ia = LHS;
1868
1869 // int ib = (int)RHS;
1870 SDValue ib = RHS;
1871
1872 // float fa = (float)ia;
1873 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1874
1875 // float fb = (float)ib;
1876 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1877
1878 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1879 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1880
1881 // fq = trunc(fq);
1882 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1883
1884 // float fqneg = -fq;
1885 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1886
1887 MachineFunction &MF = DAG.getMachineFunction();
1888
1889 bool UseFmadFtz = false;
1890 if (Subtarget->isGCN()) {
1891 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1892 UseFmadFtz =
1893 MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
1894 }
1895
1896 // float fr = mad(fqneg, fb, fa);
1897 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
1898 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
1899 : (unsigned)ISD::FMAD;
1900 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1901
1902 // int iq = (int)fq;
1903 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1904
1905 // fr = fabs(fr);
1906 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1907
1908 // fb = fabs(fb);
1909 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1910
1911 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1912
1913 // int cv = fr >= fb;
1914 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1915
1916 // jq = (cv ? jq : 0);
1917 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1918
1919 // dst = iq + jq;
1920 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1921
1922 // Rem needs compensation, it's easier to recompute it
1923 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1924 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1925
1926 // Truncate to number of bits this divide really is.
1927 if (Sign) {
1928 SDValue InRegSize
1929 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1930 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1931 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1932 } else {
1933 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1934 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1935 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1936 }
1937
1938 return DAG.getMergeValues({ Div, Rem }, DL);
1939}
1940
1941void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1942 SelectionDAG &DAG,
1943 SmallVectorImpl<SDValue> &Results) const {
1944 SDLoc DL(Op);
1945 EVT VT = Op.getValueType();
1946
1947 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
1948
1949 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1950
1951 SDValue One = DAG.getConstant(1, DL, HalfVT);
1952 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
1953
1954 //HiLo split
1955 SDValue LHS_Lo, LHS_Hi;
1956 SDValue LHS = Op.getOperand(0);
1957 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
1958
1959 SDValue RHS_Lo, RHS_Hi;
1960 SDValue RHS = Op.getOperand(1);
1961 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
1962
1963 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
1964 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
1965
1966 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1967 LHS_Lo, RHS_Lo);
1968
1969 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
1970 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
1971
1972 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
1973 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
1974 return;
1975 }
1976
1977 if (isTypeLegal(MVT::i64)) {
1978 // The algorithm here is based on ideas from "Software Integer Division",
1979 // Tom Rodeheffer, August 2008.
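    // Two rounds of Newton-Raphson refine Rcp64, an estimate of 2^64 / RHS,
    // so that Mulhi3 = (LHS * Rcp64) >> 64 is no more than two below the true
    // quotient; the C3/C6 selects below apply up to two corrective steps.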
1980
1981 MachineFunction &MF = DAG.getMachineFunction();
1982 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1983
1984 // Compute denominator reciprocal.
1985 unsigned FMAD =
1986 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
1987 : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
1988 ? (unsigned)ISD::FMAD
1989 : (unsigned)AMDGPUISD::FMAD_FTZ;
1990
1990
1991 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
1992 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
1993 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
1994 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
1995 Cvt_Lo);
1996 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
1997 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
1998 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
1999 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2000 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2001 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2002 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2003 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2004 Mul1);
2005 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2006 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2007 SDValue Rcp64 = DAG.getBitcast(VT,
2008 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2009
2010 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2011 SDValue One64 = DAG.getConstant(1, DL, VT);
2012 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2013 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2014
2015 // First round of UNR (Unsigned integer Newton-Raphson).
2016 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2017 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2018 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2019 SDValue Mulhi1_Lo, Mulhi1_Hi;
2020 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2021 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2022 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2023 Mulhi1_Lo, Zero1);
2024 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2025 Mulhi1_Hi, Add1_Lo.getValue(1));
2026 SDValue Add1 = DAG.getBitcast(VT,
2027 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2028
2029 // Second round of UNR.
2030 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2031 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2032 SDValue Mulhi2_Lo, Mulhi2_Hi;
2033 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2034 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2035 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2036 Mulhi2_Lo, Zero1);
2037 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2038 Mulhi2_Hi, Add2_Lo.getValue(1));
2039 SDValue Add2 = DAG.getBitcast(VT,
2040 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2041
2042 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2043
2044 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2045
2046 SDValue Mul3_Lo, Mul3_Hi;
2047 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2048 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2049 Mul3_Lo, Zero1);
2050 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2051 Mul3_Hi, Sub1_Lo.getValue(1));
2052 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2053 SDValue Sub1 = DAG.getBitcast(VT,
2054 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2055
2056 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2057 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2058 ISD::SETUGE);
2059 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2060 ISD::SETUGE);
2061 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2062
2063 // TODO: Here and below portions of the code can be enclosed into if/endif.
2064 // Currently control flow is unconditional and we have 4 selects after
2065 // potential endif to substitute PHIs.
2066
2067 // if C3 != 0 ...
2068 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2069 RHS_Lo, Zero1);
2070 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2071 RHS_Hi, Sub1_Lo.getValue(1));
2072 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2073 Zero, Sub2_Lo.getValue(1));
2074 SDValue Sub2 = DAG.getBitcast(VT,
2075 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2076
2077 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2078
2079 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2080 ISD::SETUGE);
2081 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2082 ISD::SETUGE);
2083 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2084
2085 // if (C6 != 0)
2086 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2087
2088 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2089 RHS_Lo, Zero1);
2090 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2091 RHS_Hi, Sub2_Lo.getValue(1));
2092 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2093 Zero, Sub3_Lo.getValue(1));
2094 SDValue Sub3 = DAG.getBitcast(VT,
2095 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2096
2097 // endif C6
2098 // endif C3
2099
2100 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2101 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2102
2103 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2104 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2105
2106 Results.push_back(Div);
2107 Results.push_back(Rem);
2108
2109 return;
2110 }
2111
2112 // r600 expansion.
2113 // Get Speculative values
2114 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2115 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2116
2117 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2118 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2119 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2120
2121 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2122 SDValue DIV_Lo = Zero;
2123
2124 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2125
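  // Restoring long division over the remaining 32 low bits: each iteration
  // shifts one bit of LHS_Lo into REM and subtracts RHS when REM >= RHS.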
2126 for (unsigned i = 0; i < halfBitWidth; ++i) {
2127 const unsigned bitPos = halfBitWidth - i - 1;
2128 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2129 // Get value of high bit
2130 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2131 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2132 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2133
2134 // Shift
2135 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2136 // Add LHS high bit
2137 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2138
2139 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2140 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2141
2142 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2143
2144 // Update REM
2145 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2146 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2147 }
2148
2149 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2150 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2151 Results.push_back(DIV);
2152 Results.push_back(REM);
2153}
2154
2155SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2156 SelectionDAG &DAG) const {
2157 SDLoc DL(Op);
2158 EVT VT = Op.getValueType();
2159
2160 if (VT == MVT::i64) {
2161 SmallVector<SDValue, 2> Results;
2162 LowerUDIVREM64(Op, DAG, Results);
2163 return DAG.getMergeValues(Results, DL);
2164 }
2165
2166 if (VT == MVT::i32) {
2167 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2168 return Res;
2169 }
2170
2171 SDValue X = Op.getOperand(0);
2172 SDValue Y = Op.getOperand(1);
2173
2174 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2175 // algorithm used here.
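  // In short: one Newton-Raphson step on a hardware reciprocal estimate,
  // followed by two conditional quotient/remainder corrections.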
2176
2177 // Initial estimate of inv(y).
2178 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2179
2180 // One round of UNR.
2181 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2182 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2183 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2184 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2185
2186 // Quotient/remainder estimate.
2187 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2188 SDValue R =
2189 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2190
2191 // First quotient/remainder refinement.
2192 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2193 SDValue One = DAG.getConstant(1, DL, VT);
2194 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2195 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2196 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2197 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2198 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2199
2200 // Second quotient/remainder refinement.
2201 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2202 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2203 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2204 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2205 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2206
2207 return DAG.getMergeValues({Q, R}, DL);
2208}
2209
2210SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2211 SelectionDAG &DAG) const {
2212 SDLoc DL(Op);
2213 EVT VT = Op.getValueType();
2214
2215 SDValue LHS = Op.getOperand(0);
2216 SDValue RHS = Op.getOperand(1);
2217
2218 SDValue Zero = DAG.getConstant(0, DL, VT);
2219 SDValue NegOne = DAG.getConstant(-1, DL, VT);
2220
2221 if (VT == MVT::i32) {
2222 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2223 return Res;
2224 }
2225
2226 if (VT == MVT::i64 &&
2227 DAG.ComputeNumSignBits(LHS) > 32 &&
2228 DAG.ComputeNumSignBits(RHS) > 32) {
2229 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2230
2231 //HiLo split
2232 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2233 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2234 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2235 LHS_Lo, RHS_Lo);
2236 SDValue Res[2] = {
2237 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2238 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2239 };
2240 return DAG.getMergeValues(Res, DL);
2241 }
2242
2243 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2244 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2245 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2246 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2247
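  // Compute |LHS| and |RHS| by conditional negation: (x + s) ^ s negates x
  // when the sign mask s is all ones and is a no-op when s is zero; the
  // matching (v ^ s) - s steps below restore the proper signs.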
2248 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2249 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2250
2251 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2252 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2253
2254 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2255 SDValue Rem = Div.getValue(1);
2256
2257 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2258 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2259
2260 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2261 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2262
2263 SDValue Res[2] = {
2264 Div,
2265 Rem
2266 };
2267 return DAG.getMergeValues(Res, DL);
2268}
2269
2270// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
2271SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2272 SDLoc SL(Op);
2273 EVT VT = Op.getValueType();
2274 auto Flags = Op->getFlags();
2275 SDValue X = Op.getOperand(0);
2276 SDValue Y = Op.getOperand(1);
2277
2278 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2279 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2280 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2281 // TODO: For f32 use FMAD instead if !hasFastFMA32?
2282 return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2283}
2284
2285SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2286 SDLoc SL(Op);
2287 SDValue Src = Op.getOperand(0);
2288
2289 // result = trunc(src)
2290 // if (src > 0.0 && src != result)
2291 // result += 1.0
2292
2293 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2294
2295 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2296 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2297
2298 EVT SetCCVT =
2299 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2300
2301 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2302 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2303 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2304
2305 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2306 // TODO: Should this propagate fast-math-flags?
2307 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2308}
2309
2310static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2311 SelectionDAG &DAG) {
2312 const unsigned FractBits = 52;
2313 const unsigned ExpBits = 11;
2314
2315 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2316 Hi,
2317 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2318 DAG.getConstant(ExpBits, SL, MVT::i32));
2319 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2320 DAG.getConstant(1023, SL, MVT::i32));
2321
2322 return Exp;
2323}
2324
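// f64 trunc: clear the fraction bits that lie below the exponent. An exponent
// below zero leaves only the sign bit (+/-0); an exponent above 51 means the
// value is already an integer and is returned unchanged.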
2325SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2326 SDLoc SL(Op);
2327 SDValue Src = Op.getOperand(0);
2328
2329 assert(Op.getValueType() == MVT::f64);
2330
2331 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2332
2333 // Extract the upper half, since this is where we will find the sign and
2334 // exponent.
2335 SDValue Hi = getHiHalf64(Src, DAG);
2336
2337 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2338
2339 const unsigned FractBits = 52;
2340
2341 // Extract the sign bit.
2342 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2343 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2344
2345 // Extend back to 64-bits.
2346 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2347 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2348
2349 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2350 const SDValue FractMask
2351 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2352
2353 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2354 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2355 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2356
2357 EVT SetCCVT =
2358 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2359
2360 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2361
2362 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2363 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2364
2365 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2366 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2367
2368 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2369}
2370
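// Round-to-nearest-even via the 2^52 trick: adding and then subtracting
// copysign(2^52, x) discards the fractional bits in the FP adder's rounding.
// Magnitudes above 0x1.fffffffffffffp+51 are already integral and are
// returned unchanged.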
2371SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2372 SDLoc SL(Op);
2373 SDValue Src = Op.getOperand(0);
2374
2375 assert(Op.getValueType() == MVT::f64);
2376
2377 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2378 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2379 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2380
2381 // TODO: Should this propagate fast-math-flags?
2382
2383 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2384 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2385
2386 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2387
2388 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2389 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2390
2391 EVT SetCCVT =
2392 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2393 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2394
2395 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2396}
2397
2398SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
2399 // FNEARBYINT and FRINT are the same, except in their handling of FP
2400 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2401 // rint, so just treat them as equivalent.
2402 return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
2403}
2404
2405SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
2406 SelectionDAG &DAG) const {
2407 auto VT = Op.getValueType();
2408 auto Arg = Op.getOperand(0u);
2409 return DAG.getNode(ISD::FRINT, SDLoc(Op), VT, Arg);
2410}
2411
2412// XXX - May require not supporting f32 denormals?
2413
2414// Don't handle v2f16. The extra instructions to scalarize and repack around the
2415// compare and vselect end up producing worse code than scalarizing the whole
2416// operation.
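// round(x) is computed as trunc(x) + copysign(|x - trunc(x)| >= 0.5 ? 1 : 0, x).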
2417SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2418 SDLoc SL(Op);
2419 SDValue X = Op.getOperand(0);
2420 EVT VT = Op.getValueType();
2421
2422 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2423
2424 // TODO: Should this propagate fast-math-flags?
2425
2426 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2427
2428 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2429
2430 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2431 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2432
2433 EVT SetCCVT =
2434 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2435
2436 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2437 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2438 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2439
2440 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2441 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2442}
2443
2444SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2445 SDLoc SL(Op);
2446 SDValue Src = Op.getOperand(0);
2447
2448 // result = trunc(src);
2449 // if (src < 0.0 && src != result)
2450 // result += -1.0.
2451
2452 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2453
2454 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2455 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2456
2457 EVT SetCCVT =
2458 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2459
2460 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2461 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2462 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2463
2464 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2465 // TODO: Should this propagate fast-math-flags?
2466 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2467}
2468
2469/// Return true if it's known that \p Src can never be an f32 denormal value.
2470static bool valueIsKnownNeverF32Denorm(SDValue Src) {
2471 switch (Src.getOpcode()) {
2472 case ISD::FP_EXTEND:
2473 return Src.getOperand(0).getValueType() == MVT::f16;
2474 case ISD::FP16_TO_FP:
2475 case ISD::FFREXP:
2476 return true;
2477 case ISD::INTRINSIC_WO_CHAIN: {
2478 unsigned IntrinsicID =
2479 cast<ConstantSDNode>(Src.getOperand(0))->getZExtValue();
2480 switch (IntrinsicID) {
2481 case Intrinsic::amdgcn_frexp_mant:
2482 return true;
2483 default:
2484 return false;
2485 }
2486 }
2487 default:
2488 return false;
2489 }
2490
2491 llvm_unreachable("covered opcode switch");
2492}
2493
2494static bool allowApproxFunc(const SelectionDAG &DAG,
2495 SDNodeFlags Flags) {
2496 if (Flags.hasApproximateFuncs())
2497 return true;
2498 auto &Options = DAG.getTarget().Options;
2499 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
2500}
2501
2502static bool needsDenormHandlingF32(const SelectionDAG &DAG,
2503 SDValue Src,
2504 SDNodeFlags Flags) {
2505 return !valueIsKnownNeverF32Denorm(Src) &&
2506 DAG.getMachineFunction()
2507 .getDenormalMode(APFloat::IEEEsingle())
2508 .Output != DenormalMode::PreserveSign;
2509}
2510
2511SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
2512 SDValue Src,
2513 SDNodeFlags Flags) const {
2514 SDLoc SL(Src);
2515 EVT VT = Src.getValueType();
2516 const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
2517 SDValue SmallestNormal =
2518 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2519
2520 // Want to scale denormals up, but negatives and 0 work just as well on the
2521 // scaled path.
2522 SDValue IsLtSmallestNormal = DAG.getSetCC(
2523 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2524 SmallestNormal, ISD::SETOLT);
2525
2526 return IsLtSmallestNormal;
2527}
2528
2529SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
2530 SDNodeFlags Flags) const {
2531 SDLoc SL(Src);
2532 EVT VT = Src.getValueType();
2533 const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
2534 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2535
2536 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2537 SDValue IsFinite = DAG.getSetCC(
2538 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2539 Inf, ISD::SETOLT);
2540 return IsFinite;
2541}
2542
2543/// If denormal handling is required return the scaled input to FLOG2, and the
2544/// check for denormal range. Otherwise, return null values.
2545std::pair<SDValue, SDValue>
2546AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc &SL,
2547 SDValue Src, SDNodeFlags Flags) const {
2548 if (!needsDenormHandlingF32(DAG, Src, Flags))
2549 return {};
2550
2551 MVT VT = MVT::f32;
2552 const fltSemantics &Semantics = APFloat::IEEEsingle();
2553 SDValue SmallestNormal =
2554 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2555
2556 SDValue IsLtSmallestNormal = DAG.getSetCC(
2557 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2558 SmallestNormal, ISD::SETOLT);
2559
2560 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2561 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2562 SDValue ScaleFactor =
2563 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2564
2565 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2566 return {ScaledInput, IsLtSmallestNormal};
2567}
2568
2569SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
2570 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2571 // If we have to handle denormals, scale up the input and adjust the result.
2572
2573 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2574 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
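  // (log2(x * 2^32) == log2(x) + 32, so subtracting 32 undoes the scaling.)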
2575
2576 SDLoc SL(Op);
2577 EVT VT = Op.getValueType();
2578 SDValue Src = Op.getOperand(0);
2579 SDNodeFlags Flags = Op->getFlags();
2580
2581 if (VT == MVT::f16) {
2582 // Nothing in half is a denormal when promoted to f32.
2583 assert(!Subtarget->has16BitInsts());
2584 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2585 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2586 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2587 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2588 }
2589
2590 auto [ScaledInput, IsLtSmallestNormal] =
2591 getScaledLogInput(DAG, SL, Src, Flags);
2592 if (!ScaledInput)
2593 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2594
2595 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2596
2597 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2598 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2599 SDValue ResultOffset =
2600 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2601 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2602}
2603
2604static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2605 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2606 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2607 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2608}
2609
2610SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
2611 SelectionDAG &DAG) const {
2612 SDValue X = Op.getOperand(0);
2613 EVT VT = Op.getValueType();
2614 SDNodeFlags Flags = Op->getFlags();
2615 SDLoc DL(Op);
2616
2617 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2618 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2619
2620 const auto &Options = getTargetMachine().Options;
2621 if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
2622 Options.ApproxFuncFPMath || Options.UnsafeFPMath) {
2623
2624 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2625 // Log and multiply in f32 is good enough for f16.
2626 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2627 }
2628
2629 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2630 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2631 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2632 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2633 }
2634
2635 return Lowered;
2636 }
2637
2638 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
2639 if (ScaledInput)
2640 X = ScaledInput;
2641
2642 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2643
2644 SDValue R;
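  // Multiply the hardware log2 result Y by ln(2) (for log) or log10(2) (for
  // log10), each split into a head and a tail constant so the product carries
  // extra precision beyond a single f32 multiply.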
2645 if (Subtarget->hasFastFMAF32()) {
2646 // c+cc are ln(2)/ln(10) to more than 49 bits
2647 const float c_log10 = 0x1.344134p-2f;
2648 const float cc_log10 = 0x1.09f79ep-26f;
2649
2650 // c + cc is ln(2) to more than 49 bits
2651 const float c_log = 0x1.62e42ep-1f;
2652 const float cc_log = 0x1.efa39ep-25f;
2653
2654 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2655 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2656
2657 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2658 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2659 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2660 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2661 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2662 } else {
2663 // ch+ct is ln(2)/ln(10) to more than 36 bits
2664 const float ch_log10 = 0x1.344000p-2f;
2665 const float ct_log10 = 0x1.3509f6p-18f;
2666
2667 // ch + ct is ln(2) to more than 36 bits
2668 const float ch_log = 0x1.62e000p-1f;
2669 const float ct_log = 0x1.0bfbe8p-15f;
2670
2671 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2672 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2673
2674 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2675 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2676 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2677 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2678 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2679
2680 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2681 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2682 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
2683 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2684 }
2685
2686 const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
2687 (Flags.hasNoInfs() || Options.NoInfsFPMath);
2688
2689 // TODO: Check if known finite from source value.
2690 if (!IsFiniteOnly) {
2691 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2692 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2693 }
2694
2695 if (IsScaled) {
2696 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2697 SDValue ShiftK =
2698 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2699 SDValue Shift =
2700 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2701 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2702 }
2703
2704 return R;
2705}
2706
2707SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const {
2708 return LowerFLOGCommon(Op, DAG);
2709}
2710
2711// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a
2712// promoted f16 operation.
2713SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
2714 SelectionDAG &DAG, bool IsLog10,
2715 SDNodeFlags Flags) const {
2716 EVT VT = Src.getValueType();
2717 unsigned LogOp =
2718 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2719
2720 double Log2BaseInverted =
2721 IsLog10 ? numbers::ln2 / numbers::ln10 : 1.0;
2722
2723 if (VT == MVT::f32) {
2724 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2725 if (ScaledInput) {
2726 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2727 SDValue ScaledResultOffset =
2728 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2729
2730 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2731
2732 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2733 ScaledResultOffset, Zero, Flags);
2734
2735 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2736
2737 if (Subtarget->hasFastFMAF32())
2738 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2739 Flags);
2740 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
2741 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2742 }
2743 }
2744
2745 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2746 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2747
2748 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2749 Flags);
2750}
2751
2752SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
2753 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2754 // If we have to handle denormals, scale up the input and adjust the result.
2755
2756 SDLoc SL(Op);
2757 EVT VT = Op.getValueType();
2758 SDValue Src = Op.getOperand(0);
2759 SDNodeFlags Flags = Op->getFlags();
2760
2761 if (VT == MVT::f16) {
2762 // Nothing in half is a denormal when promoted to f32.
2763 assert(!Subtarget->has16BitInsts());
2764 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2765 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
2766 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2767 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2768 }
2769
2770 assert(VT == MVT::f32);
2771
2772 if (!needsDenormHandlingF32(DAG, Src, Flags))
2773 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
2774
2775 // bool needs_scaling = x < -0x1.f80000p+6f;
2776 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
2777
2778 // -nextafter(128.0, -1)
2779 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
2780
2781 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2782
2783 SDValue NeedsScaling =
2784 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
2785
2786 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2787 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2788
2789 SDValue AddOffset =
2790 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
2791
2792 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
2793 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
2794
2795 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
2796 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2797 SDValue ResultScale =
2798 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
2799
2800 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
2801}
2802
2803SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
2804 SelectionDAG &DAG,
2805 SDNodeFlags Flags) const {
2806 EVT VT = X.getValueType();
2807 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
2808
2809 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2810 // exp2(M_LOG2E_F * f);
2811 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags);
2812 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2813 : (unsigned)ISD::FEXP2,
2814 SL, VT, Mul, Flags);
2815 }
2816
2817 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2818
2819 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
2820 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2821
2822 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2823
2824 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2825
2826 SDValue AdjustedX =
2827 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2828
2829 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
2830
2831 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
2832
2833 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
2834 SDValue AdjustedResult =
2835 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
2836
2837 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
2838 Flags);
2839}
2840
2841SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2842 EVT VT = Op.getValueType();
2843 SDLoc SL(Op);
2844 SDValue X = Op.getOperand(0);
2845 SDNodeFlags Flags = Op->getFlags();
2846 const bool IsExp10 = false; // TODO: For some reason exp10 is missing
2847
2848 if (VT.getScalarType() == MVT::f16) {
2849 // v_exp_f16 (fmul x, log2e)
2850 if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
2851 return lowerFEXPUnsafe(X, SL, DAG, Flags);
2852
2853 if (VT.isVector())
2854 return SDValue();
2855
2856 // exp(f16 x) ->
2857 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
2858
2859 // Nothing in half is a denormal when promoted to f32.
2860 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
2861 SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags);
2862 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
2863 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2864 }
2865
2866 assert(VT == MVT::f32);
2867
2868 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
2869 // library behavior. Also, is known-not-daz source sufficient?
2870 if (allowApproxFunc(DAG, Flags)) {
2871 assert(!IsExp10 && "todo exp10 support");
2872 return lowerFEXPUnsafe(X, SL, DAG, Flags);
2873 }
2874
2875 // Algorithm:
2876 //
2877 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
2878 //
2879 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
2880 // n = 64*m + j, 0 <= j < 64
2881 //
2882 // e^x = 2^((64*m + j + f)/64)
2883 // = (2^m) * (2^(j/64)) * 2^(f/64)
2884 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
2885 //
2886 // f = x*(64/ln(2)) - n
2887 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
2888 //
2889 // e^x = (2^m) * (2^(j/64)) * e^r
2890 //
2891 // (2^(j/64)) is precomputed
2892 //
2893 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
2894 // e^r = 1 + q
2895 //
2896 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
2897 //
2898 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
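  // The code below uses the same idea in base 2: PH + PL approximates
  // x * log2(e) in extended precision, E = rint(PH) is the integer part,
  // exp2 is applied to the remaining fraction, and ldexp scales by 2^E.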
2899 SDNodeFlags FlagsNoContract = Flags;
2900 FlagsNoContract.setAllowContract(false);
2901
2902 SDValue PH, PL;
2903 if (Subtarget->hasFastFMAF32()) {
2904 const float c_exp = numbers::log2ef;
2905 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
2906 const float c_exp10 = 0x1.a934f0p+1f;
2907 const float cc_exp10 = 0x1.2f346ep-24f;
2908
2909 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
2910 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
2911
2912 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
2913 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
2914 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
2915 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
2916 } else {
2917 const float ch_exp = 0x1.714000p+0f;
2918 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
2919
2920 const float ch_exp10 = 0x1.a92000p+1f;
2921 const float cl_exp10 = 0x1.4f0978p-11f;
2922
2923 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
2924 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
2925
2926 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
2927 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
2928 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
2929 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
2930 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
2931
2932 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
2933
2934 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
2935 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
2936 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
2937 }
2938
2939 SDValue E = DAG.getNode(ISD::FRINT, SL, VT, PH, Flags);
2940
2941 // It is unsafe to contract this fsub into the PH multiply.
2942 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
2943
2944 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
2945 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
2946 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
2947
2948 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
2949
2950 SDValue UnderflowCheckConst =
2951 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
2952
2953 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2954 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2955 SDValue Underflow =
2956 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
2957
2958 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
2959 const auto &Options = getTargetMachine().Options;
2960
2961 if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
2962 SDValue OverflowCheckConst =
2963 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
2964 SDValue Overflow =
2965 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
2966 SDValue Inf =
2967 DAG.getConstantFP(APFloat::getInf(APFloat::IEEEsingle()), SL, VT);
2968 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
2969 }
2970
2971 return R;
2972}
2973
2974static bool isCtlzOpc(unsigned Opc) {
2975 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2976}
2977
2978static bool isCttzOpc(unsigned Opc) {
2979 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
2980}
2981
2982SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
2983 SDLoc SL(Op);
2984 SDValue Src = Op.getOperand(0);
2985
2986 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
2987 bool Ctlz = isCtlzOpc(Op.getOpcode());
2988 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
2989
2990 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
2991 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
2992
2993 if (Src.getValueType() == MVT::i32) {
2994 // (ctlz hi:lo) -> (umin (ffbh src), 32)
2995 // (cttz hi:lo) -> (umin (ffbl src), 32)
2996 // (ctlz_zero_undef src) -> (ffbh src)
2997 // (cttz_zero_undef src) -> (ffbl src)
2998 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
2999 if (!ZeroUndef) {
3000 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3001 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const32);
3002 }
3003 return NewOpr;
3004 }
3005
3006 SDValue Lo, Hi;
3007 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3008
3009 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3010 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3011
3012 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3013 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3014 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3015 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
3016
3017 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3018 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3019 if (Ctlz)
3020 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3021 else
3022 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3023
3024 SDValue NewOpr;
3025 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3026 if (!ZeroUndef) {
3027 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3028 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3029 }
3030
3031 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3032}
3033
3034SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
3035 bool Signed) const {
3036 // The regular method converting a 64-bit integer to float roughly consists of
3037 // 2 steps: normalization and rounding. In fact, after normalization, the
3038 // conversion from a 64-bit integer to a float is essentially the same as the
3039 // one from a 32-bit integer. The only difference is that it has more
3040 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3041 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3042 // converted into the correct float number. The basic steps for the unsigned
3043 // conversion are illustrated in the following pseudo code:
3044 //
3045 // f32 uitofp(i64 u) {
3046 // i32 hi, lo = split(u);
3047 // // Only count the leading zeros in hi as we have native support of the
3048 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3049 // // reduced to a 32-bit one automatically.
3050 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3051 // u <<= shamt;
3052 // hi, lo = split(u);
3053 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3054 // // convert it as a 32-bit integer and scale the result back.
3055 // return uitofp(hi) * 2^(32 - shamt);
3056 // }
3057 //
3058 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3059 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
3060 // converted instead, followed by negation based on its sign bit.
3061
3062 SDLoc SL(Op);
3063 SDValue Src = Op.getOperand(0);
3064
3065 SDValue Lo, Hi;
3066 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3067 SDValue Sign;
3068 SDValue ShAmt;
3069 if (Signed && Subtarget->isGCN()) {
3070 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3071 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3072 // account. That is, the maximal shift is
3073 // - 32 if Lo and Hi have opposite signs;
3074 // - 33 if Lo and Hi have the same sign.
3075 //
3076 // Or, MaxShAmt = 33 + OppositeSign, where
3077 //
3078 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3079 // - -1 if Lo and Hi have opposite signs; and
3080 // - 0 otherwise.
3081 //
3082 // All in all, ShAmt is calculated as
3083 //
3084 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3085 //
3086 // or
3087 //
3088 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3089 //
3090 // to reduce the critical path.
3091 SDValue OppositeSign = DAG.getNode(
3092 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3093 DAG.getConstant(31, SL, MVT::i32));
3094 SDValue MaxShAmt =
3095 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3096 OppositeSign);
3097 // Count the leading sign bits.
3098 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3099 // Different from unsigned conversion, the shift should be one bit less to
3100 // preserve the sign bit.
3101 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3102 DAG.getConstant(1, SL, MVT::i32));
3103 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3104 } else {
3105 if (Signed) {
3106 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3107 // absolute value first.
3108 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3109 DAG.getConstant(63, SL, MVT::i64));
3110 SDValue Abs =
3111 DAG.getNode(ISD::XOR, SL, MVT::i64,
3112 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3113 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3114 }
3115 // Count the leading zeros.
3116 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3117 // The shift amount for signed integers is [0, 32].
3118 }
3119 // Normalize the given 64-bit integer.
3120 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3121 // Split it again.
3122 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3123 // Calculate the adjust bit for rounding.
3124 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3125 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3126 DAG.getConstant(1, SL, MVT::i32), Lo);
3127 // Get the 32-bit normalized integer.
3128 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3129 // Convert the normalized 32-bit integer into f32.
3130 unsigned Opc =
3131 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3132 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3133
3134 // Finally, need to scale back the converted floating number as the original
3135 // 64-bit integer is converted as a 32-bit one.
3136 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3137 ShAmt);
3138 // On GCN, use LDEXP directly.
3139 if (Subtarget->isGCN())
3140 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3141
3142 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3143 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3144 // exponent is enough to avoid overflowing into the sign bit.
3145 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3146 DAG.getConstant(23, SL, MVT::i32));
3147 SDValue IVal =
3148 DAG.getNode(ISD::ADD, SL, MVT::i32,
3149 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3150 if (Signed) {
3151 // Set the sign bit.
3152 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3153 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3154 DAG.getConstant(31, SL, MVT::i32));
3155 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3156 }
3157 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3158}
3159
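// i64 -> f64 can be done exactly in two halves: convert Hi and Lo separately
// and combine them as (f64)Hi * 2^32 + (f64)Lo via ldexp.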
3160SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3161 bool Signed) const {
3162 SDLoc SL(Op);
3163 SDValue Src = Op.getOperand(0);
3164
3165 SDValue Lo, Hi;
3166 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3167
3168 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3169 SL, MVT::f64, Hi);
3170
3171 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3172
3173 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3174 DAG.getConstant(32, SL, MVT::i32));
3175 // TODO: Should this propagate fast-math-flags?
3176 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3177}
3178
3179SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3180 SelectionDAG &DAG) const {
3181 // TODO: Factor out code common with LowerSINT_TO_FP.
3182 EVT DestVT = Op.getValueType();
3183 SDValue Src = Op.getOperand(0);
3184 EVT SrcVT = Src.getValueType();
3185
3186 if (SrcVT == MVT::i16) {
3187 if (DestVT == MVT::f16)
3188 return Op;
3189 SDLoc DL(Op);
3190
3191 // Promote src to i32
3192 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3193 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3194 }
3195
3196 assert(SrcVT == MVT::i64 && "operation should be legal");
3197
3198 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3199 SDLoc DL(Op);
3200
3201 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3202 SDValue FPRoundFlag =
3203 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3204 SDValue FPRound =
3205 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3206
3207 return FPRound;
3208 }
3209
3210 if (DestVT == MVT::f32)
3211 return LowerINT_TO_FP32(Op, DAG, false);
3212
3213 assert(DestVT == MVT::f64);
3214 return LowerINT_TO_FP64(Op, DAG, false);
3215}
3216
3217SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3218 SelectionDAG &DAG) const {
3219 EVT DestVT = Op.getValueType();
3220
3221 SDValue Src = Op.getOperand(0);
3222 EVT SrcVT = Src.getValueType();
3223
3224 if (SrcVT == MVT::i16) {
3225 if (DestVT == MVT::f16)
3226 return Op;
3227
3228 SDLoc DL(Op);
3229 // Promote src to i32
3230 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3231 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3232 }
3233
3234 assert(SrcVT == MVT::i64 && "operation should be legal");
3235
3236 // TODO: Factor out code common with LowerUINT_TO_FP.
3237
3238 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3239 SDLoc DL(Op);
3240 SDValue Src = Op.getOperand(0);
3241
3242 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3243 SDValue FPRoundFlag =
3244 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3245 SDValue FPRound =
3246 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3247
3248 return FPRound;
3249 }
3250
3251 if (DestVT == MVT::f32)
3252 return LowerINT_TO_FP32(Op, DAG, true);
3253
3254 assert(DestVT == MVT::f64);
3255 return LowerINT_TO_FP64(Op, DAG, true);
3256}
3257
3258SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3259 bool Signed) const {
3260 SDLoc SL(Op);
3261
3262 SDValue Src = Op.getOperand(0);
3263 EVT SrcVT = Src.getValueType();
3264
3265 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3266
3267 // The basic idea of converting a floating point number into a pair of 32-bit
3268 // integers is illustrated as follows:
3269 //
3270 // tf := trunc(val);
3271 // hif := floor(tf * 2^-32);
3272 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3273 // hi := fptoi(hif);
3274 // lo := fptoi(lof);
3275 //
3276 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3277 SDValue Sign;
3278 if (Signed && SrcVT == MVT::f32) {
3279 // However, a 32-bit floating point number has only 23 bits mantissa and
3280 // it's not enough to hold all the significant bits of `lof` if val is
3281 // negative. To avoid the loss of precision, we need to take the absolute
3282 // value after truncating and flip the result back based on the original
3283 // signedness.
3284 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3285 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3286 DAG.getConstant(31, SL, MVT::i32));
3287 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3288 }
3289
3290 SDValue K0, K1;
3291 if (SrcVT == MVT::f64) {
3292 K0 = DAG.getConstantFP(
3293 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3294 SrcVT);
3295 K1 = DAG.getConstantFP(
3296 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3297 SrcVT);
3298 } else {
3299 K0 = DAG.getConstantFP(
3300 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3301 K1 = DAG.getConstantFP(
3302 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3303 }
3304 // TODO: Should this propagate fast-math-flags?
3305 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3306
3307 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3308
3309 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3310
3311 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3312 : ISD::FP_TO_UINT,
3313 SL, MVT::i32, FloorMul);
3314 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3315
3316 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3317 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3318
3319 if (Signed && SrcVT == MVT::f32) {
3320 assert(Sign);
3321 // Flip the result based on the signedness, which is either all 0s or 1s.
3322 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3323 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3324 // r := xor(r, sign) - sign;
3325 Result =
3326 DAG.getNode(ISD::SUB, SL, MVT::i64,
3327 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3328 }
3329
3330 return Result;
3331}
3332
3333SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3334 SDLoc DL(Op);
3335 SDValue N0 = Op.getOperand(0);
3336
3337 // Convert to target node to get known bits
3338 if (N0.getValueType() == MVT::f32)
3339 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3340
3341 if (getTargetMachine().Options.UnsafeFPMath) {
3342 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3343 return SDValue();
3344 }
3345
3346 assert(N0.getSimpleValueType() == MVT::f64);
3347
3348 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
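  // The bit twiddling below builds the f16 directly from the f64 bits:
  // I covers NaN/Inf inputs, the B/D path produces a rounded subnormal when
  // the rebased exponent is below 1, V0/V1 implement round-to-nearest-even,
  // and exponents above 30 overflow to infinity (0x7c00).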
3349 const unsigned ExpMask = 0x7ff;
3350 const unsigned ExpBiasf64 = 1023;
3351 const unsigned ExpBiasf16 = 15;
3352 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3353 SDValue One = DAG.getConstant(1, DL, MVT::i32);
3354 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
3355 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3356 DAG.getConstant(32, DL, MVT::i64));
3357 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3358 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3359 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3360 DAG.getConstant(20, DL, MVT::i64));
3361 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3362 DAG.getConstant(ExpMask, DL, MVT::i32));
3363 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3364 // add the f16 bias (15) to get the biased exponent for the f16 format.
3365 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3366 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
3367
3368 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3369 DAG.getConstant(8, DL, MVT::i32));
3370 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3371 DAG.getConstant(0xffe, DL, MVT::i32));
3372
3373 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3374 DAG.getConstant(0x1ff, DL, MVT::i32));
3375 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3376
3377 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3378 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3379
3380 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3381 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3382 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3383 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
3384
3385 // N = M | (E << 12);
3386 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3387 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3388 DAG.getConstant(12, DL, MVT::i32)));
3389
3390 // B = clamp(1-E, 0, 13);
3391 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3392 One, E);
3393 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3394 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3395 DAG.getConstant(13, DL, MVT::i32));
3396
3397 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3398 DAG.getConstant(0x1000, DL, MVT::i32));
3399
3400 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3401 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3402 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3403 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3404
3405 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3406 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3407 DAG.getConstant(0x7, DL, MVT::i32));
3408 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3409 DAG.getConstant(2, DL, MVT::i32));
3410 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3411 One, Zero, ISD::SETEQ);
3412 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3413 One, Zero, ISD::SETGT);
3414 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3415 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3416
3417 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3418 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3419 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3420 I, V, ISD::SETEQ);
3421
3422 // Extract the sign bit.
3423 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3424 DAG.getConstant(16, DL, MVT::i32));
3425 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3426 DAG.getConstant(0x8000, DL, MVT::i32));
3427
3428 V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3429 return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
3430}
3431
3432SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
3433 SelectionDAG &DAG) const {
3434 SDValue Src = Op.getOperand(0);
3435 unsigned OpOpcode = Op.getOpcode();
3436 EVT SrcVT = Src.getValueType();
3437 EVT DestVT = Op.getValueType();
3438
3439 // Will be selected natively
3440 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3441 return Op;
3442
3443 // Promote i16 results: convert to i32 first, then truncate to i16.
3444 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3445 SDLoc DL(Op);
3446
3447 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3448 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3449 }
3450
3451 if (SrcVT == MVT::f16 ||
3452 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3453 SDLoc DL(Op);
3454
3455 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3456 unsigned Ext =
3457 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3458 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3459 }
3460
3461 if (DestVT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64))
3462 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3463
3464 return SDValue();
3465}
3466
3467SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3468 SelectionDAG &DAG) const {
3469 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3470 MVT VT = Op.getSimpleValueType();
3471 MVT ScalarVT = VT.getScalarType();
3472
3473 assert(VT.isVector());
3474
3475 SDValue Src = Op.getOperand(0);
3476 SDLoc DL(Op);
3477
3478 // TODO: Don't scalarize on Evergreen?
3479 unsigned NElts = VT.getVectorNumElements();
3480 SmallVector<SDValue, 8> Args;
3481 DAG.ExtractVectorElements(Src, Args, 0, NElts);
3482
3483 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3484 for (unsigned I = 0; I < NElts; ++I)
3485 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
3486
3487 return DAG.getBuildVector(VT, DL, Args);
3488}
3489
3490//===----------------------------------------------------------------------===//
3491// Custom DAG optimizations
3492//===----------------------------------------------------------------------===//
3493
3494static bool isU24(SDValue Op, SelectionDAG &DAG) {
3495 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3496}
3497
3498static bool isI24(SDValue Op, SelectionDAG &DAG) {
3499 EVT VT = Op.getValueType();
3500 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3501 // as unsigned 24-bit values.
3502 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3503}
3504
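// isU24/isI24 hold when a value provably fits in 24 bits, e.g. (and x, 0xffffff)
// satisfies isU24 and (sext i16:x) satisfies isI24, allowing a 32-bit multiply
// to be narrowed to the MUL_U24/MUL_I24 nodes handled below.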
3505static SDValue simplifyMul24(SDNode *Node24,
3506 TargetLowering::DAGCombinerInfo &DCI) {
3507 SelectionDAG &DAG = DCI.DAG;
3508 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3509 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3510
3511 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
3512 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
3513 unsigned NewOpcode = Node24->getOpcode();
3514 if (IsIntrin) {
3515 unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
3516 switch (IID) {
3517 case Intrinsic::amdgcn_mul_i24:
3518 NewOpcode = AMDGPUISD::MUL_I24;
3519 break;
3520 case Intrinsic::amdgcn_mul_u24:
3521 NewOpcode = AMDGPUISD::MUL_U24;
3522 break;
3523 case Intrinsic::amdgcn_mulhi_i24:
3524 NewOpcode = AMDGPUISD::MULHI_I24;
3525 break;
3526 case Intrinsic::amdgcn_mulhi_u24:
3527 NewOpcode = AMDGPUISD::MULHI_U24;
3528 break;
3529 default:
3530 llvm_unreachable("Expected 24-bit mul intrinsic");
3531 }
3532 }
3533
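  // A 24-bit multiply only reads the low 24 bits of each operand, so bits
  // above bit 23 are never demanded from LHS or RHS.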
3534 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
3535
3536 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3537 // the operands to have other uses, but will only perform simplifications that
3538 // involve bypassing some nodes for this user.
3539 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
3540 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
3541 if (DemandedLHS || DemandedRHS)
3542 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
3543 DemandedLHS ? DemandedLHS : LHS,
3544 DemandedRHS ? DemandedRHS : RHS);
3545
3546 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3547 // operands if this node is the only user.
3548 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
3549 return SDValue(Node24, 0);
3550 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
3551 return SDValue(Node24, 0);
3552
3553 return SDValue();
3554}
3555
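// Fold a bit-field extract of a constant. For example, a signed extract with
// Offset = 8 and Width = 8 from 0x0000ff00 shifts the field to the top
// (0xff000000) and arithmetic-shifts it back down, yielding -1.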
3556template <typename IntTy>
3557static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3558 uint32_t Width, const SDLoc &DL) {
3559 if (Width + Offset < 32) {
3560 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3561 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3562 return DAG.getConstant(Result, DL, MVT::i32);
3563 }
3564
3565 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3566}
3567
3568static bool hasVolatileUser(SDNode *Val) {
3569 for (SDNode *U : Val->uses()) {
3570 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
3571 if (M->isVolatile())
3572 return true;
3573 }
3574 }
3575
3576 return false;
3577}
3578
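// For example, a v4i8 access is worth rewriting through i32, while an i48
// access (6 bytes) is left alone because it is not a multiple of 32 bits.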
3579bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3580 // i32 vectors are the canonical memory type.
3581 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3582 return false;
3583
3584 if (!VT.isByteSized())
3585 return false;
3586
3587 unsigned Size = VT.getStoreSize();
3588
3589 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3590 return false;
3591
3592 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3593 return false;
3594
3595 return true;
3596}
3597
3598// Replace a load of an illegal type with a load of a friendlier type followed
3599// by a bitcast back to the original type.
3600SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3601 DAGCombinerInfo &DCI) const {
3602 if (!DCI.isBeforeLegalize())
3603 return SDValue();
3604
3605 LoadSDNode *LN = cast<LoadSDNode>(N);
3606 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
3607 return SDValue();
3608
3609 SDLoc SL(N);
3610 SelectionDAG &DAG = DCI.DAG;
3611 EVT VT = LN->getMemoryVT();
3612
3613 unsigned Size = VT.getStoreSize();
3614 Align Alignment = LN->getAlign();
3615 if (Alignment < Size && isTypeLegal(VT)) {
3616 unsigned IsFast;
3617 unsigned AS = LN->getAddressSpace();
3618
3619 // Expand unaligned loads earlier than legalization. Due to visitation order
3620 // problems during legalization, the emitted instructions to pack and unpack
3621 // the bytes again are not eliminated in the case of an unaligned copy.
3622 if (!allowsMisalignedMemoryAccesses(
3623 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3624 if (VT.isVector())
3625 return SplitVectorLoad(SDValue(LN, 0), DAG);
3626
3627 SDValue Ops[2];
3628 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3629
3630 return DAG.getMergeValues(Ops, SDLoc(N));
3631 }
3632
3633 if (!IsFast)
3634 return SDValue();
3635 }
3636
3637 if (!shouldCombineMemoryType(VT))
3638 return SDValue();
3639
3640 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3641
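  // Load the equivalent 32-bit based type and bitcast back to the original
  // type; CombineTo below also rewires chain users to the new load.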
3642 SDValue NewLoad
3643 = DAG.getLoad(NewVT, SL, LN->getChain(),
3644 LN->getBasePtr(), LN->getMemOperand());
3645
3646 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3647 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3648 return SDValue(N, 0);
3649}
3650
3651// Replace store of an illegal type with a store of a bitcast to a friendlier
3652// type.
3653SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3654 DAGCombinerInfo &DCI) const {
3655 if (!DCI.isBeforeLegalize())
3656 return SDValue();
3657
3658 StoreSDNode *SN = cast<StoreSDNode>(N);
3659 if (!SN->isSimple() || !ISD::isNormalStore(SN))
3660 return SDValue();
3661
3662 EVT VT = SN->getMemoryVT();
3663 unsigned Size = VT.getStoreSize();
3664
3665 SDLoc SL(N);
3666 SelectionDAG &DAG = DCI.DAG;
3667 Align Alignment = SN->getAlign();
3668 if (Alignment < Size && isTypeLegal(VT)) {
3669 unsigned IsFast;
3670 unsigned AS = SN->getAddressSpace();
3671
3672 // Expand unaligned stores earlier than legalization. Due to visitation
3673 // order problems during legalization, the emitted instructions to pack and
3674 // unpack the bytes again are not eliminated in the case of an unaligned
3675 // copy.
3676 if (!allowsMisalignedMemoryAccesses(
3677 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3678 if (VT.isVector())
3679 return SplitVectorStore(SDValue(SN, 0), DAG);
3680
3681 return expandUnalignedStore(SN, DAG);
3682 }
3683
3684 if (!IsFast)
3685 return SDValue();
3686 }
3687
3688 if (!shouldCombineMemoryType(VT))
3689 return SDValue();
3690
3691 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3692 SDValue Val = SN->getValue();
3693
3694 //DCI.AddToWorklist(Val.getNode());
3695
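  // If the stored value has other uses, rewrite them through a bitcast back
  // to the original type so that only this store switches to the new type.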
3696 bool OtherUses = !Val.hasOneUse();
3697 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3698 if (OtherUses) {
3699 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3700 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3701 }
3702
3703 return DAG.getStore(SN->getChain(), SL, CastVal,
3704 SN->getBasePtr(), SN->getMemOperand());
3705}
3706
3707// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3708// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3709// issues.
3710SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3711 DAGCombinerInfo &DCI) const {
3712 SelectionDAG &DAG = DCI.DAG;
3713 SDValue N0 = N->getOperand(0);
3714
3715 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3716 // (vt2 (truncate (assertzext vt0:x, vt1)))
3717 if (N0.getOpcode() == ISD::TRUNCATE) {
3718 SDValue N1 = N->getOperand(1);
3719 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3720 SDLoc SL(N);
3721
3722 SDValue Src = N0.getOperand(0);
3723 EVT SrcVT = Src.getValueType();
3724 if (SrcVT.bitsGE(ExtVT)) {
3725 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
3726 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
3727 }
3728 }
3729
3730 return SDValue();
3731}
3732
3733SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3734 SDNode *N, DAGCombinerInfo &DCI) const {
3735 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3736 switch (IID) {
3737 case Intrinsic::amdgcn_mul_i24:
3738 case Intrinsic::amdgcn_mul_u24:
3739 case Intrinsic::amdgcn_mulhi_i24:
3740 case Intrinsic::amdgcn_mulhi_u24:
3741 return simplifyMul24(N, DCI);
3742 case Intrinsic::amdgcn_fract:
3743 case Intrinsic::amdgcn_rsq:
3744 case Intrinsic::amdgcn_rcp_legacy:
3745 case Intrinsic::amdgcn_rsq_legacy:
3746 case Intrinsic::amdgcn_rsq_clamp: {
3747 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3748 SDValue Src = N->getOperand(1);
3749 return Src.isUndef() ? Src : SDValue();
3750 }
3751 case Intrinsic::amdgcn_frexp_exp: {
3752 // frexp_exp (fneg x) -> frexp_exp x
3753 // frexp_exp (fabs x) -> frexp_exp x
3754 // frexp_exp (fneg (fabs x)) -> frexp_exp x
3755 SDValue Src = N->getOperand(1);
3756 SDValue PeekSign = peekFPSignOps(Src);
3757 if (PeekSign == Src)
3758 return SDValue();
3759 return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
3760 0);
3761 }
3762 default:
3763 return SDValue();
3764 }
3765}
3766
3767/// Split the 64-bit value \p LHS into two 32-bit components, and apply the
3768/// binary operation \p Opc to each half with the corresponding constant operand.
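/// For example, with \p Opc = ISD::AND, \p ValLo = 0 and \p ValHi = 0xffff,
/// the low half is ANDed with 0 and the high half with 0xffff, and the halves
/// are rebuilt into an i64 through a v2i32 build_vector.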
3769SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3770 DAGCombinerInfo &DCI, const SDLoc &SL,
3771 unsigned Opc, SDValue LHS,
3772 uint32_t ValLo, uint32_t ValHi) const {
3773 SelectionDAG &DAG = DCI.DAG;
3774 SDValue Lo, Hi;
3775 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3776
3777 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3778 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3779
3780 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3781 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3782
3783 // Re-visit the ands. It's possible we eliminated one of them and it could
3784 // simplify the vector.
3785 DCI.AddToWorklist(Lo.getNode());
3786 DCI.AddToWorklist(Hi.getNode());
3787
3788 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3789 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3790}
3791
3792SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3793 DAGCombinerInfo &DCI) const {
3794 EVT VT = N->getValueType(0);
3795
3796 ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3797 if (!RHS)
3798 return SDValue();
3799
3800 SDValue LHS = N->getOperand(0);
3801 unsigned RHSVal = RHS->getZExtValue();
3802 if (!RHSVal)
3803 return LHS;
3804
3805 SDLoc SL(N);
3806 SelectionDAG &DAG = DCI.DAG;
3807
3808 switch (LHS->getOpcode()) {
3809 default:
3810 break;
3811 case ISD::ZERO_EXTEND:
3812 case ISD::SIGN_EXTEND:
3813 case ISD::ANY_EXTEND: {
3814 SDValue X = LHS->getOperand(0);
3815
3816 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3817 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
3818 // Prefer build_vector as the canonical form if packed types are legal.
3819 // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
3820 SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3821 { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3822 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3823 }
3824
3825 // shl (ext x) => zext (shl x), if the shift does not overflow the source type
3826 if (VT != MVT::i64)
3827 break;
3828 KnownBits Known = DAG.computeKnownBits(X);
3829 unsigned LZ = Known.countMinLeadingZeros();
3830 if (LZ < RHSVal)
3831 break;
3832 EVT XVT = X.getValueType();
3833 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
3834 return DAG.getZExtOrTrunc(Shl, SL, VT);
3835 }
3836 }
3837
3838 if (VT != MVT::i64)
3839 return SDValue();
3840
3841 // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
3842
3843 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
3844 // common case, splitting this into a move and a 32-bit shift is faster and
3845 // the same code size.
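  // For example, (shl i64:x, 35) becomes a v2i32 build_vector of
  // {0, (shl lo32(x), 3)} bitcast back to i64.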
3846 if (RHSVal < 32)
3847 return SDValue();
3848
3849 SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
3850
3851 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
3852 SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
3853
3854 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3855
3856 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
3857 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3858}
3859
3860SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
3861 DAGCombinerInfo &DCI) const {
3862 if (N->getValueType(0) != MVT::i64)
3863 return SDValue();
3864
3865 const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3866 if (!RHS)
3867 return SDValue();
3868
3869 SelectionDAG &DAG = DCI.DAG;
3870 SDLoc SL(N);
3871 unsigned RHSVal = RHS->getZExtValue();
3872
3873 // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
3874 if (RHSVal == 32) {
3875 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3876 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3877 DAG.getConstant(31, SL, MVT::i32));
3878
3879 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
3880 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3881 }
3882
3883 // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
3884 if (RHSVal == 63) {
3885 SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3886 SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3887 DAG.getConstant(31, SL, MVT::i32));
3888 SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
3889 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3890 }
3891
3892 return SDValue();
3893}
3894
3895SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
3896 DAGCombinerInfo &DCI) const {
3897 auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3898 if (!RHS)
3899 return SDValue();
3900
3901 EVT VT = N->getValueType(0);
3902 SDValue LHS = N->getOperand(0);
3903 unsigned ShiftAmt = RHS->getZExtValue();
3904